Blame - source/test/intltest/convtest.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 5ca063485a2445fbb70bda076a910c97237e2f05 [file] [log] [blame]

Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
				2	// License & terms of use: http://www.unicode.org/copyright.html
				3	/*
				4	*******************************************************************************
				5	*
				6	* Copyright (C) 2003-2014, International Business Machines
				7	* Corporation and others. All Rights Reserved.
				8	*
				9	*******************************************************************************
				10	* file name: convtest.cpp
				11	* encoding: UTF-8
				12	* tab size: 8 (not used)
				13	* indentation:4
				14	*
				15	* created on: 2003jul15
				16	* created by: Markus W. Scherer
				17	*
				18	* Test file for data-driven conversion tests.
				19	*/
				20
				21	#include "unicode/utypes.h"
				22
				23	#if !UCONFIG_NO_LEGACY_CONVERSION
				24	/*
				25	* Note: Turning off all of convtest.cpp if UCONFIG_NO_LEGACY_CONVERSION
				26	* is slightly unnecessary - it removes tests for Unicode charsets
				27	* like UTF-8 that should work.
				28	* However, there is no easy way for the test to detect whether a test case
				29	* is for a Unicode charset, so it would be difficult to only exclude those.
				30	* Also, regular testing of ICU is done with all modules on, therefore
				31	* not testing conversion for a custom configuration like this should be ok.
				32	*/
				33
				34	#include "unicode/ucnv.h"
				35	#include "unicode/unistr.h"
				36	#include "unicode/parsepos.h"
				37	#include "unicode/uniset.h"
				38	#include "unicode/usetiter.h"
				39	#include "unicode/ustring.h"
				40	#include "unicode/ures.h"
				41	#include "unicode/utf16.h"
				42	#include "convtest.h"
				43	#include "cmemory.h"
				44	#include "unicode/tstdtmod.h"
				45	#include <string.h>
				46	#include <stdlib.h>
				47
				enum {
				    // Single characters that the test data uses to name a converter callback.
				    // The first character of a test case's "callback" string is compared
				    // against these values.
				    SUB_CB  = '?',  // *_CALLBACK_SUBSTITUTE
				    SKIP_CB = '0',  // *_CALLBACK_SKIP
				    STOP_CB = '.',  // *_CALLBACK_STOP
				    ESC_CB  = '&'   // *_CALLBACK_ESCAPE
				};
				55
				56	ConversionTest::ConversionTest() {
				57	UErrorCode errorCode=U_ZERO_ERROR;
				58	utf8Cnv=ucnv_open("UTF-8", &errorCode);
				59	ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
				60	if(U_FAILURE(errorCode)) {
				61	errln("unable to open UTF-8 converter");
				62	}
				63	}
				64
				65	ConversionTest::~ConversionTest() {
				66	ucnv_close(utf8Cnv);
				67	}
				68
				69	void
				70	ConversionTest::runIndexedTest(int32_t index, UBool exec, const char &name, char /par/) {
				71	if (exec) logln("TestSuite ConversionTest: ");
				72	TESTCASE_AUTO_BEGIN;
				73	#if !UCONFIG_NO_FILE_IO
				74	TESTCASE_AUTO(TestToUnicode);
				75	TESTCASE_AUTO(TestFromUnicode);
				76	TESTCASE_AUTO(TestGetUnicodeSet);
				77	#endif
				78	TESTCASE_AUTO(TestGetUnicodeSet2);
				79	TESTCASE_AUTO(TestDefaultIgnorableCallback);
				80	TESTCASE_AUTO(TestUTF8ToUTF8Overflow);
				81	TESTCASE_AUTO(TestUTF8ToUTF8Streaming);
				82	TESTCASE_AUTO_END;
				83	}
				84
				85	// test data interface ----------------------------------------------------- ***
				86
				87	void
				88	ConversionTest::TestToUnicode() {
				89	ConversionCase cc;
				90	char charset[100], cbopt[4];
				91	const char *option;
				92	UnicodeString s, unicode;
				93	int32_t offsetsLength;
				94	UConverterToUCallback callback;
				95
				96	TestDataModule *dataModule;
				97	TestData *testData;
				98	const DataMap *testCase;
				99	UErrorCode errorCode;
				100	int32_t i;
				101
				102	errorCode=U_ZERO_ERROR;
				103	dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
				104	if(U_SUCCESS(errorCode)) {
				105	testData=dataModule->createTestData("toUnicode", errorCode);
				106	if(U_SUCCESS(errorCode)) {
				107	for(i=0; testData->nextCase(testCase, errorCode); ++i) {
				108	if(U_FAILURE(errorCode)) {
				109	errln("error retrieving conversion/toUnicode test case %d - %s",
				110	i, u_errorName(errorCode));
				111	errorCode=U_ZERO_ERROR;
				112	continue;
				113	}
				114
				115	cc.caseNr=i;
				116
				117	s=testCase->getString("charset", errorCode);
				118	s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
				119	cc.charset=charset;
				120
				121	cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
				122	unicode=testCase->getString("unicode", errorCode);
				123	cc.unicode=unicode.getBuffer();
				124	cc.unicodeLength=unicode.length();
				125
				126	offsetsLength=0;
				127	cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
				128	if(offsetsLength==0) {
				129	cc.offsets=NULL;
				130	} else if(offsetsLength!=unicode.length()) {
				131	errln("toUnicode[%d] unicode[%d] and offsets[%d] must have the same length",
				132	i, unicode.length(), offsetsLength);
				133	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
				134	}
				135
				136	cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
				137	cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
				138
				139	s=testCase->getString("errorCode", errorCode);
				140	if(s==UNICODE_STRING("invalid", 7)) {
				141	cc.outErrorCode=U_INVALID_CHAR_FOUND;
				142	} else if(s==UNICODE_STRING("illegal", 7)) {
				143	cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
				144	} else if(s==UNICODE_STRING("truncated", 9)) {
				145	cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
				146	} else if(s==UNICODE_STRING("illesc", 6)) {
				147	cc.outErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
				148	} else if(s==UNICODE_STRING("unsuppesc", 9)) {
				149	cc.outErrorCode=U_UNSUPPORTED_ESCAPE_SEQUENCE;
				150	} else {
				151	cc.outErrorCode=U_ZERO_ERROR;
				152	}
				153
				154	s=testCase->getString("callback", errorCode);
				155	s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
				156	cc.cbopt=cbopt;
				157	switch(cbopt[0]) {
				158	case SUB_CB:
				159	callback=UCNV_TO_U_CALLBACK_SUBSTITUTE;
				160	break;
				161	case SKIP_CB:
				162	callback=UCNV_TO_U_CALLBACK_SKIP;
				163	break;
				164	case STOP_CB:
				165	callback=UCNV_TO_U_CALLBACK_STOP;
				166	break;
				167	case ESC_CB:
				168	callback=UCNV_TO_U_CALLBACK_ESCAPE;
				169	break;
				170	default:
				171	callback=NULL;
				172	break;
				173	}
				174	option=callback==NULL ? cbopt : cbopt+1;
				175	if(*option==0) {
				176	option=NULL;
				177	}
				178
				179	cc.invalidChars=testCase->getBinary(cc.invalidLength, "invalidChars", errorCode);
				180
				181	if(U_FAILURE(errorCode)) {
				182	errln("error parsing conversion/toUnicode test case %d - %s",
				183	i, u_errorName(errorCode));
				184	errorCode=U_ZERO_ERROR;
				185	} else {
				186	logln("TestToUnicode[%d] %s", i, charset);
				187	ToUnicodeCase(cc, callback, option);
				188	}
				189	}
				190	delete testData;
				191	}
				192	delete dataModule;
				193	}
				194	else {
				195	dataerrln("Could not load test conversion data");
				196	}
				197	}
				198
				199	void
				200	ConversionTest::TestFromUnicode() {
				201	ConversionCase cc;
				202	char charset[100], cbopt[4];
				203	const char *option;
				204	UnicodeString s, unicode, invalidUChars;
				205	int32_t offsetsLength, index;
				206	UConverterFromUCallback callback;
				207
				208	TestDataModule *dataModule;
				209	TestData *testData;
				210	const DataMap *testCase;
				211	const UChar *p;
				212	UErrorCode errorCode;
				213	int32_t i, length;
				214
				215	errorCode=U_ZERO_ERROR;
				216	dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
				217	if(U_SUCCESS(errorCode)) {
				218	testData=dataModule->createTestData("fromUnicode", errorCode);
				219	if(U_SUCCESS(errorCode)) {
				220	for(i=0; testData->nextCase(testCase, errorCode); ++i) {
				221	if(U_FAILURE(errorCode)) {
				222	errln("error retrieving conversion/fromUnicode test case %d - %s",
				223	i, u_errorName(errorCode));
				224	errorCode=U_ZERO_ERROR;
				225	continue;
				226	}
				227
				228	cc.caseNr=i;
				229
				230	s=testCase->getString("charset", errorCode);
				231	s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
				232	cc.charset=charset;
				233
				234	unicode=testCase->getString("unicode", errorCode);
				235	cc.unicode=unicode.getBuffer();
				236	cc.unicodeLength=unicode.length();
				237	cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
				238
				239	offsetsLength=0;
				240	cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
				241	if(offsetsLength==0) {
				242	cc.offsets=NULL;
				243	} else if(offsetsLength!=cc.bytesLength) {
				244	errln("fromUnicode[%d] bytes[%d] and offsets[%d] must have the same length",
				245	i, cc.bytesLength, offsetsLength);
				246	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
				247	}
				248
				249	cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
				250	cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
				251
				252	s=testCase->getString("errorCode", errorCode);
				253	if(s==UNICODE_STRING("invalid", 7)) {
				254	cc.outErrorCode=U_INVALID_CHAR_FOUND;
				255	} else if(s==UNICODE_STRING("illegal", 7)) {
				256	cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
				257	} else if(s==UNICODE_STRING("truncated", 9)) {
				258	cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
				259	} else {
				260	cc.outErrorCode=U_ZERO_ERROR;
				261	}
				262
				263	s=testCase->getString("callback", errorCode);
				264	cc.setSub=0; // default: no subchar
				265
				266	if((index=s.indexOf((UChar)0))>0) {
				267	// read NUL-separated subchar first, if any
				268	// copy the subchar from Latin-1 characters
				269	// start after the NUL
				270	p=s.getTerminatedBuffer();
				271	length=index+1;
				272	p+=length;
				273	length=s.length()-length;
				274	if(length<=0 \|\| length>=(int32_t)sizeof(cc.subchar)) {
				275	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
				276	} else {
				277	int32_t j;
				278
				279	for(j=0; j<length; ++j) {
				280	cc.subchar[j]=(char)p[j];
				281	}
				282	// NUL-terminate the subchar
				283	cc.subchar[j]=0;
				284	cc.setSub=1;
				285	}
				286
				287	// remove the NUL and subchar from s
				288	s.truncate(index);
				289	} else if((index=s.indexOf((UChar)0x3d))>0) /* '=' */ {
				290	// read a substitution string, separated by an equal sign
				291	p=s.getBuffer()+index+1;
				292	length=s.length()-(index+1);
				293	if(length<0 \|\| length>=UPRV_LENGTHOF(cc.subString)) {
				294	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
				295	} else {
				296	u_memcpy(cc.subString, p, length);
				297	// NUL-terminate the subString
				298	cc.subString[length]=0;
				299	cc.setSub=-1;
				300	}
				301
				302	// remove the equal sign and subString from s
				303	s.truncate(index);
				304	}
				305
				306	s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
				307	cc.cbopt=cbopt;
				308	switch(cbopt[0]) {
				309	case SUB_CB:
				310	callback=UCNV_FROM_U_CALLBACK_SUBSTITUTE;
				311	break;
				312	case SKIP_CB:
				313	callback=UCNV_FROM_U_CALLBACK_SKIP;
				314	break;
				315	case STOP_CB:
				316	callback=UCNV_FROM_U_CALLBACK_STOP;
				317	break;
				318	case ESC_CB:
				319	callback=UCNV_FROM_U_CALLBACK_ESCAPE;
				320	break;
				321	default:
				322	callback=NULL;
				323	break;
				324	}
				325	option=callback==NULL ? cbopt : cbopt+1;
				326	if(*option==0) {
				327	option=NULL;
				328	}
				329
				330	invalidUChars=testCase->getString("invalidUChars", errorCode);
				331	cc.invalidUChars=invalidUChars.getBuffer();
				332	cc.invalidLength=invalidUChars.length();
				333
				334	if(U_FAILURE(errorCode)) {
				335	errln("error parsing conversion/fromUnicode test case %d - %s",
				336	i, u_errorName(errorCode));
				337	errorCode=U_ZERO_ERROR;
				338	} else {
				339	logln("TestFromUnicode[%d] %s", i, charset);
				340	FromUnicodeCase(cc, callback, option);
				341	}
				342	}
				343	delete testData;
				344	}
				345	delete dataModule;
				346	}
				347	else {
				348	dataerrln("Could not load test conversion data");
				349	}
				350	}
				351
				352	static const UChar ellipsis[]={ 0x2e, 0x2e, 0x2e };
				353
				354	void
				355	ConversionTest::TestGetUnicodeSet() {
				356	char charset[100];
				357	UnicodeString s, map, mapnot;
				358	int32_t which;
				359
				360	ParsePosition pos;
				361	UnicodeSet cnvSet, mapSet, mapnotSet, diffSet;
				362	UnicodeSet *cnvSetPtr = &cnvSet;
				363	LocalUConverterPointer cnv;
				364
				365	TestDataModule *dataModule;
				366	TestData *testData;
				367	const DataMap *testCase;
				368	UErrorCode errorCode;
				369	int32_t i;
				370
				371	errorCode=U_ZERO_ERROR;
				372	dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
				373	if(U_SUCCESS(errorCode)) {
				374	testData=dataModule->createTestData("getUnicodeSet", errorCode);
				375	if(U_SUCCESS(errorCode)) {
				376	for(i=0; testData->nextCase(testCase, errorCode); ++i) {
				377	if(U_FAILURE(errorCode)) {
				378	errln("error retrieving conversion/getUnicodeSet test case %d - %s",
				379	i, u_errorName(errorCode));
				380	errorCode=U_ZERO_ERROR;
				381	continue;
				382	}
				383
				384	s=testCase->getString("charset", errorCode);
				385	s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
				386
				387	map=testCase->getString("map", errorCode);
				388	mapnot=testCase->getString("mapnot", errorCode);
				389
				390	which=testCase->getInt28("which", errorCode);
				391
				392	if(U_FAILURE(errorCode)) {
				393	errln("error parsing conversion/getUnicodeSet test case %d - %s",
				394	i, u_errorName(errorCode));
				395	errorCode=U_ZERO_ERROR;
				396	continue;
				397	}
				398
				399	// test this test case
				400	mapSet.clear();
				401	mapnotSet.clear();
				402
				403	pos.setIndex(0);
				404	mapSet.applyPattern(map, pos, 0, NULL, errorCode);
				405	if(U_FAILURE(errorCode) \|\| pos.getIndex()!=map.length()) {
				406	errln("error creating the map set for conversion/getUnicodeSet test case %d - %s\n"
				407	" error index %d index %d U+%04x",
				408	i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), map.char32At(pos.getIndex()));
				409	errorCode=U_ZERO_ERROR;
				410	continue;
				411	}
				412
				413	pos.setIndex(0);
				414	mapnotSet.applyPattern(mapnot, pos, 0, NULL, errorCode);
				415	if(U_FAILURE(errorCode) \|\| pos.getIndex()!=mapnot.length()) {
				416	errln("error creating the mapnot set for conversion/getUnicodeSet test case %d - %s\n"
				417	" error index %d index %d U+%04x",
				418	i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), mapnot.char32At(pos.getIndex()));
				419	errorCode=U_ZERO_ERROR;
				420	continue;
				421	}
				422
				423	logln("TestGetUnicodeSet[%d] %s", i, charset);
				424
				425	cnv.adoptInstead(cnv_open(charset, errorCode));
				426	if(U_FAILURE(errorCode)) {
				427	errcheckln(errorCode, "error opening \"%s\" for conversion/getUnicodeSet test case %d - %s",
				428	charset, i, u_errorName(errorCode));
				429	errorCode=U_ZERO_ERROR;
				430	continue;
				431	}
				432
				433	ucnv_getUnicodeSet(cnv.getAlias(), cnvSetPtr->toUSet(), (UConverterUnicodeSet)which, &errorCode);
				434
				435	if(U_FAILURE(errorCode)) {
				436	errln("error in ucnv_getUnicodeSet(\"%s\") for conversion/getUnicodeSet test case %d - %s",
				437	charset, i, u_errorName(errorCode));
				438	errorCode=U_ZERO_ERROR;
				439	continue;
				440	}
				441
				442	// are there items that must be in cnvSet but are not?
				443	(diffSet=mapSet).removeAll(cnvSet);
				444	if(!diffSet.isEmpty()) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	445	diffSet.toPattern(s, true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	446	if(s.length()>100) {
				447	s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
				448	}
				449	errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - conversion/getUnicodeSet test case %d",
				450	charset, i);
				451	errln(s);
				452	}
				453
				454	// are there items that must not be in cnvSet but are?
				455	(diffSet=mapnotSet).retainAll(cnvSet);
				456	if(!diffSet.isEmpty()) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	457	diffSet.toPattern(s, true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	458	if(s.length()>100) {
				459	s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
				460	}
				461	errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - conversion/getUnicodeSet test case %d",
				462	charset, i);
				463	errln(s);
				464	}
				465	}
				466	delete testData;
				467	}
				468	delete dataModule;
				469	}
				470	else {
				471	dataerrln("Could not load test conversion data");
				472	}
				473	}
				474
				475	U_CDECL_BEGIN
				476	static void U_CALLCONV
				477	getUnicodeSetCallback(const void *context,
				478	UConverterFromUnicodeArgs * /fromUArgs/,
				479	const UChar* /codeUnits/,
				480	int32_t /length/,
				481	UChar32 codePoint,
				482	UConverterCallbackReason reason,
				483	UErrorCode *pErrorCode) {
				484	if(reason<=UCNV_IRREGULAR) {
				485	((UnicodeSet *)context)->remove(codePoint); // the converter cannot convert this code point
				486	*pErrorCode=U_ZERO_ERROR; // skip
				487	} // else ignore the reset, close and clone calls.
				488	}
				489	U_CDECL_END
				490
				491	// Compare ucnv_getUnicodeSet() with the set of characters that can be converted.
				492	void
				493	ConversionTest::TestGetUnicodeSet2() {
				494	// Build a string with all code points.
				495	UChar32 cpLimit;
				496	int32_t s0Length;
				497	if(quick) {
				498	cpLimit=s0Length=0x10000; // BMP only
				499	} else {
				500	cpLimit=0x110000;
				501	s0Length=0x10000+0x200000; // BMP + surrogate pairs
				502	}
				503	UChar *s0=new UChar[s0Length];
				504	if(s0==NULL) {
				505	return;
				506	}
				507	UChar *s=s0;
				508	UChar32 c;
				509	UChar c2;
				510	// low BMP
				511	for(c=0; c<=0xd7ff; ++c) {
				512	*s++=(UChar)c;
				513	}
				514	// trail surrogates
				515	for(c=0xdc00; c<=0xdfff; ++c) {
				516	*s++=(UChar)c;
				517	}
				518	// lead surrogates
				519	// (after trails so that there is not even one surrogate pair in between)
				520	for(c=0xd800; c<=0xdbff; ++c) {
				521	*s++=(UChar)c;
				522	}
				523	// high BMP
				524	for(c=0xe000; c<=0xffff; ++c) {
				525	*s++=(UChar)c;
				526	}
				527	// supplementary code points = surrogate pairs
				528	if(cpLimit==0x110000) {
				529	for(c=0xd800; c<=0xdbff; ++c) {
				530	for(c2=0xdc00; c2<=0xdfff; ++c2) {
				531	*s++=(UChar)c;
				532	*s++=c2;
				533	}
				534	}
				535	}
				536
				537	static const char *const cnvNames[]={
				538	"UTF-8",
				539	"UTF-7",
				540	"UTF-16",
				541	"US-ASCII",
				542	"ISO-8859-1",
				543	"windows-1252",
				544	"Shift-JIS",
				545	"ibm-1390", // EBCDIC_STATEFUL table
				546	"ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table
				547	"HZ",
				548	"ISO-2022-JP",
				549	"JIS7",
				550	"ISO-2022-CN",
				551	"ISO-2022-CN-EXT",
				552	"LMBCS"
				553	};
				554	LocalUConverterPointer cnv;
				555	char buffer[1024];
				556	int32_t i;
				557	for(i=0; i<UPRV_LENGTHOF(cnvNames); ++i) {
				558	UErrorCode errorCode=U_ZERO_ERROR;
				559	cnv.adoptInstead(cnv_open(cnvNames[i], errorCode));
				560	if(U_FAILURE(errorCode)) {
				561	errcheckln(errorCode, "failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));
				562	continue;
				563	}
				564	UnicodeSet expected;
				565	ucnv_setFromUCallBack(cnv.getAlias(), getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);
				566	if(U_FAILURE(errorCode)) {
				567	errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));
				568	continue;
				569	}
				570	UConverterUnicodeSet which;
				571	for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
				572	if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	573	ucnv_setFallback(cnv.getAlias(), true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	574	}
				575	expected.add(0, cpLimit-1);
				576	s=s0;
				577	UBool flush;
				578	do {
				579	char *t=buffer;
				580	flush=(UBool)(s==s0+s0Length);
				581	ucnv_fromUnicode(cnv.getAlias(), &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);
				582	if(U_FAILURE(errorCode)) {
				583	if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
				584	errorCode=U_ZERO_ERROR;
				585	continue;
				586	} else {
				587	break; // unexpected error, should not occur
				588	}
				589	}
				590	} while(!flush);
				591	UnicodeSet set;
				592	ucnv_getUnicodeSet(cnv.getAlias(), set.toUSet(), which, &errorCode);
				593	if(cpLimit<0x110000) {
				594	set.remove(cpLimit, 0x10ffff);
				595	}
				596	if(which==UCNV_ROUNDTRIP_SET) {
				597	// ignore PUA code points because they will be converted even if they
				598	// are fallbacks and when other fallbacks are turned off,
				599	// but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips
				600	expected.remove(0xe000, 0xf8ff);
				601	expected.remove(0xf0000, 0xffffd);
				602	expected.remove(0x100000, 0x10fffd);
				603	set.remove(0xe000, 0xf8ff);
				604	set.remove(0xf0000, 0xffffd);
				605	set.remove(0x100000, 0x10fffd);
				606	}
				607	if(set!=expected) {
				608	// First try to see if we have different sets because ucnv_getUnicodeSet()
				609	// added strings: The above conversion method does not tell us what strings might be convertible.
				610	// Remove strings from the set and compare again.
				611	set.removeAllStrings();
				612	}
				613	if(set!=expected) {
				614	UnicodeSet diffSet;
				615	UnicodeString out;
				616
				617	// are there items that must be in the set but are not?
				618	(diffSet=expected).removeAll(set);
				619	if(!diffSet.isEmpty()) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	620	diffSet.toPattern(out, true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	621	if(out.length()>100) {
				622	out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
				623	}
				624	errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
				625	cnvNames[i], which);
				626	errln(out);
				627	}
				628
				629	// are there items that must not be in the set but are?
				630	(diffSet=set).removeAll(expected);
				631	if(!diffSet.isEmpty()) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	632	diffSet.toPattern(out, true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	633	if(out.length()>100) {
				634	out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
				635	}
				636	errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
				637	cnvNames[i], which);
				638	errln(out);
				639	}
				640	}
				641	}
				642	}
				643
				644	delete [] s0;
				645	}
				646
				647	// Test that all code points which have the default ignorable Unicode property
				648	// are ignored if they have no mapping.
				649	// If there are any failures, the hard coded list (IS_DEFAULT_IGNORABLE_CODE_POINT)
				650	// in ucnv_err.cpp should be updated.
				651	void
				652	ConversionTest::TestDefaultIgnorableCallback() {
				653	UErrorCode status = U_ZERO_ERROR;
				654	const char *cnv_name = "euc-jp-2007";
				655	const char *pattern_ignorable = "[:Default_Ignorable_Code_Point:]";
				656	const char *pattern_not_ignorable =
				657	"[[:^Default_Ignorable_Code_Point:]"
				658	// For test performance, skip large ranges that will likely remain unassigned
				659	// for a long time, and private use code points.
				660	"-[\\U00040000-\\U000DFFFF]-[:Co:]"
				661	"]";
				662
				663	LocalPointer<UnicodeSet> set_ignorable(new UnicodeSet(pattern_ignorable, status));
				664	if (U_FAILURE(status)) {
				665	dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_ignorable, u_errorName(status));
				666	return;
				667	}
				668
				669	LocalPointer<UnicodeSet> set_not_ignorable(new UnicodeSet(pattern_not_ignorable, status));
				670	if (U_FAILURE(status)) {
				671	dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_not_ignorable, u_errorName(status));
				672	return;
				673	}
				674
				675	LocalUConverterPointer cnv(cnv_open(cnv_name, status));
				676	if (U_FAILURE(status)) {
				677	dataerrln("Unable to open converter: %s - %s\n", cnv_name, u_errorName(status));
				678	return;
				679	}
				680
				681	// set callback for the converter
				682	ucnv_setFromUCallBack(cnv.getAlias(), UCNV_FROM_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &status);
				683
				684	UChar32 input[1];
				685	char output[10];
				686	int32_t outputLength;
				687
				688	// test default ignorables are ignored
				689	UnicodeSetIterator iter(*set_ignorable);
				690	while (iter.next()) {
				691	status = U_ZERO_ERROR;
				692	outputLength= 0;
				693
				694	input[0] = iter.getCodepoint();
				695
				696	outputLength = ucnv_fromUChars(cnv.getAlias(), output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
				697	if (U_FAILURE(status) \|\| outputLength != 0) {
				698	errln("Ignorable code point: U+%04X not skipped as expected - %s", input[0], u_errorName(status));
				699	}
				700	}
				701
				702	// test non-ignorables are not ignored
				703	iter.reset(*set_not_ignorable);
				704	while (iter.next()) {
				705	status = U_ZERO_ERROR;
				706	outputLength= 0;
				707
				708	input[0] = iter.getCodepoint();
				709
				710	if (input[0] == 0) {
				711	continue;
				712	}
				713
				714	outputLength = ucnv_fromUChars(cnv.getAlias(), output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
				715	if (U_FAILURE(status) \|\| outputLength <= 0) {
				716	errln("Non-ignorable code point: U+%04X skipped unexpectedly - %s", input[0], u_errorName(status));
				717	}
				718	}
				719	}
				720
				721	void
				722	ConversionTest::TestUTF8ToUTF8Overflow() {
				723	IcuTestErrorCode errorCode(*this, "TestUTF8ToUTF8Overflow");
				724	LocalUConverterPointer cnv1(ucnv_open("UTF-8", errorCode));
				725	LocalUConverterPointer cnv2(ucnv_open("UTF-8", errorCode));
				726	static const char *text = "aä"; // ä: 2 bytes
				727	const char *source = text;
				728	const char *sourceLimit = text + strlen(text);
				729	char result[20];
				730	char *target = result;
				731	const char *targetLimit = result + sizeof(result);
				732	UChar buffer16[20];
				733	UChar *pivotSource = buffer16;
				734	UChar *pivotTarget = buffer16;
				735	const UChar *pivotLimit = buffer16 + UPRV_LENGTHOF(buffer16);
				736	int32_t length;
				737
				738	// Convert with insufficient target capacity.
				739	result[2] = 5;
				740	ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
				741	&target, result + 2, &source, sourceLimit,
				742	buffer16, &pivotSource, &pivotTarget, pivotLimit,
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	743	false, false, errorCode);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	744	assertEquals("overflow", U_BUFFER_OVERFLOW_ERROR, errorCode.reset());
				745	length = (int32_t)(target - result);
				746	assertEquals("number of bytes written", 2, length);
				747	assertEquals("next byte not clobbered", 5, result[2]);
				748
				749	// Convert the rest and flush.
				750	ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
				751	&target, targetLimit, &source, sourceLimit,
				752	buffer16, &pivotSource, &pivotTarget, pivotLimit,
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	753	false, true, errorCode);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	754
				755	assertSuccess("UTF-8->UTF-8", errorCode);
				756	length = (int32_t)(target - result);
				757	assertEquals("3 bytes", 3, length);
				758	if (length == 3) {
				759	assertTrue("result same as input", memcmp(text, result, length) == 0);
				760	}
				761
				762	ucnv_reset(cnv1.getAlias());
				763	ucnv_reset(cnv2.getAlias());
				764	memset(result, 0, sizeof(result));
				765	static const char *text2 = "a🚲"; // U+1F6B2 bicycle: 4 bytes
				766	source = text2;
				767	sourceLimit = text2 + strlen(text2);
				768	target = result;
				769	pivotSource = pivotTarget = buffer16;
				770
				771	// Convert with insufficient target capacity.
				772	result[3] = 5;
				773	ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
				774	&target, result + 3, &source, sourceLimit,
				775	buffer16, &pivotSource, &pivotTarget, pivotLimit,
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	776	false, false, errorCode);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	777	assertEquals("text2 overflow", U_BUFFER_OVERFLOW_ERROR, errorCode.reset());
				778	length = (int32_t)(target - result);
				779	assertEquals("text2 number of bytes written", 3, length);
				780	assertEquals("text2 next byte not clobbered", 5, result[3]);
				781
				782	// Convert the rest and flush.
				783	ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
				784	&target, targetLimit, &source, sourceLimit,
				785	buffer16, &pivotSource, &pivotTarget, pivotLimit,
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	786	false, true, errorCode);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	787
				788	assertSuccess("text2 UTF-8->UTF-8", errorCode);
				789	length = (int32_t)(target - result);
				790	assertEquals("text2 5 bytes", 5, length);
				791	if (length == 5) {
				792	assertTrue("text2 result same as input", memcmp(text2, result, length) == 0);
				793	}
				794
				795	ucnv_reset(cnv1.getAlias());
				796	ucnv_reset(cnv2.getAlias());
				797	memset(result, 0, sizeof(result));
				798	static const char *illFormed = "\xf1\x91\x93\x96\x91\x94"; // U+514D6 + two more trail bytes
				799	source = illFormed;
				800	sourceLimit = illFormed + strlen(illFormed);
				801	target = result;
				802	pivotSource = pivotTarget = buffer16;
				803
				804	ucnv_setToUCallBack(cnv1.getAlias(), UCNV_TO_U_CALLBACK_STOP, nullptr, nullptr, nullptr, errorCode);
				805
				806	// Convert only two bytes and flush (but expect failure).
				807	char errorBytes[10];
				808	int8_t errorLength;
				809	result[0] = 5;
				810	ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
				811	&target, targetLimit, &source, source + 2,
				812	buffer16, &pivotSource, &pivotTarget, pivotLimit,
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	813	false, true, errorCode);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	814	assertEquals("illFormed truncated", U_TRUNCATED_CHAR_FOUND, errorCode.reset());
				815	length = (int32_t)(target - result);
				816	assertEquals("illFormed number of bytes written", 0, length);
				817	errorLength = UPRV_LENGTHOF(errorBytes);
				818	ucnv_getInvalidChars(cnv1.getAlias(), errorBytes, &errorLength, errorCode);
				819	assertEquals("illFormed truncated errorLength", 2, (int32_t)errorLength);
				820	if (errorLength == 2) {
				821	assertEquals("illFormed truncated errorBytes", 0xf191,
				822	((int32_t)(uint8_t)errorBytes[0] << 8) \| (uint8_t)errorBytes[1]);
				823	}
				824
				825	// Continue conversion starting with a trail byte.
				826	ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
				827	&target, targetLimit, &source, sourceLimit,
				828	buffer16, &pivotSource, &pivotTarget, pivotLimit,
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	829	false, true, errorCode);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	830
				831	assertEquals("illFormed trail byte", U_ILLEGAL_CHAR_FOUND, errorCode.reset());
				832	length = (int32_t)(target - result);
				833	assertEquals("illFormed trail byte number of bytes written", 0, length);
				834	errorLength = UPRV_LENGTHOF(errorBytes);
				835	ucnv_getInvalidChars(cnv1.getAlias(), errorBytes, &errorLength, errorCode);
				836	assertEquals("illFormed trail byte errorLength", 1, (int32_t)errorLength);
				837	if (errorLength == 1) {
				838	assertEquals("illFormed trail byte errorBytes", 0x93, (int32_t)(uint8_t)errorBytes[0]);
				839	}
				840	}
				841
				842	void
				843	ConversionTest::TestUTF8ToUTF8Streaming() {
				844	IcuTestErrorCode errorCode(*this, "TestUTF8ToUTF8Streaming");
				845	LocalUConverterPointer cnv1(ucnv_open("UTF-8", errorCode));
				846	LocalUConverterPointer cnv2(ucnv_open("UTF-8", errorCode));
				847
				848	// UTF8 encoded cyrillic part of 'Lorem ipsum'
				849	static const char* text =
				850	"\xd0\xb5\xd1\x82\x20\xd1\x81\xd1\x86\xd0\xb0\xd0\xb5\xd0\xb2\xd0"
				851	"\xbe\xd0\xbb\xd0\xb0\x20\xd1\x81\xd0\xb0\xd0\xb4\xd0\xb8\xd0\xbf"
				852	"\xd1\x81\xd1\x86\xd0\xb8\xd0\xbd\xd0\xb3\x20\xd0\xb0\xd1\x86\xd1"
				853	"\x86\xd0\xbe\xd0\xbc\xd0\xbc\xd0\xbe\xd0\xb4\xd0\xb0\xd1\x80\xd0"
				854	"\xb5\x20\xd1\x85\xd0\xb0\xd1\x81";
				855
				856	int32_t chunk1 = 25; // partial lead at the end: 0xd0
				857	int32_t chunk2 = 47; // partial tail at the beginning: 0xb0
				858
				859	char result[128];
				860
				861	int32_t sourceLen = (int32_t)strlen(text);
				862	const char* source = text;
				863	const char* sourceLimit = text + chunk1;
				864
				865	int32_t targetLen = sizeof(result);
				866	char* target = result;
				867	const char* targetLimit = result + targetLen;
				868
				869	UChar buffer16[20];
				870	UChar* pivotSource = buffer16;
				871	UChar* pivotTarget = buffer16;
				872	const UChar* pivotLimit = buffer16 + UPRV_LENGTHOF(buffer16);
				873
				874	int32_t length;
				875	ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
				876	&target, result + targetLen, &source, sourceLimit,
				877	buffer16, &pivotSource, &pivotTarget, pivotLimit,
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	878	false, false, errorCode);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	879
				880	length = (int32_t)(target - result);
				881	targetLen -= length;
				882	assertEquals("First chunk -1 doesn't match converted length", chunk1 - 1, length);
				883
				884	source = text + chunk1;
				885	sourceLimit = source + chunk2;
				886
				887	// Convert the rest and flush.
				888	ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
				889	&target, targetLimit, &source, sourceLimit,
				890	buffer16, &pivotSource, &pivotTarget, pivotLimit,
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	891	false, true, errorCode);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	892
				893	length = (int32_t)(target - result - length);
				894	targetLen -= length;
				895	assertEquals("Second chunk + 2 doesn't match converted length", chunk2 + 1, length);
				896
				897	assertEquals("Full text length match", sourceLen, sizeof(result) - targetLen);
				898	assertSuccess("UTF-8->UTF-8", errorCode);
				899	}
				900
				901	// open testdata or ICU data converter ------------------------------------- ***
				902
				903	UConverter *
				904	ConversionTest::cnv_open(const char *name, UErrorCode &errorCode) {
				905	if(name!=NULL && *name=='+') {
				906	// Converter names that start with '+' are ignored in ICU4J tests.
				907	++name;
				908	}
				909	if(name!=NULL && name=='') {
				910	/* loadTestData(): set the data directory */
				911	return ucnv_openPackage(loadTestData(errorCode), name+1, &errorCode);
				912	} else {
				913	return ucnv_open(name, &errorCode);
				914	}
				915	}
				916
				917	// output helpers ---------------------------------------------------------- ***
				918
				// Maps a nibble value to its lowercase hexadecimal character:
				// 0..9 -> '0'..'9', 10..15 -> 'a'..'f'.
				// Values above 15 are not range-checked; callers in this file pass a
				// masked or shifted nibble.
				static inline char
				hexDigit(uint8_t digit) {
				    if(digit<=9) {
				        return (char)('0'+digit);
				    }
				    return (char)('a'-10+digit);
				}
				923
				924	static char *
				925	printBytes(const uint8_t bytes, int32_t length, char out) {
				926	uint8_t b;
				927
				928	if(length>0) {
				929	b=*bytes++;
				930	--length;
				931	*out++=hexDigit((uint8_t)(b>>4));
				932	*out++=hexDigit((uint8_t)(b&0xf));
				933	}
				934
				935	while(length>0) {
				936	b=*bytes++;
				937	--length;
				938	*out++=' ';
				939	*out++=hexDigit((uint8_t)(b>>4));
				940	*out++=hexDigit((uint8_t)(b&0xf));
				941	}
				942	*out++=0;
				943	return out;
				944	}
				945
				946	static char *
				947	printUnicode(const UChar unicode, int32_t length, char out) {
				948	UChar32 c;
				949	int32_t i;
				950
				951	for(i=0; i<length;) {
				952	if(i>0) {
				953	*out++=' ';
				954	}
				955	U16_NEXT(unicode, i, length, c);
				956	// write 4..6 digits
				957	if(c>=0x100000) {
				958	*out++='1';
				959	}
				960	if(c>=0x10000) {
				961	*out++=hexDigit((uint8_t)((c>>16)&0xf));
				962	}
				963	*out++=hexDigit((uint8_t)((c>>12)&0xf));
				964	*out++=hexDigit((uint8_t)((c>>8)&0xf));
				965	*out++=hexDigit((uint8_t)((c>>4)&0xf));
				966	*out++=hexDigit((uint8_t)(c&0xf));
				967	}
				968	*out++=0;
				969	return out;
				970	}
				971
				// Formats an offsets array as space-separated two-character fields and
				// NUL-terminates the output.
				//
				// Field encoding: values below -9 print as "-x", -9..-1 as "-9".."-1",
				// 0..9 as " 0".." 9", 10..99 as "10".."99", and values above 99 as "xx".
				//
				// offsets: array to print; NULL is treated as an empty array.
				// length:  number of offsets.
				// out:     output buffer; needs 3*length chars (1 char when nothing is printed).
				// Returns the position just past the terminating NUL, so that the next
				// string can be written into the same buffer.
				static char *
				printOffsets(const int32_t *offsets, int32_t length, char *out) {
				    int32_t i, o, d;

				    if(offsets==NULL) {
				        length=0;
				    }

				    for(i=0; i<length; ++i) {
				        if(i>0) {
				            *out++=' ';
				        }
				        o=offsets[i];

				        // print all offsets with 2 characters each (-x, -9..99, xx)
				        if(o<-9) {
				            *out++='-';
				            *out++='x';
				        } else if(o<0) {
				            *out++='-';
				            *out++=(char)('0'-o);
				        } else if(o<=99) {
				            *out++=(d=o/10)==0 ? ' ' : (char)('0'+d);
				            *out++=(char)('0'+o%10);
				        } else /* o>99 */ {
				            *out++='x';
				            *out++='x';
				        }
				    }
				    *out++=0;
				    return out;
				}
				1004
				1005	// toUnicode test worker functions ----------------------------------------- ***
				1006
				1007	static int32_t
				1008	stepToUnicode(ConversionCase &cc, UConverter *cnv,
				1009	UChar *result, int32_t resultCapacity,
				1010	int32_t resultOffsets, / also resultCapacity */
				1011	int32_t step,
				1012	UErrorCode *pErrorCode) {
				1013	const char source, sourceLimit, *bytesLimit;
				1014	UChar target, targetLimit, *resultLimit;
				1015	UBool flush;
				1016
				1017	source=(const char *)cc.bytes;
				1018	target=result;
				1019	bytesLimit=source+cc.bytesLength;
				1020	resultLimit=result+resultCapacity;
				1021
				1022	if(step>=0) {
				1023	// call ucnv_toUnicode() with in/out buffers no larger than (step) at a time
				1024	// move only one buffer (in vs. out) at a time to be extra mean
				1025	// step==0 performs bulk conversion and generates offsets
				1026
				1027	// initialize the partial limits for the loop
				1028	if(step==0) {
				1029	// use the entire buffers
				1030	sourceLimit=bytesLimit;
				1031	targetLimit=resultLimit;
				1032	flush=cc.finalFlush;
				1033	} else {
				1034	// start with empty partial buffers
				1035	sourceLimit=source;
				1036	targetLimit=target;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1037	flush=false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1038
				1039	// output offsets only for bulk conversion
				1040	resultOffsets=NULL;
				1041	}
				1042
				1043	for(;;) {
				1044	// resetting the opposite conversion direction must not affect this one
				1045	ucnv_resetFromUnicode(cnv);
				1046
				1047	// convert
				1048	ucnv_toUnicode(cnv,
				1049	&target, targetLimit,
				1050	&source, sourceLimit,
				1051	resultOffsets,
				1052	flush, pErrorCode);
				1053
				1054	// check pointers and errors
				1055	if(source>sourceLimit \|\| target>targetLimit) {
				1056	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
				1057	break;
				1058	} else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
				1059	if(target!=targetLimit) {
				1060	// buffer overflow must only be set when the target is filled
				1061	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
				1062	break;
				1063	} else if(targetLimit==resultLimit) {
				1064	// not just a partial overflow
				1065	break;
				1066	}
				1067
				1068	// the partial target is filled, set a new limit, reset the error and continue
				1069	targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
				1070	*pErrorCode=U_ZERO_ERROR;
				1071	} else if(U_FAILURE(*pErrorCode)) {
				1072	// some other error occurred, done
				1073	break;
				1074	} else {
				1075	if(source!=sourceLimit) {
				1076	// when no error occurs, then the input must be consumed
				1077	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
				1078	break;
				1079	}
				1080
				1081	if(sourceLimit==bytesLimit) {
				1082	// we are done
				1083	break;
				1084	}
				1085
				1086	// the partial conversion succeeded, set a new limit and continue
				1087	sourceLimit=(bytesLimit-source)>=step ? source+step : bytesLimit;
				1088	flush=(UBool)(cc.finalFlush && sourceLimit==bytesLimit);
				1089	}
				1090	}
				1091	} else /* step<0 */ {
				1092	/*
				1093	* step==-1: call only ucnv_getNextUChar()
				1094	* otherwise alternate between ucnv_toUnicode() and ucnv_getNextUChar()
				1095	* if step==-2 or -3, then give ucnv_toUnicode() the whole remaining input,
				1096	* else give it at most (-step-2)/2 bytes
				1097	*/
				1098	UChar32 c;
				1099
				1100	// end the loop by getting an index out of bounds error
				1101	for(;;) {
				1102	// resetting the opposite conversion direction must not affect this one
				1103	ucnv_resetFromUnicode(cnv);
				1104
				1105	// convert
				1106	if((step&1)!=0 /* odd: -1, -3, -5, ... */) {
				1107	sourceLimit=source; // use sourceLimit not as a real limit
				1108	// but to remember the pre-getNextUChar source pointer
				1109	c=ucnv_getNextUChar(cnv, &source, bytesLimit, pErrorCode);
				1110
				1111	// check pointers and errors
				1112	if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
				1113	if(source!=bytesLimit) {
				1114	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
				1115	} else {
				1116	*pErrorCode=U_ZERO_ERROR;
				1117	}
				1118	break;
				1119	} else if(U_FAILURE(*pErrorCode)) {
				1120	break;
				1121	}
				1122	// source may not move if c is from previous overflow
				1123
				1124	if(target==resultLimit) {
				1125	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1126	break;
				1127	}
				1128	if(c<=0xffff) {
				1129	*target++=(UChar)c;
				1130	} else {
				1131	*target++=U16_LEAD(c);
				1132	if(target==resultLimit) {
				1133	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1134	break;
				1135	}
				1136	*target++=U16_TRAIL(c);
				1137	}
				1138
				1139	// alternate between -n-1 and -n but leave -1 alone
				1140	if(step<-1) {
				1141	++step;
				1142	}
				1143	} else /* step is even */ {
				1144	// allow only one UChar output
				1145	targetLimit=target<resultLimit ? target+1 : resultLimit;
				1146
				1147	// as with ucnv_getNextUChar(), we always flush (if we go to bytesLimit)
				1148	// and never output offsets
				1149	if(step==-2) {
				1150	sourceLimit=bytesLimit;
				1151	} else {
				1152	sourceLimit=source+(-step-2)/2;
				1153	if(sourceLimit>bytesLimit) {
				1154	sourceLimit=bytesLimit;
				1155	}
				1156	}
				1157
				1158	ucnv_toUnicode(cnv,
				1159	&target, targetLimit,
				1160	&source, sourceLimit,
				1161	NULL, (UBool)(sourceLimit==bytesLimit), pErrorCode);
				1162
				1163	// check pointers and errors
				1164	if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
				1165	if(target!=targetLimit) {
				1166	// buffer overflow must only be set when the target is filled
				1167	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
				1168	break;
				1169	} else if(targetLimit==resultLimit) {
				1170	// not just a partial overflow
				1171	break;
				1172	}
				1173
				1174	// the partial target is filled, set a new limit and continue
				1175	*pErrorCode=U_ZERO_ERROR;
				1176	} else if(U_FAILURE(*pErrorCode)) {
				1177	// some other error occurred, done
				1178	break;
				1179	} else {
				1180	if(source!=sourceLimit) {
				1181	// when no error occurs, then the input must be consumed
				1182	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
				1183	break;
				1184	}
				1185
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1186	// we are done (flush==true) but we continue, to get the index out of bounds error above
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1187	}
				1188
				1189	--step;
				1190	}
				1191	}
				1192	}
				1193
				1194	return (int32_t)(target-result);
				1195	}
				1196
				1197	UBool
				1198	ConversionTest::ToUnicodeCase(ConversionCase &cc, UConverterToUCallback callback, const char *option) {
				1199	// open the converter
				1200	IcuTestErrorCode errorCode(*this, "ToUnicodeCase");
				1201	LocalUConverterPointer cnv(cnv_open(cc.charset, errorCode));
				1202	// with no data, the above crashes with "pointer being freed was not allocated" for charset "x11-compound-text", see #13078
				1203	if(errorCode.isFailure()) {
				1204	errcheckln(errorCode, "toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
				1205	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, errorCode.errorName());
				1206	errorCode.reset();
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1207	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1208	}
				1209
				1210	// set the callback
				1211	if(callback!=NULL) {
				1212	ucnv_setToUCallBack(cnv.getAlias(), callback, option, NULL, NULL, errorCode);
				1213	if(U_FAILURE(errorCode)) {
				1214	errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setToUCallBack() failed - %s",
				1215	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1216	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1217	}
				1218	}
				1219
				1220	int32_t resultOffsets[256];
				1221	UChar result[256];
				1222	int32_t resultLength;
				1223	UBool ok;
				1224
				1225	static const struct {
				1226	int32_t step;
				1227	const char *name;
				1228	} steps[]={
				1229	{ 0, "bulk" }, // must be first for offsets to be checked
				1230	{ 1, "step=1" },
				1231	{ 3, "step=3" },
				1232	{ 7, "step=7" },
				1233	{ -1, "getNext" },
				1234	{ -2, "toU(bulk)+getNext" },
				1235	{ -3, "getNext+toU(bulk)" },
				1236	{ -4, "toU(1)+getNext" },
				1237	{ -5, "getNext+toU(1)" },
				1238	{ -12, "toU(5)+getNext" },
				1239	{ -13, "getNext+toU(5)" },
				1240	};
				1241	int32_t i, step;
				1242
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1243	ok=true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1244	for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
				1245	step=steps[i].step;
				1246	if(step<0 && !cc.finalFlush) {
				1247	// skip ucnv_getNextUChar() if !finalFlush because
				1248	// ucnv_getNextUChar() always implies flush
				1249	continue;
				1250	}
				1251	if(step!=0) {
				1252	// bulk test is first, then offsets are not checked any more
				1253	cc.offsets=NULL;
				1254	}
				1255	else {
				1256	for (int32_t i = 0; i < UPRV_LENGTHOF(resultOffsets); i++) {
				1257	resultOffsets[i] = -1;
				1258	}
				1259	}
				1260	for (int32_t i = 0; i < UPRV_LENGTHOF(result); i++) {
				1261	result[i] = -1;
				1262	}
				1263	errorCode.reset();
				1264	resultLength=stepToUnicode(cc, cnv.getAlias(),
				1265	result, UPRV_LENGTHOF(result),
				1266	step==0 ? resultOffsets : NULL,
				1267	step, errorCode);
				1268	ok=checkToUnicode(
				1269	cc, cnv.getAlias(), steps[i].name,
				1270	result, resultLength,
				1271	cc.offsets!=NULL ? resultOffsets : NULL,
				1272	errorCode);
				1273	if(errorCode.isFailure() \|\| !cc.finalFlush) {
				1274	// reset if an error occurred or we did not flush
				1275	// otherwise do nothing to make sure that flushing resets
				1276	ucnv_resetToUnicode(cnv.getAlias());
				1277	}
				1278	if (cc.offsets != NULL && resultOffsets[resultLength] != -1) {
				1279	errln("toUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
				1280	cc.caseNr, cc.charset, resultLength);
				1281	}
				1282	if (result[resultLength] != (UChar)-1) {
				1283	errln("toUnicode[%d](%s) Conversion wrote too much to result at index %d",
				1284	cc.caseNr, cc.charset, resultLength);
				1285	}
				1286	}
				1287
				1288	// not a real loop, just a convenience for breaking out of the block
				1289	while(ok && cc.finalFlush) {
				1290	// test ucnv_toUChars()
				1291	memset(result, 0, sizeof(result));
				1292
				1293	errorCode.reset();
				1294	resultLength=ucnv_toUChars(cnv.getAlias(),
				1295	result, UPRV_LENGTHOF(result),
				1296	(const char *)cc.bytes, cc.bytesLength,
				1297	errorCode);
				1298	ok=checkToUnicode(
				1299	cc, cnv.getAlias(), "toUChars",
				1300	result, resultLength,
				1301	NULL,
				1302	errorCode);
				1303	if(!ok) {
				1304	break;
				1305	}
				1306
				1307	// test preflighting
				1308	// keep the correct result for simple checking
				1309	errorCode.reset();
				1310	resultLength=ucnv_toUChars(cnv.getAlias(),
				1311	NULL, 0,
				1312	(const char *)cc.bytes, cc.bytesLength,
				1313	errorCode);
				1314	if(errorCode.get()==U_STRING_NOT_TERMINATED_WARNING \|\| errorCode.get()==U_BUFFER_OVERFLOW_ERROR) {
				1315	errorCode.reset();
				1316	}
				1317	ok=checkToUnicode(
				1318	cc, cnv.getAlias(), "preflight toUChars",
				1319	result, resultLength,
				1320	NULL,
				1321	errorCode);
				1322	break;
				1323	}
				1324
				1325	errorCode.reset(); // all errors have already been reported
				1326	return ok;
				1327	}
				1328
				1329	UBool
				1330	ConversionTest::checkToUnicode(ConversionCase &cc, UConverter cnv, const char name,
				1331	const UChar *result, int32_t resultLength,
				1332	const int32_t *resultOffsets,
				1333	UErrorCode resultErrorCode) {
				1334	char resultInvalidChars[8];
				1335	int8_t resultInvalidLength;
				1336	UErrorCode errorCode;
				1337
				1338	const char *msg;
				1339
				1340	// reset the message; NULL will mean "ok"
				1341	msg=NULL;
				1342
				1343	errorCode=U_ZERO_ERROR;
				1344	resultInvalidLength=sizeof(resultInvalidChars);
				1345	ucnv_getInvalidChars(cnv, resultInvalidChars, &resultInvalidLength, &errorCode);
				1346	if(U_FAILURE(errorCode)) {
				1347	errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidChars() failed - %s",
				1348	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1349	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1350	}
				1351
				1352	// check everything that might have gone wrong
				1353	if(cc.unicodeLength!=resultLength) {
				1354	msg="wrong result length";
				1355	} else if(0!=u_memcmp(cc.unicode, result, cc.unicodeLength)) {
				1356	msg="wrong result string";
				1357	} else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.unicodeLengthsizeof(cc.offsets))) {
				1358	msg="wrong offsets";
				1359	} else if(cc.outErrorCode!=resultErrorCode) {
				1360	msg="wrong error code";
				1361	} else if(cc.invalidLength!=resultInvalidLength) {
				1362	msg="wrong length of last invalid input";
				1363	} else if(0!=memcmp(cc.invalidChars, resultInvalidChars, cc.invalidLength)) {
				1364	msg="wrong last invalid input";
				1365	}
				1366
				1367	if(msg==NULL) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1368	return true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1369	} else {
				1370	char buffer[2000]; // one buffer for all strings
				1371	char s, bytesString, unicodeString, resultString,
				1372	offsetsString, resultOffsetsString,
				1373	invalidCharsString, resultInvalidCharsString;
				1374
				1375	bytesString=s=buffer;
				1376	s=printBytes(cc.bytes, cc.bytesLength, bytesString);
				1377	s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString=s);
				1378	s=printUnicode(result, resultLength, resultString=s);
				1379	s=printOffsets(cc.offsets, cc.unicodeLength, offsetsString=s);
				1380	s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
				1381	s=printBytes(cc.invalidChars, cc.invalidLength, invalidCharsString=s);
				1382	s=printBytes((uint8_t *)resultInvalidChars, resultInvalidLength, resultInvalidCharsString=s);
				1383
				1384	if((s-buffer)>(int32_t)sizeof(buffer)) {
				1385	errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkToUnicode() test output buffer overflow writing %d chars\n",
				1386	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
				1387	exit(1);
				1388	}
				1389
				1390	errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
				1391	" bytes <%s>[%d]\n"
				1392	" expected <%s>[%d]\n"
				1393	" result <%s>[%d]\n"
				1394	" offsets <%s>\n"
				1395	" result offsets <%s>\n"
				1396	" error code expected %s got %s\n"
				1397	" invalidChars expected <%s> got <%s>\n",
				1398	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
				1399	bytesString, cc.bytesLength,
				1400	unicodeString, cc.unicodeLength,
				1401	resultString, resultLength,
				1402	offsetsString,
				1403	resultOffsetsString,
				1404	u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
				1405	invalidCharsString, resultInvalidCharsString);
				1406
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1407	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1408	}
				1409	}
				1410
				1411	// fromUnicode test worker functions --------------------------------------- ***
				1412
				1413	static int32_t
				1414	stepFromUTF8(ConversionCase &cc,
				1415	UConverter utf8Cnv, UConverter cnv,
				1416	char *result, int32_t resultCapacity,
				1417	int32_t step,
				1418	UErrorCode *pErrorCode) {
				1419	const char source, sourceLimit, *utf8Limit;
				1420	UChar pivotBuffer[32];
				1421	UChar pivotSource, pivotTarget, *pivotLimit;
				1422	char target, targetLimit, *resultLimit;
				1423	UBool flush;
				1424
				1425	source=cc.utf8;
				1426	pivotSource=pivotTarget=pivotBuffer;
				1427	target=result;
				1428	utf8Limit=source+cc.utf8Length;
				1429	resultLimit=result+resultCapacity;
				1430
				1431	// call ucnv_convertEx() with in/out buffers no larger than (step) at a time
				1432	// move only one buffer (in vs. out) at a time to be extra mean
				1433	// step==0 performs bulk conversion
				1434
				1435	// initialize the partial limits for the loop
				1436	if(step==0) {
				1437	// use the entire buffers
				1438	sourceLimit=utf8Limit;
				1439	targetLimit=resultLimit;
				1440	flush=cc.finalFlush;
				1441
				1442	pivotLimit=pivotBuffer+UPRV_LENGTHOF(pivotBuffer);
				1443	} else {
				1444	// start with empty partial buffers
				1445	sourceLimit=source;
				1446	targetLimit=target;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1447	flush=false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1448
				1449	// empty pivot is not allowed, make it of length step
				1450	pivotLimit=pivotBuffer+step;
				1451	}
				1452
				1453	for(;;) {
				1454	// resetting the opposite conversion direction must not affect this one
				1455	ucnv_resetFromUnicode(utf8Cnv);
				1456	ucnv_resetToUnicode(cnv);
				1457
				1458	// convert
				1459	ucnv_convertEx(cnv, utf8Cnv,
				1460	&target, targetLimit,
				1461	&source, sourceLimit,
				1462	pivotBuffer, &pivotSource, &pivotTarget, pivotLimit,
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1463	false, flush, pErrorCode);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1464
				1465	// check pointers and errors
				1466	if(source>sourceLimit \|\| target>targetLimit) {
				1467	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
				1468	break;
				1469	} else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
				1470	if(target!=targetLimit) {
				1471	// buffer overflow must only be set when the target is filled
				1472	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
				1473	break;
				1474	} else if(targetLimit==resultLimit) {
				1475	// not just a partial overflow
				1476	break;
				1477	}
				1478
				1479	// the partial target is filled, set a new limit, reset the error and continue
				1480	targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
				1481	*pErrorCode=U_ZERO_ERROR;
				1482	} else if(U_FAILURE(*pErrorCode)) {
				1483	if(pivotSource==pivotBuffer) {
				1484	// toUnicode error, should not occur
				1485	// toUnicode errors are tested in cintltst TestConvertExFromUTF8()
				1486	break;
				1487	} else {
				1488	// fromUnicode error
				1489	// some other error occurred, done
				1490	break;
				1491	}
				1492	} else {
				1493	if(source!=sourceLimit) {
				1494	// when no error occurs, then the input must be consumed
				1495	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
				1496	break;
				1497	}
				1498
				1499	if(sourceLimit==utf8Limit) {
				1500	// we are done
				1501	if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {
				1502	// ucnv_convertEx() warns about not terminating the output
				1503	// but ucnv_fromUnicode() does not and so
				1504	// checkFromUnicode() does not expect it
				1505	*pErrorCode=U_ZERO_ERROR;
				1506	}
				1507	break;
				1508	}
				1509
				1510	// the partial conversion succeeded, set a new limit and continue
				1511	sourceLimit=(utf8Limit-source)>=step ? source+step : utf8Limit;
				1512	flush=(UBool)(cc.finalFlush && sourceLimit==utf8Limit);
				1513	}
				1514	}
				1515
				1516	return (int32_t)(target-result);
				1517	}
				1518
				1519	static int32_t
				1520	stepFromUnicode(ConversionCase &cc, UConverter *cnv,
				1521	char *result, int32_t resultCapacity,
				1522	int32_t resultOffsets, / also resultCapacity */
				1523	int32_t step,
				1524	UErrorCode *pErrorCode) {
				1525	const UChar source, sourceLimit, *unicodeLimit;
				1526	char target, targetLimit, *resultLimit;
				1527	UBool flush;
				1528
				1529	source=cc.unicode;
				1530	target=result;
				1531	unicodeLimit=source+cc.unicodeLength;
				1532	resultLimit=result+resultCapacity;
				1533
				1534	// call ucnv_fromUnicode() with in/out buffers no larger than (step) at a time
				1535	// move only one buffer (in vs. out) at a time to be extra mean
				1536	// step==0 performs bulk conversion and generates offsets
				1537
				1538	// initialize the partial limits for the loop
				1539	if(step==0) {
				1540	// use the entire buffers
				1541	sourceLimit=unicodeLimit;
				1542	targetLimit=resultLimit;
				1543	flush=cc.finalFlush;
				1544	} else {
				1545	// start with empty partial buffers
				1546	sourceLimit=source;
				1547	targetLimit=target;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1548	flush=false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1549
				1550	// output offsets only for bulk conversion
				1551	resultOffsets=NULL;
				1552	}
				1553
				1554	for(;;) {
				1555	// resetting the opposite conversion direction must not affect this one
				1556	ucnv_resetToUnicode(cnv);
				1557
				1558	// convert
				1559	ucnv_fromUnicode(cnv,
				1560	&target, targetLimit,
				1561	&source, sourceLimit,
				1562	resultOffsets,
				1563	flush, pErrorCode);
				1564
				1565	// check pointers and errors
				1566	if(source>sourceLimit \|\| target>targetLimit) {
				1567	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
				1568	break;
				1569	} else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
				1570	if(target!=targetLimit) {
				1571	// buffer overflow must only be set when the target is filled
				1572	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
				1573	break;
				1574	} else if(targetLimit==resultLimit) {
				1575	// not just a partial overflow
				1576	break;
				1577	}
				1578
				1579	// the partial target is filled, set a new limit, reset the error and continue
				1580	targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
				1581	*pErrorCode=U_ZERO_ERROR;
				1582	} else if(U_FAILURE(*pErrorCode)) {
				1583	// some other error occurred, done
				1584	break;
				1585	} else {
				1586	if(source!=sourceLimit) {
				1587	// when no error occurs, then the input must be consumed
				1588	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
				1589	break;
				1590	}
				1591
				1592	if(sourceLimit==unicodeLimit) {
				1593	// we are done
				1594	break;
				1595	}
				1596
				1597	// the partial conversion succeeded, set a new limit and continue
				1598	sourceLimit=(unicodeLimit-source)>=step ? source+step : unicodeLimit;
				1599	flush=(UBool)(cc.finalFlush && sourceLimit==unicodeLimit);
				1600	}
				1601	}
				1602
				1603	return (int32_t)(target-result);
				1604	}
				1605
				1606	UBool
				1607	ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback callback, const char *option) {
				1608	UConverter *cnv;
				1609	UErrorCode errorCode;
				1610
				1611	// open the converter
				1612	errorCode=U_ZERO_ERROR;
				1613	cnv=cnv_open(cc.charset, errorCode);
				1614	if(U_FAILURE(errorCode)) {
				1615	errcheckln(errorCode, "fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
				1616	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1617	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1618	}
				1619	ucnv_resetToUnicode(utf8Cnv);
				1620
				1621	// set the callback
				1622	if(callback!=NULL) {
				1623	ucnv_setFromUCallBack(cnv, callback, option, NULL, NULL, &errorCode);
				1624	if(U_FAILURE(errorCode)) {
				1625	errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setFromUCallBack() failed - %s",
				1626	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
				1627	ucnv_close(cnv);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1628	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1629	}
				1630	}
				1631
				1632	// set the fallbacks flag
				1633	// TODO change with Jitterbug 2401, then add a similar call for toUnicode too
				1634	ucnv_setFallback(cnv, cc.fallbacks);
				1635
				1636	// set the subchar
				1637	int32_t length;
				1638
				1639	if(cc.setSub>0) {
				1640	length=(int32_t)strlen(cc.subchar);
				1641	ucnv_setSubstChars(cnv, cc.subchar, (int8_t)length, &errorCode);
				1642	if(U_FAILURE(errorCode)) {
				1643	errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstChars() failed - %s",
				1644	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
				1645	ucnv_close(cnv);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1646	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1647	}
				1648	} else if(cc.setSub<0) {
				1649	ucnv_setSubstString(cnv, cc.subString, -1, &errorCode);
				1650	if(U_FAILURE(errorCode)) {
				1651	errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstString() failed - %s",
				1652	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
				1653	ucnv_close(cnv);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1654	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1655	}
				1656	}
				1657
				1658	// convert unicode to utf8
				1659	char utf8[256];
				1660	cc.utf8=utf8;
				1661	u_strToUTF8(utf8, UPRV_LENGTHOF(utf8), &cc.utf8Length,
				1662	cc.unicode, cc.unicodeLength,
				1663	&errorCode);
				1664	if(U_FAILURE(errorCode)) {
				1665	// skip UTF-8 testing of a string with an unpaired surrogate,
				1666	// or of one that's too long
				1667	// toUnicode errors are tested in cintltst TestConvertExFromUTF8()
				1668	cc.utf8Length=-1;
				1669	}
				1670
				1671	int32_t resultOffsets[256];
				1672	char result[256];
				1673	int32_t resultLength;
				1674	UBool ok;
				1675
				1676	static const struct {
				1677	int32_t step;
				1678	const char name, utf8Name;
				1679	} steps[]={
				1680	{ 0, "bulk", "utf8" }, // must be first for offsets to be checked
				1681	{ 1, "step=1", "utf8 step=1" },
				1682	{ 3, "step=3", "utf8 step=3" },
				1683	{ 7, "step=7", "utf8 step=7" }
				1684	};
				1685	int32_t i, step;
				1686
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1687	ok=true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1688	for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
				1689	step=steps[i].step;
				1690	for (int32_t i = 0; i < UPRV_LENGTHOF(resultOffsets); i++) {
				1691	resultOffsets[i] = -1;
				1692	}
				1693	for (int32_t i = 0; i < UPRV_LENGTHOF(result); i++) {
				1694	result[i] = -1;
				1695	}
				1696	errorCode=U_ZERO_ERROR;
				1697	resultLength=stepFromUnicode(cc, cnv,
				1698	result, UPRV_LENGTHOF(result),
				1699	step==0 ? resultOffsets : NULL,
				1700	step, &errorCode);
				1701	ok=checkFromUnicode(
				1702	cc, cnv, steps[i].name,
				1703	(uint8_t *)result, resultLength,
				1704	cc.offsets!=NULL ? resultOffsets : NULL,
				1705	errorCode);
				1706	if(U_FAILURE(errorCode) \|\| !cc.finalFlush) {
				1707	// reset if an error occurred or we did not flush
				1708	// otherwise do nothing to make sure that flushing resets
				1709	ucnv_resetFromUnicode(cnv);
				1710	}
				1711	if (resultOffsets[resultLength] != -1) {
				1712	errln("fromUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
				1713	cc.caseNr, cc.charset, resultLength);
				1714	}
				1715	if (result[resultLength] != (char)-1) {
				1716	errln("fromUnicode[%d](%s) Conversion wrote too much to result at index %d",
				1717	cc.caseNr, cc.charset, resultLength);
				1718	}
				1719
				1720	// bulk test is first, then offsets are not checked any more
				1721	cc.offsets=NULL;
				1722
				1723	// test direct conversion from UTF-8
				1724	if(cc.utf8Length>=0) {
				1725	errorCode=U_ZERO_ERROR;
				1726	resultLength=stepFromUTF8(cc, utf8Cnv, cnv,
				1727	result, UPRV_LENGTHOF(result),
				1728	step, &errorCode);
				1729	ok=checkFromUnicode(
				1730	cc, cnv, steps[i].utf8Name,
				1731	(uint8_t *)result, resultLength,
				1732	NULL,
				1733	errorCode);
				1734	if(U_FAILURE(errorCode) \|\| !cc.finalFlush) {
				1735	// reset if an error occurred or we did not flush
				1736	// otherwise do nothing to make sure that flushing resets
				1737	ucnv_resetToUnicode(utf8Cnv);
				1738	ucnv_resetFromUnicode(cnv);
				1739	}
				1740	}
				1741	}
				1742
				1743	// not a real loop, just a convenience for breaking out of the block
				1744	while(ok && cc.finalFlush) {
				1745	// test ucnv_fromUChars()
				1746	memset(result, 0, sizeof(result));
				1747
				1748	errorCode=U_ZERO_ERROR;
				1749	resultLength=ucnv_fromUChars(cnv,
				1750	result, UPRV_LENGTHOF(result),
				1751	cc.unicode, cc.unicodeLength,
				1752	&errorCode);
				1753	ok=checkFromUnicode(
				1754	cc, cnv, "fromUChars",
				1755	(uint8_t *)result, resultLength,
				1756	NULL,
				1757	errorCode);
				1758	if(!ok) {
				1759	break;
				1760	}
				1761
				1762	// test preflighting
				1763	// keep the correct result for simple checking
				1764	errorCode=U_ZERO_ERROR;
				1765	resultLength=ucnv_fromUChars(cnv,
				1766	NULL, 0,
				1767	cc.unicode, cc.unicodeLength,
				1768	&errorCode);
				1769	if(errorCode==U_STRING_NOT_TERMINATED_WARNING \|\| errorCode==U_BUFFER_OVERFLOW_ERROR) {
				1770	errorCode=U_ZERO_ERROR;
				1771	}
				1772	ok=checkFromUnicode(
				1773	cc, cnv, "preflight fromUChars",
				1774	(uint8_t *)result, resultLength,
				1775	NULL,
				1776	errorCode);
				1777	break;
				1778	}
				1779
				1780	ucnv_close(cnv);
				1781	return ok;
				1782	}
				1783
				1784	UBool
				1785	ConversionTest::checkFromUnicode(ConversionCase &cc, UConverter cnv, const char name,
				1786	const uint8_t *result, int32_t resultLength,
				1787	const int32_t *resultOffsets,
				1788	UErrorCode resultErrorCode) {
				1789	UChar resultInvalidUChars[8];
				1790	int8_t resultInvalidLength;
				1791	UErrorCode errorCode;
				1792
				1793	const char *msg;
				1794
				1795	// reset the message; NULL will mean "ok"
				1796	msg=NULL;
				1797
				1798	errorCode=U_ZERO_ERROR;
				1799	resultInvalidLength=UPRV_LENGTHOF(resultInvalidUChars);
				1800	ucnv_getInvalidUChars(cnv, resultInvalidUChars, &resultInvalidLength, &errorCode);
				1801	if(U_FAILURE(errorCode)) {
				1802	errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidUChars() failed - %s",
				1803	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1804	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1805	}
				1806
				1807	// check everything that might have gone wrong
				1808	if(cc.bytesLength!=resultLength) {
				1809	msg="wrong result length";
				1810	} else if(0!=memcmp(cc.bytes, result, cc.bytesLength)) {
				1811	msg="wrong result string";
				1812	} else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.bytesLengthsizeof(cc.offsets))) {
				1813	msg="wrong offsets";
				1814	} else if(cc.outErrorCode!=resultErrorCode) {
				1815	msg="wrong error code";
				1816	} else if(cc.invalidLength!=resultInvalidLength) {
				1817	msg="wrong length of last invalid input";
				1818	} else if(0!=u_memcmp(cc.invalidUChars, resultInvalidUChars, cc.invalidLength)) {
				1819	msg="wrong last invalid input";
				1820	}
				1821
				1822	if(msg==NULL) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1823	return true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1824	} else {
				1825	char buffer[2000]; // one buffer for all strings
				1826	char s, unicodeString, bytesString, resultString,
				1827	offsetsString, resultOffsetsString,
				1828	invalidCharsString, resultInvalidUCharsString;
				1829
				1830	unicodeString=s=buffer;
				1831	s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString);
				1832	s=printBytes(cc.bytes, cc.bytesLength, bytesString=s);
				1833	s=printBytes(result, resultLength, resultString=s);
				1834	s=printOffsets(cc.offsets, cc.bytesLength, offsetsString=s);
				1835	s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
				1836	s=printUnicode(cc.invalidUChars, cc.invalidLength, invalidCharsString=s);
				1837	s=printUnicode(resultInvalidUChars, resultInvalidLength, resultInvalidUCharsString=s);
				1838
				1839	if((s-buffer)>(int32_t)sizeof(buffer)) {
				1840	errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkFromUnicode() test output buffer overflow writing %d chars\n",
				1841	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
				1842	exit(1);
				1843	}
				1844
				1845	errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
				1846	" unicode <%s>[%d]\n"
				1847	" expected <%s>[%d]\n"
				1848	" result <%s>[%d]\n"
				1849	" offsets <%s>\n"
				1850	" result offsets <%s>\n"
				1851	" error code expected %s got %s\n"
				1852	" invalidChars expected <%s> got <%s>\n",
				1853	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
				1854	unicodeString, cc.unicodeLength,
				1855	bytesString, cc.bytesLength,
				1856	resultString, resultLength,
				1857	offsetsString,
				1858	resultOffsetsString,
				1859	u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
				1860	invalidCharsString, resultInvalidUCharsString);
				1861
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1862	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1863	}
				1864	}
				1865
				1866	#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */