Blame - source/test/intltest/normconf.cpp - chromium.googlesource.com/chromium/deps/icu

blob: e2ddc0d49ea87e012cd7c6ba4275dfc8d755233c [file] [log] [blame]

Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
				2	// License & terms of use: http://www.unicode.org/copyright.html
				3	/*
				4	************************************************************************
				5	* Copyright (c) 1997-2016, International Business Machines
				6	* Corporation and others. All Rights Reserved.
				7	************************************************************************
				8	*/
				9
				10	#include "unicode/utypes.h"
				11
				12	#if !UCONFIG_NO_NORMALIZATION
				13
				14	#include <string>
				15	#include "unicode/bytestream.h"
				16	#include "unicode/edits.h"
				17	#include "unicode/uchar.h"
				18	#include "unicode/normalizer2.h"
				19	#include "unicode/normlzr.h"
				20	#include "unicode/uniset.h"
				21	#include "unicode/putil.h"
				22	#include "cmemory.h"
				23	#include "cstring.h"
				24	#include "filestrm.h"
				25	#include "normconf.h"
				26	#include "uassert.h"
				27	#include <stdio.h>
				28
				29	void NormalizerConformanceTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* /par/) {
				30	TESTCASE_AUTO_BEGIN;
				31	TESTCASE_AUTO(TestConformance);
				32	TESTCASE_AUTO(TestConformance32);
				33	TESTCASE_AUTO(TestCase6);
				34	TESTCASE_AUTO_END;
				35	}
				36
// Number of semicolon-separated columns parsed from each test line;
// used as the size of the fields[] arrays and as the count passed to hexsplit().
#define FIELD_COUNT 5
				38
				39	NormalizerConformanceTest::NormalizerConformanceTest() :
				40	normalizer(UnicodeString(), UNORM_NFC) {
				41	UErrorCode errorCode = U_ZERO_ERROR;
				42	nfc = Normalizer2::getNFCInstance(errorCode);
				43	nfd = Normalizer2::getNFDInstance(errorCode);
				44	nfkc = Normalizer2::getNFKCInstance(errorCode);
				45	nfkd = Normalizer2::getNFKDInstance(errorCode);
				46	assertSuccess("", errorCode, true, __FILE__, __LINE__);
				47	}
				48
				49	NormalizerConformanceTest::~NormalizerConformanceTest() {}
				50
				51	// more interesting conformance test cases, not in the unicode.org NormalizationTest.txt
				52	static const char *moreCases[]={
				53	// Markus 2001aug30
				54	"0061 0332 0308;00E4 0332;0061 0332 0308;00E4 0332;0061 0332 0308; # Markus 0",
				55
				56	// Markus 2001oct26 - test edge case for iteration: U+0f73.cc==0 but decomposition.lead.cc==129
				57	"0061 0301 0F73;00E1 0F71 0F72;0061 0F71 0F72 0301;00E1 0F71 0F72;0061 0F71 0F72 0301; # Markus 1"
				58	};
				59
				60	void NormalizerConformanceTest::compare(const UnicodeString& s1, const UnicodeString& s2){
				61	UErrorCode status=U_ZERO_ERROR;
				62	// TODO: Re-enable this tests after UTC fixes UAX 21
				63	if(s1.indexOf((UChar32)0x0345)>=0)return;
				64	if(Normalizer::compare(s1,s2,U_FOLD_CASE_DEFAULT,status)!=0){
				65	errln("Normalizer::compare() failed for s1: " + prettify(s1) + " s2: " +prettify(s2));
				66	}
				67	}
				68
				69	FileStream *
				70	NormalizerConformanceTest::openNormalizationTestFile(const char *filename) {
				71	char unidataPath[2000];
				72	const char *folder;
				73	FileStream *input;
				74	UErrorCode errorCode;
				75
				76	// look inside ICU_DATA first
				77	folder=pathToDataDirectory();
				78	if(folder!=NULL) {
				79	strcpy(unidataPath, folder);
				80	strcat(unidataPath, "unidata" U_FILE_SEP_STRING);
				81	strcat(unidataPath, filename);
				82	input=T_FileStream_open(unidataPath, "rb");
				83	if(input!=NULL) {
				84	return input;
				85	}
				86	}
				87
				88	// find icu/source/data/unidata relative to the test data
				89	errorCode=U_ZERO_ERROR;
				90	folder=loadTestData(errorCode);
				91	if(U_SUCCESS(errorCode)) {
				92	strcpy(unidataPath, folder);
				93	strcat(unidataPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".."
				94	U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".."
				95	U_FILE_SEP_STRING "data" U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
				96	strcat(unidataPath, filename);
				97	input=T_FileStream_open(unidataPath, "rb");
				98	if(input!=NULL) {
				99	return input;
				100	}
				101	}
				102
				103	// look in icu/source/test/testdata/out/build
				104	errorCode=U_ZERO_ERROR;
				105	folder=loadTestData(errorCode);
				106	if(U_SUCCESS(errorCode)) {
				107	strcpy(unidataPath, folder);
				108	strcat(unidataPath, U_FILE_SEP_STRING);
				109	strcat(unidataPath, filename);
				110	input=T_FileStream_open(unidataPath, "rb");
				111	if(input!=NULL) {
				112	return input;
				113	}
				114	}
				115
				116	// look in icu/source/test/testdata
				117	errorCode=U_ZERO_ERROR;
				118	folder=loadTestData(errorCode);
				119	if(U_SUCCESS(errorCode)) {
				120	strcpy(unidataPath, folder);
				121	strcat(unidataPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING);
				122	strcat(unidataPath, filename);
				123	input=T_FileStream_open(unidataPath, "rb");
				124	if(input!=NULL) {
				125	return input;
				126	}
				127	}
				128
				129	// find icu/source/data/unidata relative to U_TOPSRCDIR
				130	#if defined(U_TOPSRCDIR)
				131	strcpy(unidataPath, U_TOPSRCDIR U_FILE_SEP_STRING "data" U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
				132	strcat(unidataPath, filename);
				133	input=T_FileStream_open(unidataPath, "rb");
				134	if(input!=NULL) {
				135	return input;
				136	}
				137
				138	strcpy(unidataPath, U_TOPSRCDIR U_FILE_SEP_STRING "test" U_FILE_SEP_STRING "testdata" U_FILE_SEP_STRING);
				139	strcat(unidataPath, filename);
				140	input=T_FileStream_open(unidataPath, "rb");
				141	if(input!=NULL) {
				142	return input;
				143	}
				144	#endif
				145
				146	dataerrln("Failed to open %s", filename);
				147	return NULL;
				148	}
				149
				150	/**
				151	* Test the conformance of Normalizer to
				152	* http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
				153	*/
				154	void NormalizerConformanceTest::TestConformance() {
				155	TestConformance(openNormalizationTestFile("NormalizationTest.txt"), 0);
				156	}
				157
				158	void NormalizerConformanceTest::TestConformance32() {
				159	TestConformance(openNormalizationTestFile("NormalizationTest-3.2.0.txt"), UNORM_UNICODE_3_2);
				160	}
				161
				162	void NormalizerConformanceTest::TestConformance(FileStream *input, int32_t options) {
				163	enum { BUF_SIZE = 1024 };
				164	char lineBuf[BUF_SIZE];
				165	UnicodeString fields[FIELD_COUNT];
				166	UErrorCode status = U_ZERO_ERROR;
				167	int32_t passCount = 0;
				168	int32_t failCount = 0;
				169	UChar32 c;
				170
				171	if(input==NULL) {
				172	return;
				173	}
				174
				175	// UnicodeSet for all code points that are not mentioned in NormalizationTest.txt
				176	UnicodeSet other(0, 0x10ffff);
				177
				178	int32_t count, countMoreCases = UPRV_LENGTHOF(moreCases);
				179	for (count = 1;;++count) {
				180	if (!T_FileStream_eof(input)) {
				181	T_FileStream_readLine(input, lineBuf, (int32_t)sizeof(lineBuf));
				182	} else {
				183	// once NormalizationTest.txt is finished, use moreCases[]
				184	if(count > countMoreCases) {
				185	count = 0;
				186	} else if(count == countMoreCases) {
				187	// all done
				188	break;
				189	}
				190	uprv_strcpy(lineBuf, moreCases[count]);
				191	}
				192	if (lineBuf[0] == 0 \|\| lineBuf[0] == '\n' \|\| lineBuf[0] == '\r') continue;
				193
				194	// Expect 5 columns of this format:
				195	// 1E0C;1E0C;0044 0323;1E0C;0044 0323; # <comments>
				196
				197	// Parse out the comment.
				198	if (lineBuf[0] == '#') continue;
				199
				200	// Read separator lines starting with '@'
				201	if (lineBuf[0] == '@') {
				202	logln(lineBuf);
				203	continue;
				204	}
				205
				206	// Parse out the fields
				207	if (!hexsplit(lineBuf, ';', fields, FIELD_COUNT)) {
				208	errln((UnicodeString)"Unable to parse line " + count);
				209	break; // Syntax error
				210	}
				211
				212	// Remove a single code point from the "other" UnicodeSet
				213	if(fields[0].length()==fields[0].moveIndex32(0, 1)) {
				214	c=fields[0].char32At(0);
				215	if(0xac20<=c && c<=0xd73f && quick) {
				216	// not an exhaustive test run: skip most Hangul syllables
				217	if(c==0xac20) {
				218	other.remove(0xac20, 0xd73f);
				219	}
				220	continue;
				221	}
				222	other.remove(c);
				223	}
				224
				225	if (checkConformance(fields, lineBuf, options, status)) {
				226	++passCount;
				227	} else {
				228	++failCount;
				229	if(status == U_FILE_ACCESS_ERROR) {
				230	dataerrln("Something is wrong with the normalizer, skipping the rest of the test.");
				231	break;
				232	}
				233	}
				234	if ((count % 1000) == 0) {
				235	logln("Line %d", count);
				236	}
				237	}
				238
				239	T_FileStream_close(input);
				240
				241	/*
				242	* Test that all characters that are not mentioned
				243	* as single code points in column 1
				244	* do not change under any normalization.
				245	*/
				246
				247	// remove U+ffff because that is the end-of-iteration sentinel value
				248	other.remove(0xffff);
				249
				250	for(c=0; c<=0x10ffff; quick ? c+=113 : ++c) {
				251	if(0x30000<=c && c<0xe0000) {
				252	c=0xe0000;
				253	}
				254	if(!other.contains(c)) {
				255	continue;
				256	}
				257
				258	fields[0]=fields[1]=fields[2]=fields[3]=fields[4].setTo(c);
				259	sprintf(lineBuf, "not mentioned code point U+%04lx", (long)c);
				260
				261	if (checkConformance(fields, lineBuf, options, status)) {
				262	++passCount;
				263	} else {
				264	++failCount;
				265	if(status == U_FILE_ACCESS_ERROR) {
				266	dataerrln("Something is wrong with the normalizer, skipping the rest of the test.: %s", u_errorName(status));
				267	break;
				268	}
				269	}
				270	if ((c % 0x1000) == 0) {
				271	logln("Code point U+%04lx", c);
				272	}
				273	}
				274
				275	if (failCount != 0) {
				276	dataerrln((UnicodeString)"Total: " + failCount + " lines/code points failed, " +
				277	passCount + " lines/code points passed");
				278	} else {
				279	logln((UnicodeString)"Total: " + passCount + " lines/code points passed");
				280	}
				281	}
				282
				283	namespace {
				284
				285	UBool isNormalizedUTF8(const Normalizer2 *norm2, const UnicodeString &s, UErrorCode &errorCode) {
				286	if (norm2 == nullptr) {
				287	return true;
				288	}
				289	std::string s8;
				290	return norm2->isNormalizedUTF8(s.toUTF8String(s8), errorCode);
				291	}
				292
				293	} // namespace
				294
				295	/**
				296	* Verify the conformance of the given line of the Unicode
				297	* normalization (UTR 15) test suite file. For each line,
				298	* there are five columns, corresponding to field[0]..field[4].
				299	*
				300	* The following invariants must be true for all conformant implementations
				301	* c2 == NFC(c1) == NFC(c2) == NFC(c3)
				302	* c3 == NFD(c1) == NFD(c2) == NFD(c3)
				303	* c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
				304	* c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
				305	*
				306	* @param field the 5 columns
				307	* @param line the source line from the test suite file
				308	* @return true if the test passes
				309	*/
				310	UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field,
				311	const char *line,
				312	int32_t options,
				313	UErrorCode &status) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	314	UBool pass = true, result;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	315	UnicodeString out, fcd;
				316	int32_t fieldNum;
				317
				318	for (int32_t i=0; i<FIELD_COUNT; ++i) {
				319	fieldNum = i+1;
				320	if (i<3) {
				321	pass &= checkNorm(UNORM_NFC, options, nfc, field[i], field[1], fieldNum);
				322	pass &= checkNorm(UNORM_NFD, options, nfd, field[i], field[2], fieldNum);
				323	}
				324	pass &= checkNorm(UNORM_NFKC, options, nfkc, field[i], field[3], fieldNum);
				325	pass &= checkNorm(UNORM_NFKD, options, nfkd, field[i], field[4], fieldNum);
				326	}
				327	compare(field[1],field[2]);
				328	compare(field[0],field[1]);
				329	// test quick checks
				330	if(UNORM_NO == Normalizer::quickCheck(field[1], UNORM_NFC, options, status)) {
				331	errln("Normalizer error: quickCheck(NFC(s), UNORM_NFC) is UNORM_NO");
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	332	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	333	}
				334	if(UNORM_NO == Normalizer::quickCheck(field[2], UNORM_NFD, options, status)) {
				335	errln("Normalizer error: quickCheck(NFD(s), UNORM_NFD) is UNORM_NO");
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	336	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	337	}
				338	if(UNORM_NO == Normalizer::quickCheck(field[3], UNORM_NFKC, options, status)) {
				339	errln("Normalizer error: quickCheck(NFKC(s), UNORM_NFKC) is UNORM_NO");
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	340	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	341	}
				342	if(UNORM_NO == Normalizer::quickCheck(field[4], UNORM_NFKD, options, status)) {
				343	errln("Normalizer error: quickCheck(NFKD(s), UNORM_NFKD) is UNORM_NO");
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	344	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	345	}
				346
				347	// branch on options==0 for better code coverage
				348	if(options==0) {
				349	result = Normalizer::isNormalized(field[1], UNORM_NFC, status);
				350	} else {
				351	result = Normalizer::isNormalized(field[1], UNORM_NFC, options, status);
				352	}
				353	if(!result) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	354	dataerrln("Normalizer error: isNormalized(NFC(s), UNORM_NFC) is false");
				355	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	356	}
				357	if(options==0 && !isNormalizedUTF8(nfc, field[1], status)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	358	dataerrln("Normalizer error: nfc.isNormalizedUTF8(NFC(s)) is false");
				359	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	360	}
				361	if(field[0]!=field[1]) {
				362	if(Normalizer::isNormalized(field[0], UNORM_NFC, options, status)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	363	errln("Normalizer error: isNormalized(s, UNORM_NFC) is true");
				364	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	365	}
				366	if(isNormalizedUTF8(nfc, field[0], status)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	367	errln("Normalizer error: nfc.isNormalizedUTF8(s) is true");
				368	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	369	}
				370	}
				371	if(options==0 && !isNormalizedUTF8(nfd, field[2], status)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	372	dataerrln("Normalizer error: nfd.isNormalizedUTF8(NFD(s)) is false");
				373	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	374	}
				375	if(!Normalizer::isNormalized(field[3], UNORM_NFKC, options, status)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	376	dataerrln("Normalizer error: isNormalized(NFKC(s), UNORM_NFKC) is false");
				377	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	378	} else {
				379	if(options==0 && !isNormalizedUTF8(nfkc, field[3], status)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	380	dataerrln("Normalizer error: nfkc.isNormalizedUTF8(NFKC(s)) is false");
				381	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	382	}
				383	if(field[0]!=field[3]) {
				384	if(Normalizer::isNormalized(field[0], UNORM_NFKC, options, status)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	385	errln("Normalizer error: isNormalized(s, UNORM_NFKC) is true");
				386	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	387	}
				388	if(options==0 && isNormalizedUTF8(nfkc, field[0], status)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	389	errln("Normalizer error: nfkc.isNormalizedUTF8(s) is true");
				390	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	391	}
				392	}
				393	}
				394	if(options==0 && !isNormalizedUTF8(nfkd, field[4], status)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	395	dataerrln("Normalizer error: nfkd.isNormalizedUTF8(NFKD(s)) is false");
				396	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	397	}
				398
				399	// test FCD quick check and "makeFCD"
				400	Normalizer::normalize(field[0], UNORM_FCD, options, fcd, status);
				401	if(UNORM_NO == Normalizer::quickCheck(fcd, UNORM_FCD, options, status)) {
				402	errln("Normalizer error: quickCheck(FCD(s), UNORM_FCD) is UNORM_NO");
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	403	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	404	}
				405	if(UNORM_NO == Normalizer::quickCheck(field[2], UNORM_FCD, options, status)) {
				406	errln("Normalizer error: quickCheck(NFD(s), UNORM_FCD) is UNORM_NO");
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	407	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	408	}
				409	if(UNORM_NO == Normalizer::quickCheck(field[4], UNORM_FCD, options, status)) {
				410	errln("Normalizer error: quickCheck(NFKD(s), UNORM_FCD) is UNORM_NO");
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	411	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	412	}
				413
				414	Normalizer::normalize(fcd, UNORM_NFD, options, out, status);
				415	if(out != field[2]) {
				416	dataerrln("Normalizer error: NFD(FCD(s))!=NFD(s)");
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	417	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	418	}
				419
				420	if (U_FAILURE(status)) {
				421	dataerrln("Normalizer::normalize returned error status: %s", u_errorName(status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	422	pass = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	423	}
				424
				425	if(field[0]!=field[2]) {
				426	// two strings that are canonically equivalent must test
				427	// equal under a canonical caseless match
				428	// see UAX #21 Case Mappings and Jitterbug 2021 and
				429	// Unicode Technical Committee meeting consensus 92-C31
				430	int32_t rc;
				431
				432	status=U_ZERO_ERROR;
				433	rc=Normalizer::compare(field[0], field[2], (options<<UNORM_COMPARE_NORM_OPTIONS_SHIFT)\|U_COMPARE_IGNORE_CASE, status);
				434	if(U_FAILURE(status)) {
				435	dataerrln("Normalizer::compare(case-insensitive) sets %s", u_errorName(status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	436	pass=false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	437	} else if(rc!=0) {
				438	errln("Normalizer::compare(original, NFD, case-insensitive) returned %d instead of 0 for equal", rc);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	439	pass=false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	440	}
				441	}
				442
				443	if (!pass) {
				444	dataerrln("FAIL: %s", line);
				445	}
				446	return pass;
				447	}
				448
				449	static const char *const kModeStrings[UNORM_MODE_COUNT] = {
				450	"?", "none", "D", "KD", "C", "KC", "FCD"
				451	};
				452
				453	static const char *const kMessages[UNORM_MODE_COUNT] = {
				454	"?!=?", "?!=?", "c3!=D(c%d)", "c5!=KC(c%d)", "c2!=C(c%d)", "c4!=KC(c%d)", "FCD"
				455	};
				456
				457	UBool NormalizerConformanceTest::checkNorm(UNormalizationMode mode, int32_t options,
				458	const Normalizer2 *norm2,
				459	const UnicodeString &s, const UnicodeString &exp,
				460	int32_t field) {
				461	const char *modeString = kModeStrings[mode];
				462	char msg[20];
				463	snprintf(msg, sizeof(msg), kMessages[mode], field);
				464	UnicodeString out;
				465	UErrorCode errorCode = U_ZERO_ERROR;
				466	Normalizer::normalize(s, mode, options, out, errorCode);
				467	if (U_FAILURE(errorCode)) {
				468	dataerrln("Error running normalize UNORM_NF%s: %s", modeString, u_errorName(errorCode));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	469	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	470	}
				471	if (!assertEqual(modeString, "", s, out, exp, msg)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	472	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	473	}
				474
				475	iterativeNorm(s, mode, options, out, +1);
				476	if (!assertEqual(modeString, "(+1)", s, out, exp, msg)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	477	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	478	}
				479
				480	iterativeNorm(s, mode, options, out, -1);
				481	if (!assertEqual(modeString, "(-1)", s, out, exp, msg)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	482	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	483	}
				484
				485	if (norm2 == nullptr \|\| options != 0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	486	return true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	487	}
				488
				489	std::string s8;
				490	s.toUTF8String(s8);
				491	std::string exp8;
				492	exp.toUTF8String(exp8);
				493	std::string out8;
				494	Edits edits;
				495	Edits *editsPtr = mode != UNORM_FCD ? &edits : nullptr;
				496	StringByteSink<std::string> sink(&out8, static_cast<int32_t>(exp8.length()));
				497	norm2->normalizeUTF8(0, s8, sink, editsPtr, errorCode);
				498	if (U_FAILURE(errorCode)) {
				499	errln("Normalizer2.%s.normalizeUTF8(%s) failed: %s",
				500	modeString, s8.c_str(), u_errorName(errorCode));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	501	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	502	}
				503	if (out8 != exp8) {
				504	errln("Normalizer2.%s.normalizeUTF8(%s)=%s != %s",
				505	modeString, s8.c_str(), out8.c_str(), exp8.c_str());
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	506	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	507	}
				508	if (editsPtr == nullptr) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	509	return true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	510	}
				511
				512	// Do the Edits cover the entire input & output?
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	513	UBool pass = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	514	pass &= assertEquals("edits.hasChanges()", (UBool)(s8 != out8), edits.hasChanges());
				515	pass &= assertEquals("edits.lengthDelta()",
				516	(int32_t)(out8.length() - s8.length()), edits.lengthDelta());
				517	Edits::Iterator iter = edits.getCoarseIterator();
				518	while (iter.next(errorCode)) {}
				519	pass &= assertEquals("edits source length", static_cast<int32_t>(s8.length()), iter.sourceIndex());
				520	pass &= assertEquals("edits destination length", static_cast<int32_t>(out8.length()), iter.destinationIndex());
				521	return pass;
				522	}
				523
				524	/**
				525	* Do a normalization using the iterative API in the given direction.
				526	* @param dir either +1 or -1
				527	*/
				528	void NormalizerConformanceTest::iterativeNorm(const UnicodeString& str,
				529	UNormalizationMode mode, int32_t options,
				530	UnicodeString& result,
				531	int8_t dir) {
				532	UErrorCode status = U_ZERO_ERROR;
				533	normalizer.setText(str, status);
				534	normalizer.setMode(mode);
				535	normalizer.setOption(-1, 0); // reset all options
				536	normalizer.setOption(options, 1); // set desired options
				537	result.truncate(0);
				538	if (U_FAILURE(status)) {
				539	return;
				540	}
				541	UChar32 ch;
				542	if (dir > 0) {
				543	for (ch = normalizer.first(); ch != Normalizer::DONE;
				544	ch = normalizer.next()) {
				545	result.append(ch);
				546	}
				547	} else {
				548	for (ch = normalizer.last(); ch != Normalizer::DONE;
				549	ch = normalizer.previous()) {
				550	result.insert(0, ch);
				551	}
				552	}
				553	}
				554
				555	UBool NormalizerConformanceTest::assertEqual(const char op, const char op2,
				556	const UnicodeString& s,
				557	const UnicodeString& got,
				558	const UnicodeString& exp,
				559	const char *msg) {
				560	if (exp == got)
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	561	return true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	562
				563	char sChars, gotChars, *expChars;
				564	UnicodeString sPretty(prettify(s));
				565	UnicodeString gotPretty(prettify(got));
				566	UnicodeString expPretty(prettify(exp));
				567
				568	sChars = new char[sPretty.length() + 1];
				569	gotChars = new char[gotPretty.length() + 1];
				570	expChars = new char[expPretty.length() + 1];
				571
				572	sPretty.extract(0, sPretty.length(), sChars, sPretty.length() + 1);
				573	sChars[sPretty.length()] = 0;
				574	gotPretty.extract(0, gotPretty.length(), gotChars, gotPretty.length() + 1);
				575	gotChars[gotPretty.length()] = 0;
				576	expPretty.extract(0, expPretty.length(), expChars, expPretty.length() + 1);
				577	expChars[expPretty.length()] = 0;
				578
				579	errln(" %s: %s%s(%s)=%s, exp. %s", msg, op, op2, sChars, gotChars, expChars);
				580
				581	delete []sChars;
				582	delete []gotChars;
				583	delete []expChars;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	584	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	585	}
				586
				587	/**
				588	* Split a string into pieces based on the given delimiter
				589	* character. Then, parse the resultant fields from hex into
				590	* characters. That is, "0040 0400;0C00;0899" -> new String[] {
				591	* "\u0040\u0400", "\u0C00", "\u0899" }. The output is assumed to
				592	* be of the proper length already, and exactly output.length
				593	* fields are parsed. If there are too few an exception is
				594	* thrown. If there are too many the extras are ignored.
				595	*
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	596	* @return false upon failure
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	597	*/
				598	UBool NormalizerConformanceTest::hexsplit(const char *s, char delimiter,
				599	UnicodeString output[], int32_t outputLength) {
				600	const char *t = s;
				601	char *end = NULL;
				602	UChar32 c;
				603	int32_t i;
				604	for (i=0; i<outputLength; ++i) {
				605	// skip whitespace
				606	while(t == ' ' \|\| t == '\t') {
				607	++t;
				608	}
				609
				610	// read a sequence of code points
				611	output[i].remove();
				612	for(;;) {
				613	c = (UChar32)uprv_strtoul(t, &end, 16);
				614
				615	if( (char *)t == end \|\|
				616	(uint32_t)c > 0x10ffff \|\|
				617	(end != ' ' && end != '\t' && *end != delimiter)
				618	) {
				619	errln(UnicodeString("Bad field ", "") + (i + 1) + " in " + UnicodeString(s, ""));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	620	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	621	}
				622
				623	output[i].append(c);
				624
				625	t = (const char *)end;
				626
				627	// skip whitespace
				628	while(t == ' ' \|\| t == '\t') {
				629	++t;
				630	}
				631
				632	if(*t == delimiter) {
				633	++t;
				634	break;
				635	}
				636	if(*t == 0) {
				637	if((i + 1) == outputLength) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	638	return true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	639	} else {
				640	errln(UnicodeString("Missing field(s) in ", "") + s + " only " + (i + 1) + " out of " + outputLength);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	641	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	642	}
				643	}
				644	}
				645	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	646	return true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	647	}
				648
				649	// Specific tests for debugging. These are generally failures taken from
				650	// the conformance file, but culled out to make debugging easier.
				651
				652	void NormalizerConformanceTest::TestCase6(void) {
				653	_testOneLine("0385;0385;00A8 0301;0020 0308 0301;0020 0308 0301;");
				654	}
				655
				656	void NormalizerConformanceTest::_testOneLine(const char *line) {
				657	UErrorCode status = U_ZERO_ERROR;
				658	UnicodeString fields[FIELD_COUNT];
				659	if (!hexsplit(line, ';', fields, FIELD_COUNT)) {
				660	errln((UnicodeString)"Unable to parse line " + line);
				661	} else {
				662	checkConformance(fields, line, 0, status);
				663	}
				664	}
				665
				666	#endif /* #if !UCONFIG_NO_NORMALIZATION */