Blame - source/test/intltest/csdetest.cpp - chromium.googlesource.com/chromium/deps/icu

blob: dc5e7a8699b1851d4d6ebb7891100d5c1aa32d5c [file] [log] [blame]

Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
				2	// License & terms of use: http://www.unicode.org/copyright.html
				3	/*
				4	**********************************************************************
				5	* Copyright (C) 2005-2016, International Business Machines
				6	* Corporation and others. All Rights Reserved.
				7	**********************************************************************
				8	*/
				9
				10
				11	#include "unicode/utypes.h"
				12	#include "unicode/ucsdet.h"
				13	#include "unicode/ucnv.h"
				14	#include "unicode/unistr.h"
				15	#include "unicode/putil.h"
				16	#include "unicode/uniset.h"
				17
				18	#include "intltest.h"
				19	#include "csdetest.h"
				20
				21	#include "xmlparser.h"
				22
				23	#include <memory>
				24	#include <stdlib.h>
				25	#include <string.h>
				26
				27	#ifdef DEBUG_DETECT
				28	#include <stdio.h>
				29	#endif
				30
				31
				32	#define CH_SPACE 0x0020
				33	#define CH_SLASH 0x002F
				34
				35	#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
				36	if (!(x)) { \
				37	errln("Failure in file %s, line %d", __FILE__, __LINE__); \
				38	} \
				39	} UPRV_BLOCK_MACRO_END
				40
				41	#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
				42	if (U_FAILURE(errcode)) { \
				43	errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
				44	return; \
				45	} \
				46	} UPRV_BLOCK_MACRO_END
				47
				48
				49	//---------------------------------------------------------------------------
				50	//
				51	// Test class boilerplate
				52	//
				53	//---------------------------------------------------------------------------
				54	CharsetDetectionTest::CharsetDetectionTest()
				55	{
				56	}
				57
				58
				59	CharsetDetectionTest::~CharsetDetectionTest()
				60	{
				61	}
				62
				63
				64
				65	void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /par/ )
				66	{
				67	if (exec) logln("TestSuite CharsetDetectionTest: ");
				68	switch (index) {
				69	case 0: name = "ConstructionTest";
				70	if (exec) ConstructionTest();
				71	break;
				72
				73	case 1: name = "UTF8Test";
				74	if (exec) UTF8Test();
				75	break;
				76
				77	case 2: name = "UTF16Test";
				78	if (exec) UTF16Test();
				79	break;
				80
				81	case 3: name = "C1BytesTest";
				82	if (exec) C1BytesTest();
				83	break;
				84
				85	case 4: name = "InputFilterTest";
				86	if (exec) InputFilterTest();
				87	break;
				88
				89	case 5: name = "DetectionTest";
				90	if (exec) DetectionTest();
				91	break;
				92	#if !UCONFIG_NO_LEGACY_CONVERSION
				93	case 6: name = "IBM424Test";
				94	if (exec) IBM424Test();
				95	break;
				96
				97	case 7: name = "IBM420Test";
				98	if (exec) IBM420Test();
				99	break;
				100	#else
				101	case 6:
				102	case 7: name = "skip"; break;
				103	#endif
				104	case 8: name = "Ticket6394Test";
				105	if (exec) Ticket6394Test();
				106	break;
				107
				108	case 9: name = "Ticket6954Test";
				109	if (exec) Ticket6954Test();
				110	break;
				111
Frank Tang	d2858cb	2022-04-08 20:34:12 -0700	[diff] [blame]	112	case 10: name = "Ticket21823Test";
				113	if (exec) Ticket21823Test();
				114	break;
				115
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	116	default: name = "";
				117	break; //needed to end loop
				118	}
				119	}
				120
				121	static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
				122	{
				123	int32_t offset = -1;
				124
				125	splits = 1;
				126	while((offset = src.indexOf(ch, offset + 1)) >= 0) {
				127	splits += 1;
				128	}
				129
				130	UnicodeString *result = new UnicodeString[splits];
				131
				132	int32_t start = 0;
				133	int32_t split = 0;
				134	int32_t end;
				135
				136	while((end = src.indexOf(ch, start)) >= 0) {
				137	src.extractBetween(start, end, result[split++]);
				138	start = end + 1;
				139	}
				140
				141	src.extractBetween(start, src.length(), result[split]);
				142
				143	return result;
				144	}
				145
				146	static char extractBytes(const UnicodeString &source, const char codepage, int32_t &length)
				147	{
				148	int32_t sLength = source.length();
				149	char *bytes = NULL;
				150
				151	length = source.extract(0, sLength, NULL, codepage);
				152
				153	if (length > 0) {
				154	bytes = new char[length + 1];
				155	source.extract(0, sLength, bytes, codepage);
				156	}
				157
				158	return bytes;
				159	}
				160
				161	void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
				162	{
				163	int32_t splits = 0;
				164	int32_t testLength = testString.length();
				165	std::unique_ptr<UnicodeString []> eSplit(split(encoding, CH_SLASH, splits));
				166	UErrorCode status = U_ZERO_ERROR;
				167	int32_t cpLength = eSplit[0].length();
				168	char codepage[64];
				169
				170	u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
				171	codepage[cpLength] = '\0';
				172
				173	LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
				174
				175	int32_t byteLength = 0;
				176	std::unique_ptr<char []> bytes(extractBytes(testString, codepage, byteLength));
				177
				178	if (! bytes) {
				179	#if !UCONFIG_NO_LEGACY_CONVERSION
				180	dataerrln("Can't open a " + encoding + " converter for " + id);
				181	#endif
				182	return;
				183	}
				184
				185	ucsdet_setText(csd.getAlias(), bytes.get(), byteLength, &status);
				186
				187	int32_t matchCount = 0;
				188	const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
				189
				190
				191	UnicodeString name(ucsdet_getName(matches[0], &status));
				192	UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
				193	UChar *decoded = NULL;
				194	int32_t dLength = 0;
				195
				196	if (matchCount == 0) {
				197	errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
				198	return;
				199	}
				200
				201	if (name.compare(eSplit[0]) != 0) {
				202	errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
				203
				204	#ifdef DEBUG_DETECT
				205	for (int32_t m = 0; m < matchCount; m += 1) {
				206	const char *name = ucsdet_getName(matches[m], &status);
				207	const char *lang = ucsdet_getLanguage(matches[m], &status);
				208	int32_t confidence = ucsdet_getConfidence(matches[m], &status);
				209
				210	printf("%s (%s) %d\n", name, lang, confidence);
				211	}
				212	#endif
				213	return;
				214	}
				215
				216	if (splits > 1 && lang.compare(eSplit[1]) != 0) {
				217	errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
				218	return;
				219	}
				220
				221	decoded = new UChar[testLength];
				222	dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
				223
				224	if (testString.compare(decoded, dLength) != 0) {
				225	errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yield the original string.");
				226
				227	#ifdef DEBUG_DETECT
				228	for(int32_t i = 0; i < testLength; i += 1) {
				229	if(testString[i] != decoded[i]) {
				230	printf("Strings differ at byte %d\n", i);
				231	break;
				232	}
				233	}
				234	#endif
				235
				236	}
				237
				238	delete[] decoded;
				239	}
				240
				241	const char CharsetDetectionTest::getPath(char buffer[2048], const char filename) {
				242	UErrorCode status = U_ZERO_ERROR;
				243	const char *testDataDirectory = IntlTest::getSourceTestData(status);
				244
				245	if (U_FAILURE(status)) {
				246	errln("ERROR: getPath() failed - %s", u_errorName(status));
				247	return NULL;
				248	}
				249
				250	strcpy(buffer, testDataDirectory);
				251	strcat(buffer, filename);
				252	return buffer;
				253	}
				254
				255	void CharsetDetectionTest::ConstructionTest()
				256	{
				257	IcuTestErrorCode status(*this, "ConstructionTest");
				258	LocalUCharsetDetectorPointer csd(ucsdet_open(status));
				259	LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
				260	int32_t count = uenum_count(e.getAlias(), status);
				261
				262	#ifdef DEBUG_DETECT
				263	printf("There are %d recognizers.\n", count);
				264	#endif
				265
				266	for(int32_t i = 0; i < count; i += 1) {
				267	int32_t length;
				268	const char *name = uenum_next(e.getAlias(), &length, status);
				269
				270	if(name == NULL \|\| length <= 0) {
				271	errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
				272	}
				273
				274	#ifdef DEBUG_DETECT
				275	printf("%s\n", name);
				276	#endif
				277	}
				278
				279	const char* defDisabled[] = {
				280	"IBM420_rtl", "IBM420_ltr",
				281	"IBM424_rtl", "IBM424_ltr",
				282	0
				283	};
				284
				285	LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
				286	const char *activeName = NULL;
				287
				288	while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
				289	// the charset must be included in all list
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	290	UBool found = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	291
				292	const char *name = NULL;
				293	uenum_reset(e.getAlias(), status);
				294	while ((name = uenum_next(e.getAlias(), NULL, status))) {
				295	if (strcmp(activeName, name) == 0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	296	found = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	297	break;
				298	}
				299	}
				300
				301	if (!found) {
				302	errln(UnicodeString(activeName) + " is not included in the all charset list.");
				303	}
				304
				305	// some charsets are disabled by default
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	306	found = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	307	for (int32_t i = 0; defDisabled[i] != 0; i++) {
				308	if (strcmp(activeName, defDisabled[i]) == 0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	309	found = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	310	break;
				311	}
				312	}
				313	if (found) {
				314	errln(UnicodeString(activeName) + " should not be included in the default charset list.");
				315	}
				316	}
				317	}
				318
				319	void CharsetDetectionTest::UTF8Test()
				320	{
				321	UErrorCode status = U_ZERO_ERROR;
				322	UnicodeString ss = "This is a string with some non-ascii characters that will "
				323	"be converted to UTF-8, then shoved through the detection process. "
				324	"\\u0391\\u0392\\u0393\\u0394\\u0395"
				325	"Sure would be nice if our source could contain Unicode directly!";
				326	UnicodeString s = ss.unescape();
				327	int32_t byteLength = 0, sLength = s.length();
				328	char *bytes = extractBytes(s, "UTF-8", byteLength);
				329	UCharsetDetector *csd = ucsdet_open(&status);
				330	const UCharsetMatch *match;
				331	UChar *detected = new UChar[sLength];
				332
				333	ucsdet_setText(csd, bytes, byteLength, &status);
				334	match = ucsdet_detect(csd, &status);
				335
				336	if (match == NULL) {
				337	errln("Detection failure for UTF-8: got no matches.");
				338	goto bail;
				339	}
				340
				341	ucsdet_getUChars(match, detected, sLength, &status);
				342
				343	if (s.compare(detected, sLength) != 0) {
				344	errln("Round-trip test failed!");
				345	}
				346
				347	ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
				348
				349	bail:
				350	delete[] detected;
				351	delete[] bytes;
				352	ucsdet_close(csd);
				353	}
				354
				355	void CharsetDetectionTest::UTF16Test()
				356	{
				357	UErrorCode status = U_ZERO_ERROR;
				358	/* Notice the BOM on the start of this string */
				359	UChar chars[] = {
				360	0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
				361	0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
				362	0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
				363	0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
				364	0x064a, 0x062a, 0x0000};
				365	UnicodeString s(chars);
				366	int32_t beLength = 0, leLength = 0;
				367	std::unique_ptr<char []>beBytes(extractBytes(s, "UTF-16BE", beLength));
				368	std::unique_ptr<char []>leBytes(extractBytes(s, "UTF-16LE", leLength));
				369	LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
				370	const UCharsetMatch *match;
				371	const char *name;
				372	int32_t conf;
				373
				374	ucsdet_setText(csd.getAlias(), beBytes.get(), beLength, &status);
				375	match = ucsdet_detect(csd.getAlias(), &status);
				376
				377	if (match == NULL) {
				378	errln("Encoding detection failure for UTF-16BE: got no matches.");
				379	} else {
				380
				381	name = ucsdet_getName(match, &status);
				382	conf = ucsdet_getConfidence(match, &status);
				383
				384	if (strcmp(name, "UTF-16BE") != 0) {
				385	errln("Encoding detection failure for UTF-16BE: got %s", name);
				386	} else if (conf != 100) {
				387	errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
				388	}
				389	}
				390
				391	ucsdet_setText(csd.getAlias(), leBytes.get(), leLength, &status);
				392	match = ucsdet_detect(csd.getAlias(), &status);
				393
				394	if (match == NULL) {
				395	errln("Encoding detection failure for UTF-16LE: got no matches.");
				396	return;
				397	}
				398
				399	name = ucsdet_getName(match, &status);
				400	conf = ucsdet_getConfidence(match, &status);
				401
				402	if (strcmp(name, "UTF-16LE") != 0) {
				403	errln("Encoding detection failure for UTF-16LE: got %s", name);
				404	return;
				405	}
				406
				407	if (conf != 100) {
				408	errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
				409	}
				410	}
				411
				412	void CharsetDetectionTest::InputFilterTest()
				413	{
				414	UErrorCode status = U_ZERO_ERROR;
				415	UnicodeString s(u"<a> <lot> <of> <English> <inside> <the> <markup> Un très petit peu de Français. <to> <confuse> <the> <detector>");
				416	int32_t byteLength = 0;
				417	char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
				418	UCharsetDetector *csd = ucsdet_open(&status);
				419	const UCharsetMatch *match;
				420	const char lang, name;
				421
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	422	ucsdet_enableInputFilter(csd, true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	423
				424	if (!ucsdet_isInputFilterEnabled(csd)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	425	errln("ucsdet_enableInputFilter(csd, true) did not enable input filter!");
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	426	}
				427
				428
				429	ucsdet_setText(csd, bytes, byteLength, &status);
				430	match = ucsdet_detect(csd, &status);
				431
				432	if (match == NULL) {
				433	errln("Turning on the input filter resulted in no matches.");
				434	goto turn_off;
				435	}
				436
				437	name = ucsdet_getName(match, &status);
				438
				439	if (name == NULL \|\| strcmp(name, "ISO-8859-1") != 0) {
				440	errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
				441	} else {
				442	lang = ucsdet_getLanguage(match, &status);
				443
				444	if (lang == NULL \|\| strcmp(lang, "fr") != 0) {
				445	errln("Input filter did not strip markup!");
				446	}
				447	}
				448
				449	turn_off:
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	450	ucsdet_enableInputFilter(csd, false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	451	ucsdet_setText(csd, bytes, byteLength, &status);
				452	match = ucsdet_detect(csd, &status);
				453
				454	if (match == NULL) {
				455	errln("Turning off the input filter resulted in no matches.");
				456	goto bail;
				457	}
				458
				459	name = ucsdet_getName(match, &status);
				460
				461	if (name == NULL \|\| strcmp(name, "ISO-8859-1") != 0) {
				462	errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
				463	} else {
				464	lang = ucsdet_getLanguage(match, &status);
				465
				466	if (lang == NULL \|\| strcmp(lang, "en") != 0) {
				467	errln("Unfiltered input did not detect as English!");
				468	}
				469	}
				470
				471	bail:
				472	delete[] bytes;
				473	ucsdet_close(csd);
				474	}
				475
				476	void CharsetDetectionTest::C1BytesTest()
				477	{
				478	#if !UCONFIG_NO_LEGACY_CONVERSION
				479	UErrorCode status = U_ZERO_ERROR;
				480	UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
				481	UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
				482	UnicodeString sWindows = ssWindows.unescape();
				483	int32_t lISO = 0, lWindows = 0;
				484	char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
				485	char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
				486	UCharsetDetector *csd = ucsdet_open(&status);
				487	const UCharsetMatch *match;
				488	const char *name;
				489
				490	ucsdet_setText(csd, bWindows, lWindows, &status);
				491	match = ucsdet_detect(csd, &status);
				492
				493	if (match == NULL) {
				494	errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
				495	goto bail;
				496	}
				497
				498	name = ucsdet_getName(match, &status);
				499
				500	if (strcmp(name, "windows-1252") != 0) {
				501	errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
				502	}
				503
				504	ucsdet_setText(csd, bISO, lISO, &status);
				505	match = ucsdet_detect(csd, &status);
				506
				507	if (match == NULL) {
				508	errln("English text without C1 bytes got no matches.");
				509	goto bail;
				510	}
				511
				512	name = ucsdet_getName(match, &status);
				513
				514	if (strcmp(name, "ISO-8859-1") != 0) {
				515	errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
				516	}
				517
				518	bail:
				519	delete[] bWindows;
				520	delete[] bISO;
				521
				522	ucsdet_close(csd);
				523	#endif
				524	}
				525
				526	void CharsetDetectionTest::DetectionTest()
				527	{
				528	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
				529	UErrorCode status = U_ZERO_ERROR;
				530	char path[2048];
				531	const char *testFilePath = getPath(path, "csdetest.xml");
				532
				533	if (testFilePath == NULL) {
				534	return; /* Couldn't get path: error message already output. */
				535	}
				536
				537	UXMLParser *parser = UXMLParser::createParser(status);
				538	if (U_FAILURE(status)) {
				539	dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
				540	return;
				541	}
				542
				543	UXMLElement *root = parser->parseFile(testFilePath, status);
				544	if (!assertSuccess( "parseFile",status)) return;
				545
				546	UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
				547	UnicodeString id_attr = UNICODE_STRING_SIMPLE("id");
				548	UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings");
				549
				550	const UXMLElement *testCase;
				551	int32_t tc = 0;
				552
				553	while((testCase = root->nextChildElement(tc)) != NULL) {
				554	if (testCase->getTagName().compare(test_case) == 0) {
				555	const UnicodeString *id = testCase->getAttribute(id_attr);
				556	const UnicodeString *encodings = testCase->getAttribute(enc_attr);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	557	const UnicodeString text = testCase->getText(true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	558	int32_t encodingCount;
				559	UnicodeString encodingList = split(encodings, CH_SPACE, encodingCount);
				560
				561	for(int32_t e = 0; e < encodingCount; e += 1) {
				562	checkEncoding(text, encodingList[e], *id);
				563	}
				564
				565	delete[] encodingList;
				566	}
				567	}
				568
				569	delete root;
				570	delete parser;
				571	#endif
				572	}
				573
				574	void CharsetDetectionTest::IBM424Test()
				575	{
				576	#if !UCONFIG_ONLY_HTML_CONVERSION
				577	UErrorCode status = U_ZERO_ERROR;
				578
				579	static const UChar chars[] = {
				580	0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
				581	0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
				582	0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
				583	0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
				584	0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
				585	0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
				586	0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
				587	0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
				588	0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
				589	0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
				590	0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
				591	0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
				592	0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
				593	0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
				594	0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
				595	0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
				596	0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
				597	};
				598
				599	static const UChar chars_reverse[] = {
				600	0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
				601	0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
				602	0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
				603	0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
				604	0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
				605	0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
				606	0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
				607	0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
				608	0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
				609	0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
				610	0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
				611	0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
				612	0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
				613	0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
				614	0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
				615	0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
				616	0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
				617	0x0000
				618	};
				619
				620	int32_t bLength = 0, brLength = 0;
				621
				622	UnicodeString s1(chars);
				623	UnicodeString s2(chars_reverse);
				624
				625	char *bytes = extractBytes(s1, "IBM424", bLength);
				626	char *bytes_r = extractBytes(s2, "IBM424", brLength);
				627
				628	UCharsetDetector *csd = ucsdet_open(&status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	629	ucsdet_setDetectableCharset(csd, "IBM424_rtl", true, &status);
				630	ucsdet_setDetectableCharset(csd, "IBM424_ltr", true, &status);
				631	ucsdet_setDetectableCharset(csd, "IBM420_rtl", true, &status);
				632	ucsdet_setDetectableCharset(csd, "IBM420_ltr", true, &status);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	633	if (U_FAILURE(status)) {
				634	errln("Error opening charset detector. - %s", u_errorName(status));
				635	}
				636	const UCharsetMatch *match;
				637	const char *name;
				638
				639	ucsdet_setText(csd, bytes, bLength, &status);
				640	match = ucsdet_detect(csd, &status);
				641
				642	if (match == NULL) {
				643	errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
				644	goto bail;
				645	}
				646
				647	name = ucsdet_getName(match, &status);
				648	if (strcmp(name, "IBM424_rtl") != 0) {
				649	errln("Encoding detection failure for IBM424_rtl: got %s", name);
				650	}
				651
				652	ucsdet_setText(csd, bytes_r, brLength, &status);
				653	match = ucsdet_detect(csd, &status);
				654
				655	if (match == NULL) {
				656	errln("Encoding detection failure for IBM424_ltr: got no matches.");
				657	goto bail;
				658	}
				659
				660	name = ucsdet_getName(match, &status);
				661	if (strcmp(name, "IBM424_ltr") != 0) {
				662	errln("Encoding detection failure for IBM424_ltr: got %s", name);
				663	}
				664
				665	bail:
				666	delete[] bytes;
				667	delete[] bytes_r;
				668	ucsdet_close(csd);
				669	#endif
				670	}
				671
				672	void CharsetDetectionTest::IBM420Test()
				673	{
				674	#if !UCONFIG_ONLY_HTML_CONVERSION
				675	UErrorCode status = U_ZERO_ERROR;
				676
				677	static const UChar chars[] = {
				678	0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
				679	0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
				680	0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
				681	0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
				682	0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
				683	0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
				684	0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
				685	0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
				686	0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
				687	0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
				688	0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
				689	0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
				690	0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
				691	0x0000
				692	};
				693	static const UChar chars_reverse[] = {
				694	0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
				695	0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
				696	0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
				697	0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
				698	0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
				699	0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
				700	0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
				701	0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
				702	0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
				703	0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
				704	0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
				705	0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
				706	0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
				707	0x0000,
				708	};
				709
				710	int32_t bLength = 0, brLength = 0;
				711
				712	UnicodeString s1(chars);
				713	UnicodeString s2(chars_reverse);
				714
				715	char *bytes = extractBytes(s1, "IBM420", bLength);
				716	char *bytes_r = extractBytes(s2, "IBM420", brLength);
				717
				718	UCharsetDetector *csd = ucsdet_open(&status);
				719	if (U_FAILURE(status)) {
				720	errln("Error opening charset detector. - %s", u_errorName(status));
				721	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	722	ucsdet_setDetectableCharset(csd, "IBM424_rtl", true, &status);
				723	ucsdet_setDetectableCharset(csd, "IBM424_ltr", true, &status);
				724	ucsdet_setDetectableCharset(csd, "IBM420_rtl", true, &status);
				725	ucsdet_setDetectableCharset(csd, "IBM420_ltr", true, &status);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	726	const UCharsetMatch *match;
				727	const char *name;
				728
				729	ucsdet_setText(csd, bytes, bLength, &status);
				730	match = ucsdet_detect(csd, &status);
				731
				732	if (match == NULL) {
				733	errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
				734	goto bail;
				735	}
				736
				737	name = ucsdet_getName(match, &status);
				738	if (strcmp(name, "IBM420_rtl") != 0) {
				739	errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
				740	}
				741
				742	ucsdet_setText(csd, bytes_r, brLength, &status);
				743	match = ucsdet_detect(csd, &status);
				744
				745	if (match == NULL) {
				746	errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
				747	goto bail;
				748	}
				749
				750	name = ucsdet_getName(match, &status);
				751	if (strcmp(name, "IBM420_ltr") != 0) {
				752	errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
				753	}
				754
				755	bail:
				756	delete[] bytes;
				757	delete[] bytes_r;
				758	ucsdet_close(csd);
				759	#endif
				760	}
				761
				762
				763	void CharsetDetectionTest::Ticket6394Test() {
				764	#if !UCONFIG_NO_CONVERSION
				765	const char charText[] = "Here is some random English text that should be detected as ISO-8859-1."
				766	"Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
				767	"encodings more than once. The hop through UnicodeString is for platforms "
				768	"where this char * string is be EBCDIC and needs conversion to Latin1.";
				769	char latin1Text[sizeof(charText)];
				770	UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
				771
				772	UErrorCode status = U_ZERO_ERROR;
				773	UCharsetDetector *csd = ucsdet_open(&status);
				774	ucsdet_setText(csd, latin1Text, -1, &status);
				775	if (U_FAILURE(status)) {
				776	errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
				777	return;
				778	}
				779
				780	int32_t matchCount = 0;
				781	const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
				782	if (U_FAILURE(status)) {
				783	errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
				784	return;
				785	}
				786
				787	UnicodeSet setOfCharsetNames; // UnicodeSets can hold strings.
				788	int32_t i;
				789	for (i=0; i<matchCount; i++) {
				790	UnicodeString charSetName(ucsdet_getName(matches[i], &status));
				791	if (U_FAILURE(status)) {
				792	errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __LINE__, u_errorName(status), i);
				793	status = U_ZERO_ERROR;
				794	}
				795	if (setOfCharsetNames.contains(charSetName)) {
				796	errln("Fail at file %s, line %d ", __FILE__, __LINE__);
				797	errln(UnicodeString(" Duplicate charset name = ") + charSetName);
				798	}
				799	setOfCharsetNames.add(charSetName);
				800	}
				801	ucsdet_close(csd);
				802	#endif
				803	}
				804
				805
				806	// Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
				807	// similar Windows and non-Windows SBCS encodings. State was kept in the shared
				808	// Charset Recognizer objects, and could be overwritten.
				809	void CharsetDetectionTest::Ticket6954Test() {
				810	#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FORMATTING
				811	UErrorCode status = U_ZERO_ERROR;
				812	UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
				813	UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
				814	"It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
				815	UnicodeString sWindows = ssWindows.unescape();
				816	int32_t lISO = 0, lWindows = 0;
				817	std::unique_ptr<char[]> bISO(extractBytes(sISO, "ISO-8859-1", lISO));
				818	std::unique_ptr<char[]> bWindows(extractBytes(sWindows, "windows-1252", lWindows));
				819
				820	// First do a plain vanilla detect of 1252 text
				821
				822	LocalUCharsetDetectorPointer csd1(ucsdet_open(&status));
				823	ucsdet_setText(csd1.getAlias(), bWindows.get(), lWindows, &status);
				824	const UCharsetMatch *match1 = ucsdet_detect(csd1.getAlias(), &status);
				825	const char *name1 = ucsdet_getName(match1, &status);
				826	TEST_ASSERT_SUCCESS(status);
				827	TEST_ASSERT(strcmp(name1, "windows-1252")==0);
				828
				829	// Next, using a completely separate detector, detect some 8859-1 text
				830
				831	LocalUCharsetDetectorPointer csd2(ucsdet_open(&status));
				832	ucsdet_setText(csd2.getAlias(), bISO.get(), lISO, &status);
				833	const UCharsetMatch *match2 = ucsdet_detect(csd2.getAlias(), &status);
				834	const char *name2 = ucsdet_getName(match2, &status);
				835	TEST_ASSERT_SUCCESS(status);
				836	TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
				837
				838	// Recheck the 1252 results from the first detector, which should not have been
				839	// altered by the use of a different detector.
				840
				841	name1 = ucsdet_getName(match1, &status);
				842	TEST_ASSERT_SUCCESS(status);
				843	TEST_ASSERT(strcmp(name1, "windows-1252")==0);
				844	#endif
				845	}
Frank Tang	d2858cb	2022-04-08 20:34:12 -0700	[diff] [blame]	846
				847
				848	// Ticket 21823 - Issue with Charset Detector for ill-formed input strings.
				849	// Its fix involves returning a failure based error code
				850	// (U_INVALID_CHAR_FOUND) incase no charsets appear to match the input data.
				851	void CharsetDetectionTest::Ticket21823Test() {
				852	UErrorCode status = U_ZERO_ERROR;
				853	std::string str = "\x80";
				854	UCharsetDetector* csd = ucsdet_open(&status);
				855
				856	ucsdet_setText(csd, str.data(), str.length(), &status);
				857	const UCharsetMatch* match = ucsdet_detect(csd, &status);
				858
				859	if (match == NULL) {
				860	TEST_ASSERT(U_FAILURE(status));
				861	}
				862
				863	ucsdet_close(csd);
				864	}