Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | /* |
| 4 | ********************************************************************** |
| 5 | * Copyright (C) 2005-2016, International Business Machines |
| 6 | * Corporation and others. All Rights Reserved. |
| 7 | ********************************************************************** |
| 8 | */ |
| 9 | |
| 10 | |
| 11 | #include "unicode/utypes.h" |
| 12 | #include "unicode/ucsdet.h" |
| 13 | #include "unicode/ucnv.h" |
| 14 | #include "unicode/unistr.h" |
| 15 | #include "unicode/putil.h" |
| 16 | #include "unicode/uniset.h" |
| 17 | |
| 18 | #include "intltest.h" |
| 19 | #include "csdetest.h" |
| 20 | |
| 21 | #include "xmlparser.h" |
| 22 | |
| 23 | #include <memory> |
| 24 | #include <stdlib.h> |
| 25 | #include <string.h> |
| 26 | |
| 27 | #ifdef DEBUG_DETECT |
| 28 | #include <stdio.h> |
| 29 | #endif |
| 30 | |
| 31 | |
| 32 | #define CH_SPACE 0x0020 |
| 33 | #define CH_SLASH 0x002F |
| 34 | |
// Reports a failure, tagged with the current file and line, when the
// condition `x` is false. The enclosing function keeps running afterwards.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", \
              __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
| 40 | |
// Reports a failure, tagged with the current file, line and ICU error name,
// when `errcode` holds a failure status, and then returns from the enclosing
// function. Only usable inside functions that return void.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, \
                   "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END
| 47 | |
| 48 | |
| 49 | //--------------------------------------------------------------------------- |
| 50 | // |
| 51 | // Test class boilerplate |
| 52 | // |
| 53 | //--------------------------------------------------------------------------- |
| 54 | CharsetDetectionTest::CharsetDetectionTest() |
| 55 | { |
| 56 | } |
| 57 | |
| 58 | |
| 59 | CharsetDetectionTest::~CharsetDetectionTest() |
| 60 | { |
| 61 | } |
| 62 | |
| 63 | |
| 64 | |
| 65 | void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) |
| 66 | { |
| 67 | if (exec) logln("TestSuite CharsetDetectionTest: "); |
| 68 | switch (index) { |
| 69 | case 0: name = "ConstructionTest"; |
| 70 | if (exec) ConstructionTest(); |
| 71 | break; |
| 72 | |
| 73 | case 1: name = "UTF8Test"; |
| 74 | if (exec) UTF8Test(); |
| 75 | break; |
| 76 | |
| 77 | case 2: name = "UTF16Test"; |
| 78 | if (exec) UTF16Test(); |
| 79 | break; |
| 80 | |
| 81 | case 3: name = "C1BytesTest"; |
| 82 | if (exec) C1BytesTest(); |
| 83 | break; |
| 84 | |
| 85 | case 4: name = "InputFilterTest"; |
| 86 | if (exec) InputFilterTest(); |
| 87 | break; |
| 88 | |
| 89 | case 5: name = "DetectionTest"; |
| 90 | if (exec) DetectionTest(); |
| 91 | break; |
| 92 | #if !UCONFIG_NO_LEGACY_CONVERSION |
| 93 | case 6: name = "IBM424Test"; |
| 94 | if (exec) IBM424Test(); |
| 95 | break; |
| 96 | |
| 97 | case 7: name = "IBM420Test"; |
| 98 | if (exec) IBM420Test(); |
| 99 | break; |
| 100 | #else |
| 101 | case 6: |
| 102 | case 7: name = "skip"; break; |
| 103 | #endif |
| 104 | case 8: name = "Ticket6394Test"; |
| 105 | if (exec) Ticket6394Test(); |
| 106 | break; |
| 107 | |
| 108 | case 9: name = "Ticket6954Test"; |
| 109 | if (exec) Ticket6954Test(); |
| 110 | break; |
| 111 | |
Frank Tang | d2858cb | 2022-04-08 20:34:12 -0700 | [diff] [blame] | 112 | case 10: name = "Ticket21823Test"; |
| 113 | if (exec) Ticket21823Test(); |
| 114 | break; |
| 115 | |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 116 | default: name = ""; |
| 117 | break; //needed to end loop |
| 118 | } |
| 119 | } |
| 120 | |
| 121 | static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits) |
| 122 | { |
| 123 | int32_t offset = -1; |
| 124 | |
| 125 | splits = 1; |
| 126 | while((offset = src.indexOf(ch, offset + 1)) >= 0) { |
| 127 | splits += 1; |
| 128 | } |
| 129 | |
| 130 | UnicodeString *result = new UnicodeString[splits]; |
| 131 | |
| 132 | int32_t start = 0; |
| 133 | int32_t split = 0; |
| 134 | int32_t end; |
| 135 | |
| 136 | while((end = src.indexOf(ch, start)) >= 0) { |
| 137 | src.extractBetween(start, end, result[split++]); |
| 138 | start = end + 1; |
| 139 | } |
| 140 | |
| 141 | src.extractBetween(start, src.length(), result[split]); |
| 142 | |
| 143 | return result; |
| 144 | } |
| 145 | |
| 146 | static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length) |
| 147 | { |
| 148 | int32_t sLength = source.length(); |
| 149 | char *bytes = NULL; |
| 150 | |
| 151 | length = source.extract(0, sLength, NULL, codepage); |
| 152 | |
| 153 | if (length > 0) { |
| 154 | bytes = new char[length + 1]; |
| 155 | source.extract(0, sLength, bytes, codepage); |
| 156 | } |
| 157 | |
| 158 | return bytes; |
| 159 | } |
| 160 | |
| 161 | void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id) |
| 162 | { |
| 163 | int32_t splits = 0; |
| 164 | int32_t testLength = testString.length(); |
| 165 | std::unique_ptr<UnicodeString []> eSplit(split(encoding, CH_SLASH, splits)); |
| 166 | UErrorCode status = U_ZERO_ERROR; |
| 167 | int32_t cpLength = eSplit[0].length(); |
| 168 | char codepage[64]; |
| 169 | |
| 170 | u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength); |
| 171 | codepage[cpLength] = '\0'; |
| 172 | |
| 173 | LocalUCharsetDetectorPointer csd(ucsdet_open(&status)); |
| 174 | |
| 175 | int32_t byteLength = 0; |
| 176 | std::unique_ptr<char []> bytes(extractBytes(testString, codepage, byteLength)); |
| 177 | |
| 178 | if (! bytes) { |
| 179 | #if !UCONFIG_NO_LEGACY_CONVERSION |
| 180 | dataerrln("Can't open a " + encoding + " converter for " + id); |
| 181 | #endif |
| 182 | return; |
| 183 | } |
| 184 | |
| 185 | ucsdet_setText(csd.getAlias(), bytes.get(), byteLength, &status); |
| 186 | |
| 187 | int32_t matchCount = 0; |
| 188 | const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status); |
| 189 | |
| 190 | |
| 191 | UnicodeString name(ucsdet_getName(matches[0], &status)); |
| 192 | UnicodeString lang(ucsdet_getLanguage(matches[0], &status)); |
| 193 | UChar *decoded = NULL; |
| 194 | int32_t dLength = 0; |
| 195 | |
| 196 | if (matchCount == 0) { |
| 197 | errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches"); |
| 198 | return; |
| 199 | } |
| 200 | |
| 201 | if (name.compare(eSplit[0]) != 0) { |
| 202 | errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name); |
| 203 | |
| 204 | #ifdef DEBUG_DETECT |
| 205 | for (int32_t m = 0; m < matchCount; m += 1) { |
| 206 | const char *name = ucsdet_getName(matches[m], &status); |
| 207 | const char *lang = ucsdet_getLanguage(matches[m], &status); |
| 208 | int32_t confidence = ucsdet_getConfidence(matches[m], &status); |
| 209 | |
| 210 | printf("%s (%s) %d\n", name, lang, confidence); |
| 211 | } |
| 212 | #endif |
| 213 | return; |
| 214 | } |
| 215 | |
| 216 | if (splits > 1 && lang.compare(eSplit[1]) != 0) { |
| 217 | errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang); |
| 218 | return; |
| 219 | } |
| 220 | |
| 221 | decoded = new UChar[testLength]; |
| 222 | dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status); |
| 223 | |
| 224 | if (testString.compare(decoded, dLength) != 0) { |
| 225 | errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yield the original string."); |
| 226 | |
| 227 | #ifdef DEBUG_DETECT |
| 228 | for(int32_t i = 0; i < testLength; i += 1) { |
| 229 | if(testString[i] != decoded[i]) { |
| 230 | printf("Strings differ at byte %d\n", i); |
| 231 | break; |
| 232 | } |
| 233 | } |
| 234 | #endif |
| 235 | |
| 236 | } |
| 237 | |
| 238 | delete[] decoded; |
| 239 | } |
| 240 | |
| 241 | const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) { |
| 242 | UErrorCode status = U_ZERO_ERROR; |
| 243 | const char *testDataDirectory = IntlTest::getSourceTestData(status); |
| 244 | |
| 245 | if (U_FAILURE(status)) { |
| 246 | errln("ERROR: getPath() failed - %s", u_errorName(status)); |
| 247 | return NULL; |
| 248 | } |
| 249 | |
| 250 | strcpy(buffer, testDataDirectory); |
| 251 | strcat(buffer, filename); |
| 252 | return buffer; |
| 253 | } |
| 254 | |
| 255 | void CharsetDetectionTest::ConstructionTest() |
| 256 | { |
| 257 | IcuTestErrorCode status(*this, "ConstructionTest"); |
| 258 | LocalUCharsetDetectorPointer csd(ucsdet_open(status)); |
| 259 | LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status)); |
| 260 | int32_t count = uenum_count(e.getAlias(), status); |
| 261 | |
| 262 | #ifdef DEBUG_DETECT |
| 263 | printf("There are %d recognizers.\n", count); |
| 264 | #endif |
| 265 | |
| 266 | for(int32_t i = 0; i < count; i += 1) { |
| 267 | int32_t length; |
| 268 | const char *name = uenum_next(e.getAlias(), &length, status); |
| 269 | |
| 270 | if(name == NULL || length <= 0) { |
| 271 | errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!"); |
| 272 | } |
| 273 | |
| 274 | #ifdef DEBUG_DETECT |
| 275 | printf("%s\n", name); |
| 276 | #endif |
| 277 | } |
| 278 | |
| 279 | const char* defDisabled[] = { |
| 280 | "IBM420_rtl", "IBM420_ltr", |
| 281 | "IBM424_rtl", "IBM424_ltr", |
| 282 | 0 |
| 283 | }; |
| 284 | |
| 285 | LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status)); |
| 286 | const char *activeName = NULL; |
| 287 | |
| 288 | while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) { |
| 289 | // the charset must be included in all list |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 290 | UBool found = false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 291 | |
| 292 | const char *name = NULL; |
| 293 | uenum_reset(e.getAlias(), status); |
| 294 | while ((name = uenum_next(e.getAlias(), NULL, status))) { |
| 295 | if (strcmp(activeName, name) == 0) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 296 | found = true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 297 | break; |
| 298 | } |
| 299 | } |
| 300 | |
| 301 | if (!found) { |
| 302 | errln(UnicodeString(activeName) + " is not included in the all charset list."); |
| 303 | } |
| 304 | |
| 305 | // some charsets are disabled by default |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 306 | found = false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 307 | for (int32_t i = 0; defDisabled[i] != 0; i++) { |
| 308 | if (strcmp(activeName, defDisabled[i]) == 0) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 309 | found = true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 310 | break; |
| 311 | } |
| 312 | } |
| 313 | if (found) { |
| 314 | errln(UnicodeString(activeName) + " should not be included in the default charset list."); |
| 315 | } |
| 316 | } |
| 317 | } |
| 318 | |
| 319 | void CharsetDetectionTest::UTF8Test() |
| 320 | { |
| 321 | UErrorCode status = U_ZERO_ERROR; |
| 322 | UnicodeString ss = "This is a string with some non-ascii characters that will " |
| 323 | "be converted to UTF-8, then shoved through the detection process. " |
| 324 | "\\u0391\\u0392\\u0393\\u0394\\u0395" |
| 325 | "Sure would be nice if our source could contain Unicode directly!"; |
| 326 | UnicodeString s = ss.unescape(); |
| 327 | int32_t byteLength = 0, sLength = s.length(); |
| 328 | char *bytes = extractBytes(s, "UTF-8", byteLength); |
| 329 | UCharsetDetector *csd = ucsdet_open(&status); |
| 330 | const UCharsetMatch *match; |
| 331 | UChar *detected = new UChar[sLength]; |
| 332 | |
| 333 | ucsdet_setText(csd, bytes, byteLength, &status); |
| 334 | match = ucsdet_detect(csd, &status); |
| 335 | |
| 336 | if (match == NULL) { |
| 337 | errln("Detection failure for UTF-8: got no matches."); |
| 338 | goto bail; |
| 339 | } |
| 340 | |
| 341 | ucsdet_getUChars(match, detected, sLength, &status); |
| 342 | |
| 343 | if (s.compare(detected, sLength) != 0) { |
| 344 | errln("Round-trip test failed!"); |
| 345 | } |
| 346 | |
| 347 | ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ |
| 348 | |
| 349 | bail: |
| 350 | delete[] detected; |
| 351 | delete[] bytes; |
| 352 | ucsdet_close(csd); |
| 353 | } |
| 354 | |
| 355 | void CharsetDetectionTest::UTF16Test() |
| 356 | { |
| 357 | UErrorCode status = U_ZERO_ERROR; |
| 358 | /* Notice the BOM on the start of this string */ |
| 359 | UChar chars[] = { |
| 360 | 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, |
| 361 | 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, |
| 362 | 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, |
| 363 | 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, |
| 364 | 0x064a, 0x062a, 0x0000}; |
| 365 | UnicodeString s(chars); |
| 366 | int32_t beLength = 0, leLength = 0; |
| 367 | std::unique_ptr<char []>beBytes(extractBytes(s, "UTF-16BE", beLength)); |
| 368 | std::unique_ptr<char []>leBytes(extractBytes(s, "UTF-16LE", leLength)); |
| 369 | LocalUCharsetDetectorPointer csd(ucsdet_open(&status)); |
| 370 | const UCharsetMatch *match; |
| 371 | const char *name; |
| 372 | int32_t conf; |
| 373 | |
| 374 | ucsdet_setText(csd.getAlias(), beBytes.get(), beLength, &status); |
| 375 | match = ucsdet_detect(csd.getAlias(), &status); |
| 376 | |
| 377 | if (match == NULL) { |
| 378 | errln("Encoding detection failure for UTF-16BE: got no matches."); |
| 379 | } else { |
| 380 | |
| 381 | name = ucsdet_getName(match, &status); |
| 382 | conf = ucsdet_getConfidence(match, &status); |
| 383 | |
| 384 | if (strcmp(name, "UTF-16BE") != 0) { |
| 385 | errln("Encoding detection failure for UTF-16BE: got %s", name); |
| 386 | } else if (conf != 100) { |
| 387 | errln("Did not get 100%% confidence for UTF-16BE: got %d", conf); |
| 388 | } |
| 389 | } |
| 390 | |
| 391 | ucsdet_setText(csd.getAlias(), leBytes.get(), leLength, &status); |
| 392 | match = ucsdet_detect(csd.getAlias(), &status); |
| 393 | |
| 394 | if (match == NULL) { |
| 395 | errln("Encoding detection failure for UTF-16LE: got no matches."); |
| 396 | return; |
| 397 | } |
| 398 | |
| 399 | name = ucsdet_getName(match, &status); |
| 400 | conf = ucsdet_getConfidence(match, &status); |
| 401 | |
| 402 | if (strcmp(name, "UTF-16LE") != 0) { |
| 403 | errln("Encoding detection failure for UTF-16LE: got %s", name); |
| 404 | return; |
| 405 | } |
| 406 | |
| 407 | if (conf != 100) { |
| 408 | errln("Did not get 100%% confidence for UTF-16LE: got %d", conf); |
| 409 | } |
| 410 | } |
| 411 | |
| 412 | void CharsetDetectionTest::InputFilterTest() |
| 413 | { |
| 414 | UErrorCode status = U_ZERO_ERROR; |
| 415 | UnicodeString s(u"<a> <lot> <of> <English> <inside> <the> <markup> Un très petit peu de Français. <to> <confuse> <the> <detector>"); |
| 416 | int32_t byteLength = 0; |
| 417 | char *bytes = extractBytes(s, "ISO-8859-1", byteLength); |
| 418 | UCharsetDetector *csd = ucsdet_open(&status); |
| 419 | const UCharsetMatch *match; |
| 420 | const char *lang, *name; |
| 421 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 422 | ucsdet_enableInputFilter(csd, true); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 423 | |
| 424 | if (!ucsdet_isInputFilterEnabled(csd)) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 425 | errln("ucsdet_enableInputFilter(csd, true) did not enable input filter!"); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 426 | } |
| 427 | |
| 428 | |
| 429 | ucsdet_setText(csd, bytes, byteLength, &status); |
| 430 | match = ucsdet_detect(csd, &status); |
| 431 | |
| 432 | if (match == NULL) { |
| 433 | errln("Turning on the input filter resulted in no matches."); |
| 434 | goto turn_off; |
| 435 | } |
| 436 | |
| 437 | name = ucsdet_getName(match, &status); |
| 438 | |
| 439 | if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { |
| 440 | errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name); |
| 441 | } else { |
| 442 | lang = ucsdet_getLanguage(match, &status); |
| 443 | |
| 444 | if (lang == NULL || strcmp(lang, "fr") != 0) { |
| 445 | errln("Input filter did not strip markup!"); |
| 446 | } |
| 447 | } |
| 448 | |
| 449 | turn_off: |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 450 | ucsdet_enableInputFilter(csd, false); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 451 | ucsdet_setText(csd, bytes, byteLength, &status); |
| 452 | match = ucsdet_detect(csd, &status); |
| 453 | |
| 454 | if (match == NULL) { |
| 455 | errln("Turning off the input filter resulted in no matches."); |
| 456 | goto bail; |
| 457 | } |
| 458 | |
| 459 | name = ucsdet_getName(match, &status); |
| 460 | |
| 461 | if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { |
| 462 | errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name); |
| 463 | } else { |
| 464 | lang = ucsdet_getLanguage(match, &status); |
| 465 | |
| 466 | if (lang == NULL || strcmp(lang, "en") != 0) { |
| 467 | errln("Unfiltered input did not detect as English!"); |
| 468 | } |
| 469 | } |
| 470 | |
| 471 | bail: |
| 472 | delete[] bytes; |
| 473 | ucsdet_close(csd); |
| 474 | } |
| 475 | |
| 476 | void CharsetDetectionTest::C1BytesTest() |
| 477 | { |
| 478 | #if !UCONFIG_NO_LEGACY_CONVERSION |
| 479 | UErrorCode status = U_ZERO_ERROR; |
| 480 | UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; |
| 481 | UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV); |
| 482 | UnicodeString sWindows = ssWindows.unescape(); |
| 483 | int32_t lISO = 0, lWindows = 0; |
| 484 | char *bISO = extractBytes(sISO, "ISO-8859-1", lISO); |
| 485 | char *bWindows = extractBytes(sWindows, "windows-1252", lWindows); |
| 486 | UCharsetDetector *csd = ucsdet_open(&status); |
| 487 | const UCharsetMatch *match; |
| 488 | const char *name; |
| 489 | |
| 490 | ucsdet_setText(csd, bWindows, lWindows, &status); |
| 491 | match = ucsdet_detect(csd, &status); |
| 492 | |
| 493 | if (match == NULL) { |
| 494 | errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status)); |
| 495 | goto bail; |
| 496 | } |
| 497 | |
| 498 | name = ucsdet_getName(match, &status); |
| 499 | |
| 500 | if (strcmp(name, "windows-1252") != 0) { |
| 501 | errln("English text with C1 bytes does not detect as windows-1252, but as %s", name); |
| 502 | } |
| 503 | |
| 504 | ucsdet_setText(csd, bISO, lISO, &status); |
| 505 | match = ucsdet_detect(csd, &status); |
| 506 | |
| 507 | if (match == NULL) { |
| 508 | errln("English text without C1 bytes got no matches."); |
| 509 | goto bail; |
| 510 | } |
| 511 | |
| 512 | name = ucsdet_getName(match, &status); |
| 513 | |
| 514 | if (strcmp(name, "ISO-8859-1") != 0) { |
| 515 | errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name); |
| 516 | } |
| 517 | |
| 518 | bail: |
| 519 | delete[] bWindows; |
| 520 | delete[] bISO; |
| 521 | |
| 522 | ucsdet_close(csd); |
| 523 | #endif |
| 524 | } |
| 525 | |
| 526 | void CharsetDetectionTest::DetectionTest() |
| 527 | { |
| 528 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 529 | UErrorCode status = U_ZERO_ERROR; |
| 530 | char path[2048]; |
| 531 | const char *testFilePath = getPath(path, "csdetest.xml"); |
| 532 | |
| 533 | if (testFilePath == NULL) { |
| 534 | return; /* Couldn't get path: error message already output. */ |
| 535 | } |
| 536 | |
| 537 | UXMLParser *parser = UXMLParser::createParser(status); |
| 538 | if (U_FAILURE(status)) { |
| 539 | dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status)); |
| 540 | return; |
| 541 | } |
| 542 | |
| 543 | UXMLElement *root = parser->parseFile(testFilePath, status); |
| 544 | if (!assertSuccess( "parseFile",status)) return; |
| 545 | |
| 546 | UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case"); |
| 547 | UnicodeString id_attr = UNICODE_STRING_SIMPLE("id"); |
| 548 | UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings"); |
| 549 | |
| 550 | const UXMLElement *testCase; |
| 551 | int32_t tc = 0; |
| 552 | |
| 553 | while((testCase = root->nextChildElement(tc)) != NULL) { |
| 554 | if (testCase->getTagName().compare(test_case) == 0) { |
| 555 | const UnicodeString *id = testCase->getAttribute(id_attr); |
| 556 | const UnicodeString *encodings = testCase->getAttribute(enc_attr); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 557 | const UnicodeString text = testCase->getText(true); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 558 | int32_t encodingCount; |
| 559 | UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount); |
| 560 | |
| 561 | for(int32_t e = 0; e < encodingCount; e += 1) { |
| 562 | checkEncoding(text, encodingList[e], *id); |
| 563 | } |
| 564 | |
| 565 | delete[] encodingList; |
| 566 | } |
| 567 | } |
| 568 | |
| 569 | delete root; |
| 570 | delete parser; |
| 571 | #endif |
| 572 | } |
| 573 | |
| 574 | void CharsetDetectionTest::IBM424Test() |
| 575 | { |
| 576 | #if !UCONFIG_ONLY_HTML_CONVERSION |
| 577 | UErrorCode status = U_ZERO_ERROR; |
| 578 | |
| 579 | static const UChar chars[] = { |
| 580 | 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8, |
| 581 | 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9, |
| 582 | 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8, |
| 583 | 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA, |
| 584 | 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5, |
| 585 | 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE, |
| 586 | 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, |
| 587 | 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC, |
| 588 | 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3, |
| 589 | 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020, |
| 590 | 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC, |
| 591 | 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, |
| 592 | 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, |
| 593 | 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, |
| 594 | 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC, |
| 595 | 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, |
| 596 | 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 |
| 597 | }; |
| 598 | |
| 599 | static const UChar chars_reverse[] = { |
| 600 | 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA, |
| 601 | 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8, |
| 602 | 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1, |
| 603 | 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, |
| 604 | 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9, |
| 605 | 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4, |
| 606 | 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9, |
| 607 | 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5, |
| 608 | 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3, |
| 609 | 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020, |
| 610 | 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, |
| 611 | 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9, |
| 612 | 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020, |
| 613 | 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4, |
| 614 | 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7, |
| 615 | 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0, |
| 616 | 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4, |
| 617 | 0x0000 |
| 618 | }; |
| 619 | |
| 620 | int32_t bLength = 0, brLength = 0; |
| 621 | |
| 622 | UnicodeString s1(chars); |
| 623 | UnicodeString s2(chars_reverse); |
| 624 | |
| 625 | char *bytes = extractBytes(s1, "IBM424", bLength); |
| 626 | char *bytes_r = extractBytes(s2, "IBM424", brLength); |
| 627 | |
| 628 | UCharsetDetector *csd = ucsdet_open(&status); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 629 | ucsdet_setDetectableCharset(csd, "IBM424_rtl", true, &status); |
| 630 | ucsdet_setDetectableCharset(csd, "IBM424_ltr", true, &status); |
| 631 | ucsdet_setDetectableCharset(csd, "IBM420_rtl", true, &status); |
| 632 | ucsdet_setDetectableCharset(csd, "IBM420_ltr", true, &status); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 633 | if (U_FAILURE(status)) { |
| 634 | errln("Error opening charset detector. - %s", u_errorName(status)); |
| 635 | } |
| 636 | const UCharsetMatch *match; |
| 637 | const char *name; |
| 638 | |
| 639 | ucsdet_setText(csd, bytes, bLength, &status); |
| 640 | match = ucsdet_detect(csd, &status); |
| 641 | |
| 642 | if (match == NULL) { |
| 643 | errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status)); |
| 644 | goto bail; |
| 645 | } |
| 646 | |
| 647 | name = ucsdet_getName(match, &status); |
| 648 | if (strcmp(name, "IBM424_rtl") != 0) { |
| 649 | errln("Encoding detection failure for IBM424_rtl: got %s", name); |
| 650 | } |
| 651 | |
| 652 | ucsdet_setText(csd, bytes_r, brLength, &status); |
| 653 | match = ucsdet_detect(csd, &status); |
| 654 | |
| 655 | if (match == NULL) { |
| 656 | errln("Encoding detection failure for IBM424_ltr: got no matches."); |
| 657 | goto bail; |
| 658 | } |
| 659 | |
| 660 | name = ucsdet_getName(match, &status); |
| 661 | if (strcmp(name, "IBM424_ltr") != 0) { |
| 662 | errln("Encoding detection failure for IBM424_ltr: got %s", name); |
| 663 | } |
| 664 | |
| 665 | bail: |
| 666 | delete[] bytes; |
| 667 | delete[] bytes_r; |
| 668 | ucsdet_close(csd); |
| 669 | #endif |
| 670 | } |
| 671 | |
| 672 | void CharsetDetectionTest::IBM420Test() |
| 673 | { |
| 674 | #if !UCONFIG_ONLY_HTML_CONVERSION |
| 675 | UErrorCode status = U_ZERO_ERROR; |
| 676 | |
| 677 | static const UChar chars[] = { |
| 678 | 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627, |
| 679 | 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641, |
| 680 | 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, |
| 681 | 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645, |
| 682 | 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A, |
| 683 | 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644, |
| 684 | 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020, |
| 685 | 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, |
| 686 | 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634, |
| 687 | 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F, |
| 688 | 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647, |
| 689 | 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627, |
| 690 | 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, |
| 691 | 0x0000 |
| 692 | }; |
| 693 | static const UChar chars_reverse[] = { |
| 694 | 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F, |
| 695 | 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020, |
| 696 | 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648, |
| 697 | 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628, |
| 698 | 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, |
| 699 | 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A, |
| 700 | 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644, |
| 701 | 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A, |
| 702 | 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A, |
| 703 | 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627, |
| 704 | 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A, |
| 705 | 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645, |
| 706 | 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648, |
| 707 | 0x0000, |
| 708 | }; |
| 709 | |
| 710 | int32_t bLength = 0, brLength = 0; |
| 711 | |
| 712 | UnicodeString s1(chars); |
| 713 | UnicodeString s2(chars_reverse); |
| 714 | |
| 715 | char *bytes = extractBytes(s1, "IBM420", bLength); |
| 716 | char *bytes_r = extractBytes(s2, "IBM420", brLength); |
| 717 | |
| 718 | UCharsetDetector *csd = ucsdet_open(&status); |
| 719 | if (U_FAILURE(status)) { |
| 720 | errln("Error opening charset detector. - %s", u_errorName(status)); |
| 721 | } |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 722 | ucsdet_setDetectableCharset(csd, "IBM424_rtl", true, &status); |
| 723 | ucsdet_setDetectableCharset(csd, "IBM424_ltr", true, &status); |
| 724 | ucsdet_setDetectableCharset(csd, "IBM420_rtl", true, &status); |
| 725 | ucsdet_setDetectableCharset(csd, "IBM420_ltr", true, &status); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 726 | const UCharsetMatch *match; |
| 727 | const char *name; |
| 728 | |
| 729 | ucsdet_setText(csd, bytes, bLength, &status); |
| 730 | match = ucsdet_detect(csd, &status); |
| 731 | |
| 732 | if (match == NULL) { |
| 733 | errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status)); |
| 734 | goto bail; |
| 735 | } |
| 736 | |
| 737 | name = ucsdet_getName(match, &status); |
| 738 | if (strcmp(name, "IBM420_rtl") != 0) { |
| 739 | errln("Encoding detection failure for IBM420_rtl: got %s\n", name); |
| 740 | } |
| 741 | |
| 742 | ucsdet_setText(csd, bytes_r, brLength, &status); |
| 743 | match = ucsdet_detect(csd, &status); |
| 744 | |
| 745 | if (match == NULL) { |
| 746 | errln("Encoding detection failure for IBM420_ltr: got no matches.\n"); |
| 747 | goto bail; |
| 748 | } |
| 749 | |
| 750 | name = ucsdet_getName(match, &status); |
| 751 | if (strcmp(name, "IBM420_ltr") != 0) { |
| 752 | errln("Encoding detection failure for IBM420_ltr: got %s\n", name); |
| 753 | } |
| 754 | |
| 755 | bail: |
| 756 | delete[] bytes; |
| 757 | delete[] bytes_r; |
| 758 | ucsdet_close(csd); |
| 759 | #endif |
| 760 | } |
| 761 | |
| 762 | |
| 763 | void CharsetDetectionTest::Ticket6394Test() { |
| 764 | #if !UCONFIG_NO_CONVERSION |
| 765 | const char charText[] = "Here is some random English text that should be detected as ISO-8859-1." |
| 766 | "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected " |
| 767 | "encodings more than once. The hop through UnicodeString is for platforms " |
| 768 | "where this char * string is be EBCDIC and needs conversion to Latin1."; |
| 769 | char latin1Text[sizeof(charText)]; |
| 770 | UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1"); |
| 771 | |
| 772 | UErrorCode status = U_ZERO_ERROR; |
| 773 | UCharsetDetector *csd = ucsdet_open(&status); |
| 774 | ucsdet_setText(csd, latin1Text, -1, &status); |
| 775 | if (U_FAILURE(status)) { |
| 776 | errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status)); |
| 777 | return; |
| 778 | } |
| 779 | |
| 780 | int32_t matchCount = 0; |
| 781 | const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status); |
| 782 | if (U_FAILURE(status)) { |
| 783 | errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status)); |
| 784 | return; |
| 785 | } |
| 786 | |
| 787 | UnicodeSet setOfCharsetNames; // UnicodeSets can hold strings. |
| 788 | int32_t i; |
| 789 | for (i=0; i<matchCount; i++) { |
| 790 | UnicodeString charSetName(ucsdet_getName(matches[i], &status)); |
| 791 | if (U_FAILURE(status)) { |
| 792 | errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __LINE__, u_errorName(status), i); |
| 793 | status = U_ZERO_ERROR; |
| 794 | } |
| 795 | if (setOfCharsetNames.contains(charSetName)) { |
| 796 | errln("Fail at file %s, line %d ", __FILE__, __LINE__); |
| 797 | errln(UnicodeString(" Duplicate charset name = ") + charSetName); |
| 798 | } |
| 799 | setOfCharsetNames.add(charSetName); |
| 800 | } |
| 801 | ucsdet_close(csd); |
| 802 | #endif |
| 803 | } |
| 804 | |
| 805 | |
| 806 | // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between |
| 807 | // similar Windows and non-Windows SBCS encodings. State was kept in the shared |
| 808 | // Charset Recognizer objects, and could be overwritten. |
| 809 | void CharsetDetectionTest::Ticket6954Test() { |
| 810 | #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FORMATTING |
| 811 | UErrorCode status = U_ZERO_ERROR; |
| 812 | UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; |
| 813 | UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly." |
| 814 | "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV); |
| 815 | UnicodeString sWindows = ssWindows.unescape(); |
| 816 | int32_t lISO = 0, lWindows = 0; |
| 817 | std::unique_ptr<char[]> bISO(extractBytes(sISO, "ISO-8859-1", lISO)); |
| 818 | std::unique_ptr<char[]> bWindows(extractBytes(sWindows, "windows-1252", lWindows)); |
| 819 | |
| 820 | // First do a plain vanilla detect of 1252 text |
| 821 | |
| 822 | LocalUCharsetDetectorPointer csd1(ucsdet_open(&status)); |
| 823 | ucsdet_setText(csd1.getAlias(), bWindows.get(), lWindows, &status); |
| 824 | const UCharsetMatch *match1 = ucsdet_detect(csd1.getAlias(), &status); |
| 825 | const char *name1 = ucsdet_getName(match1, &status); |
| 826 | TEST_ASSERT_SUCCESS(status); |
| 827 | TEST_ASSERT(strcmp(name1, "windows-1252")==0); |
| 828 | |
| 829 | // Next, using a completely separate detector, detect some 8859-1 text |
| 830 | |
| 831 | LocalUCharsetDetectorPointer csd2(ucsdet_open(&status)); |
| 832 | ucsdet_setText(csd2.getAlias(), bISO.get(), lISO, &status); |
| 833 | const UCharsetMatch *match2 = ucsdet_detect(csd2.getAlias(), &status); |
| 834 | const char *name2 = ucsdet_getName(match2, &status); |
| 835 | TEST_ASSERT_SUCCESS(status); |
| 836 | TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0); |
| 837 | |
| 838 | // Recheck the 1252 results from the first detector, which should not have been |
| 839 | // altered by the use of a different detector. |
| 840 | |
| 841 | name1 = ucsdet_getName(match1, &status); |
| 842 | TEST_ASSERT_SUCCESS(status); |
| 843 | TEST_ASSERT(strcmp(name1, "windows-1252")==0); |
| 844 | #endif |
| 845 | } |
Frank Tang | d2858cb | 2022-04-08 20:34:12 -0700 | [diff] [blame] | 846 | |
| 847 | |
| 848 | // Ticket 21823 - Issue with Charset Detector for ill-formed input strings. |
| 849 | // Its fix involves returning a failure based error code |
| 850 | // (U_INVALID_CHAR_FOUND) incase no charsets appear to match the input data. |
| 851 | void CharsetDetectionTest::Ticket21823Test() { |
| 852 | UErrorCode status = U_ZERO_ERROR; |
| 853 | std::string str = "\x80"; |
| 854 | UCharsetDetector* csd = ucsdet_open(&status); |
| 855 | |
| 856 | ucsdet_setText(csd, str.data(), str.length(), &status); |
| 857 | const UCharsetMatch* match = ucsdet_detect(csd, &status); |
| 858 | |
| 859 | if (match == NULL) { |
| 860 | TEST_ASSERT(U_FAILURE(status)); |
| 861 | } |
| 862 | |
| 863 | ucsdet_close(csd); |
| 864 | } |