Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | /******************************************************************** |
| 4 | * COPYRIGHT: |
| 5 | * Copyright (c) 1997-2016, International Business Machines Corporation and |
| 6 | * others. All Rights Reserved. |
| 7 | ********************************************************************/ |
| 8 | |
| 9 | #include "unicode/utypes.h" |
| 10 | |
| 11 | #if !UCONFIG_NO_COLLATION |
| 12 | |
| 13 | #include "unicode/coll.h" |
| 14 | #include "unicode/localpointer.h" |
| 15 | #include "unicode/tblcoll.h" |
| 16 | #include "unicode/unistr.h" |
| 17 | #include "unicode/sortkey.h" |
| 18 | #include "regcoll.h" |
| 19 | #include "sfwdchit.h" |
| 20 | #include "testutil.h" |
| 21 | #include "cmemory.h" |
| 22 | |
| 23 | CollationRegressionTest::CollationRegressionTest() |
| 24 | { |
| 25 | UErrorCode status = U_ZERO_ERROR; |
| 26 | |
| 27 | en_us = (RuleBasedCollator *)Collator::createInstance(Locale::getUS(), status); |
| 28 | if(U_FAILURE(status)) { |
| 29 | delete en_us; |
| 30 | en_us = 0; |
| 31 | errcheckln(status, "Collator creation failed with %s", u_errorName(status)); |
| 32 | return; |
| 33 | } |
| 34 | } |
| 35 | |
| 36 | CollationRegressionTest::~CollationRegressionTest() |
| 37 | { |
| 38 | delete en_us; |
| 39 | } |
| 40 | |
| 41 | |
| 42 | // @bug 4048446 |
| 43 | // |
| 44 | // CollationElementIterator.reset() doesn't work |
| 45 | // |
| 46 | void CollationRegressionTest::Test4048446(/* char* par */) |
| 47 | { |
| 48 | const UnicodeString test1 = "XFILE What subset of all possible test cases has the highest probability of detecting the most errors?"; |
| 49 | const UnicodeString test2 = "Xf_ile What subset of all possible test cases has the lowest probability of detecting the least errors?"; |
| 50 | CollationElementIterator *i1 = en_us->createCollationElementIterator(test1); |
| 51 | CollationElementIterator *i2 = en_us->createCollationElementIterator(test1); |
| 52 | UErrorCode status = U_ZERO_ERROR; |
| 53 | |
| 54 | if (i1 == NULL|| i2 == NULL) |
| 55 | { |
| 56 | errln("Could not create CollationElementIterator's"); |
| 57 | delete i1; |
| 58 | delete i2; |
| 59 | return; |
| 60 | } |
| 61 | |
| 62 | while (i1->next(status) != CollationElementIterator::NULLORDER) |
| 63 | { |
| 64 | if (U_FAILURE(status)) |
| 65 | { |
| 66 | errln("error calling next()"); |
| 67 | |
| 68 | delete i1; |
| 69 | delete i2; |
| 70 | return; |
| 71 | } |
| 72 | } |
| 73 | |
| 74 | i1->reset(); |
| 75 | |
| 76 | assertEqual(*i1, *i2); |
| 77 | |
| 78 | delete i1; |
| 79 | delete i2; |
| 80 | } |
| 81 | |
| 82 | // @bug 4051866 |
| 83 | // |
| 84 | // Collator -> rules -> Collator round-trip broken for expanding characters |
| 85 | // |
| 86 | void CollationRegressionTest::Test4051866(/* char* par */) |
| 87 | { |
| 88 | UnicodeString rules; |
| 89 | UErrorCode status = U_ZERO_ERROR; |
| 90 | |
| 91 | rules += "&n < o "; |
| 92 | rules += "& oe ,o"; |
| 93 | rules += (UChar)0x3080; |
| 94 | rules += "& oe ,"; |
| 95 | rules += (UChar)0x1530; |
| 96 | rules += " ,O"; |
| 97 | rules += "& OE ,O"; |
| 98 | rules += (UChar)0x3080; |
| 99 | rules += "& OE ,"; |
| 100 | rules += (UChar)0x1520; |
| 101 | rules += "< p ,P"; |
| 102 | |
| 103 | // Build a collator containing expanding characters |
| 104 | LocalPointer<RuleBasedCollator> c1(new RuleBasedCollator(rules, status), status); |
| 105 | if (U_FAILURE(status)) { |
| 106 | errln("RuleBasedCollator(rule string) failed - %s", u_errorName(status)); |
| 107 | return; |
| 108 | } |
| 109 | |
| 110 | // Build another using the rules from the first |
| 111 | LocalPointer<RuleBasedCollator> c2(new RuleBasedCollator(c1->getRules(), status), status); |
| 112 | if (U_FAILURE(status)) { |
| 113 | errln("RuleBasedCollator(rule string from other RBC) failed - %s", u_errorName(status)); |
| 114 | return; |
| 115 | } |
| 116 | |
| 117 | // Make sure they're the same |
| 118 | if (!(c1->getRules() == c2->getRules())) |
| 119 | { |
| 120 | errln("Rules are not equal"); |
| 121 | } |
| 122 | } |
| 123 | |
| 124 | // @bug 4053636 |
| 125 | // |
| 126 | // Collator thinks "black-bird" == "black" |
| 127 | // |
| 128 | void CollationRegressionTest::Test4053636(/* char* par */) |
| 129 | { |
| 130 | if (en_us->equals("black_bird", "black")) |
| 131 | { |
| 132 | errln("black-bird == black"); |
| 133 | } |
| 134 | } |
| 135 | |
| 136 | // @bug 4054238 |
| 137 | // |
| 138 | // CollationElementIterator will not work correctly if the associated |
| 139 | // Collator object's mode is changed |
| 140 | // |
| 141 | void CollationRegressionTest::Test4054238(/* char* par */) |
| 142 | { |
| 143 | const UChar chars3[] = {0x61, 0x00FC, 0x62, 0x65, 0x63, 0x6b, 0x20, 0x47, 0x72, 0x00F6, 0x00DF, 0x65, 0x20, 0x4c, 0x00FC, 0x62, 0x63, 0x6b, 0}; |
| 144 | const UnicodeString test3(chars3); |
| 145 | RuleBasedCollator *c = en_us->clone(); |
| 146 | |
| 147 | // NOTE: The Java code uses en_us to create the CollationElementIterators |
| 148 | // but I'm pretty sure that's wrong, so I've changed this to use c. |
| 149 | UErrorCode status = U_ZERO_ERROR; |
| 150 | c->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); |
| 151 | CollationElementIterator *i1 = c->createCollationElementIterator(test3); |
| 152 | delete i1; |
| 153 | delete c; |
| 154 | } |
| 155 | |
| 156 | // @bug 4054734 |
| 157 | // |
| 158 | // Collator::IDENTICAL documented but not implemented |
| 159 | // |
| 160 | void CollationRegressionTest::Test4054734(/* char* par */) |
| 161 | { |
| 162 | /* |
| 163 | Here's the original Java: |
| 164 | |
| 165 | String[] decomp = { |
| 166 | "\u0001", "<", "\u0002", |
| 167 | "\u0001", "=", "\u0001", |
| 168 | "A\u0001", ">", "~\u0002", // Ensure A and ~ are not compared bitwise |
| 169 | "\u00C0", "=", "A\u0300" // Decomp should make these equal |
| 170 | }; |
| 171 | |
| 172 | String[] nodecomp = { |
| 173 | "\u00C0", ">", "A\u0300" // A-grave vs. A combining-grave |
| 174 | }; |
| 175 | */ |
| 176 | |
| 177 | static const UChar decomp[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 178 | { |
| 179 | {0x0001, 0}, {0x3c, 0}, {0x0002, 0}, |
| 180 | {0x0001, 0}, {0x3d, 0}, {0x0001, 0}, |
| 181 | {0x41, 0x0001, 0}, {0x3e, 0}, {0x7e, 0x0002, 0}, |
| 182 | {0x00c0, 0}, {0x3d, 0}, {0x41, 0x0300, 0} |
| 183 | }; |
| 184 | |
| 185 | |
| 186 | UErrorCode status = U_ZERO_ERROR; |
| 187 | RuleBasedCollator *c = en_us->clone(); |
| 188 | |
| 189 | c->setStrength(Collator::IDENTICAL); |
| 190 | |
| 191 | c->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); |
| 192 | compareArray(*c, decomp, UPRV_LENGTHOF(decomp)); |
| 193 | |
| 194 | delete c; |
| 195 | } |
| 196 | |
| 197 | // @bug 4054736 |
| 198 | // |
| 199 | // Full Decomposition mode not implemented |
| 200 | // |
| 201 | void CollationRegressionTest::Test4054736(/* char* par */) |
| 202 | { |
| 203 | UErrorCode status = U_ZERO_ERROR; |
| 204 | RuleBasedCollator *c = en_us->clone(); |
| 205 | |
| 206 | c->setStrength(Collator::SECONDARY); |
| 207 | c->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); |
| 208 | |
| 209 | static const UChar tests[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 210 | { |
| 211 | {0xFB4F, 0}, {0x3d, 0}, {0x05D0, 0x05DC} // Alef-Lamed vs. Alef, Lamed |
| 212 | }; |
| 213 | |
| 214 | compareArray(*c, tests, UPRV_LENGTHOF(tests)); |
| 215 | |
| 216 | delete c; |
| 217 | } |
| 218 | |
| 219 | // @bug 4058613 |
| 220 | // |
| 221 | // Collator::createInstance() causes an ArrayIndexOutofBoundsException for Korean |
| 222 | // |
| 223 | void CollationRegressionTest::Test4058613(/* char* par */) |
| 224 | { |
| 225 | // Creating a default collator doesn't work when Korean is the default |
| 226 | // locale |
| 227 | |
| 228 | Locale oldDefault = Locale::getDefault(); |
| 229 | UErrorCode status = U_ZERO_ERROR; |
| 230 | |
| 231 | Locale::setDefault(Locale::getKorean(), status); |
| 232 | |
| 233 | if (U_FAILURE(status)) |
| 234 | { |
| 235 | errln("Could not set default locale to Locale::KOREAN"); |
| 236 | return; |
| 237 | } |
| 238 | |
| 239 | Collator *c = NULL; |
| 240 | |
| 241 | c = Collator::createInstance("en_US", status); |
| 242 | |
| 243 | if (c == NULL || U_FAILURE(status)) |
| 244 | { |
| 245 | errln("Could not create a Korean collator"); |
| 246 | Locale::setDefault(oldDefault, status); |
| 247 | delete c; |
| 248 | return; |
| 249 | } |
| 250 | |
| 251 | // Since the fix to this bug was to turn off decomposition for Korean collators, |
| 252 | // ensure that's what we got |
| 253 | if (c->getAttribute(UCOL_NORMALIZATION_MODE, status) != UCOL_OFF) |
| 254 | { |
| 255 | errln("Decomposition is not set to NO_DECOMPOSITION for Korean collator"); |
| 256 | } |
| 257 | |
| 258 | delete c; |
| 259 | |
| 260 | Locale::setDefault(oldDefault, status); |
| 261 | } |
| 262 | |
| 263 | // @bug 4059820 |
| 264 | // |
| 265 | // RuleBasedCollator.getRules does not return the exact pattern as input |
| 266 | // for expanding character sequences |
| 267 | // |
| 268 | void CollationRegressionTest::Test4059820(/* char* par */) |
| 269 | { |
| 270 | UErrorCode status = U_ZERO_ERROR; |
| 271 | |
| 272 | RuleBasedCollator *c = NULL; |
| 273 | UnicodeString rules = "&9 < a < b , c/a < d < z"; |
| 274 | |
| 275 | c = new RuleBasedCollator(rules, status); |
| 276 | |
| 277 | if (c == NULL || U_FAILURE(status)) |
| 278 | { |
| 279 | errln("Failure building a collator."); |
| 280 | delete c; |
| 281 | return; |
| 282 | } |
| 283 | |
| 284 | if ( c->getRules().indexOf("c/a") == -1) |
| 285 | { |
| 286 | errln("returned rules do not contain 'c/a'"); |
| 287 | } |
| 288 | |
| 289 | delete c; |
| 290 | } |
| 291 | |
| 292 | // @bug 4060154 |
| 293 | // |
| 294 | // MergeCollation::fixEntry broken for "& H < \u0131, \u0130, i, I" |
| 295 | // |
| 296 | void CollationRegressionTest::Test4060154(/* char* par */) |
| 297 | { |
| 298 | UErrorCode status = U_ZERO_ERROR; |
| 299 | UnicodeString rules; |
| 300 | |
| 301 | rules += "&f < g, G < h, H < i, I < j, J"; |
| 302 | rules += " & H < "; |
| 303 | rules += (UChar)0x0131; |
| 304 | rules += ", "; |
| 305 | rules += (UChar)0x0130; |
| 306 | rules += ", i, I"; |
| 307 | |
| 308 | RuleBasedCollator *c = NULL; |
| 309 | |
| 310 | c = new RuleBasedCollator(rules, status); |
| 311 | |
| 312 | if (c == NULL || U_FAILURE(status)) |
| 313 | { |
| 314 | errln("failure building collator."); |
| 315 | delete c; |
| 316 | return; |
| 317 | } |
| 318 | |
| 319 | c->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); |
| 320 | |
| 321 | /* |
| 322 | String[] tertiary = { |
| 323 | "A", "<", "B", |
| 324 | "H", "<", "\u0131", |
| 325 | "H", "<", "I", |
| 326 | "\u0131", "<", "\u0130", |
| 327 | "\u0130", "<", "i", |
| 328 | "\u0130", ">", "H", |
| 329 | }; |
| 330 | */ |
| 331 | |
| 332 | static const UChar tertiary[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 333 | { |
| 334 | {0x41, 0}, {0x3c, 0}, {0x42, 0}, |
| 335 | {0x48, 0}, {0x3c, 0}, {0x0131, 0}, |
| 336 | {0x48, 0}, {0x3c, 0}, {0x49, 0}, |
| 337 | {0x0131, 0}, {0x3c, 0}, {0x0130, 0}, |
| 338 | {0x0130, 0}, {0x3c, 0}, {0x69, 0}, |
| 339 | {0x0130, 0}, {0x3e, 0}, {0x48, 0} |
| 340 | }; |
| 341 | |
| 342 | c->setStrength(Collator::TERTIARY); |
| 343 | compareArray(*c, tertiary, UPRV_LENGTHOF(tertiary)); |
| 344 | |
| 345 | /* |
| 346 | String[] secondary = { |
| 347 | "H", "<", "I", |
| 348 | "\u0131", "=", "\u0130", |
| 349 | }; |
| 350 | */ |
| 351 | static const UChar secondary[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 352 | { |
| 353 | {0x48, 0}, {0x3c, 0}, {0x49, 0}, |
| 354 | {0x0131, 0}, {0x3d, 0}, {0x0130, 0} |
| 355 | }; |
| 356 | |
| 357 | c->setStrength(Collator::PRIMARY); |
| 358 | compareArray(*c, secondary, UPRV_LENGTHOF(secondary)); |
| 359 | |
| 360 | delete c; |
| 361 | } |
| 362 | |
| 363 | // @bug 4062418 |
| 364 | // |
| 365 | // Secondary/Tertiary comparison incorrect in French Secondary |
| 366 | // |
| 367 | void CollationRegressionTest::Test4062418(/* char* par */) |
| 368 | { |
| 369 | UErrorCode status = U_ZERO_ERROR; |
| 370 | |
| 371 | RuleBasedCollator *c = NULL; |
| 372 | |
| 373 | c = (RuleBasedCollator *) Collator::createInstance(Locale::getCanadaFrench(), status); |
| 374 | |
| 375 | if (c == NULL || U_FAILURE(status)) |
| 376 | { |
| 377 | errln("Failed to create collator for Locale::getCanadaFrench()"); |
| 378 | delete c; |
| 379 | return; |
| 380 | } |
| 381 | |
| 382 | c->setStrength(Collator::SECONDARY); |
| 383 | |
| 384 | /* |
| 385 | String[] tests = { |
| 386 | "p\u00eache", "<", "p\u00e9ch\u00e9", // Comparing accents from end, p\u00e9ch\u00e9 is greater |
| 387 | }; |
| 388 | */ |
| 389 | static const UChar tests[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 390 | { |
| 391 | {0x70, 0x00EA, 0x63, 0x68, 0x65, 0}, {0x3c, 0}, {0x70, 0x00E9, 0x63, 0x68, 0x00E9, 0} |
| 392 | }; |
| 393 | |
| 394 | compareArray(*c, tests, UPRV_LENGTHOF(tests)); |
| 395 | |
| 396 | delete c; |
| 397 | } |
| 398 | |
| 399 | // @bug 4065540 |
| 400 | // |
| 401 | // Collator::compare() method broken if either string contains spaces |
| 402 | // |
| 403 | void CollationRegressionTest::Test4065540(/* char* par */) |
| 404 | { |
| 405 | if (en_us->compare("abcd e", "abcd f") == 0) |
| 406 | { |
| 407 | errln("'abcd e' == 'abcd f'"); |
| 408 | } |
| 409 | } |
| 410 | |
| 411 | // @bug 4066189 |
| 412 | // |
| 413 | // Unicode characters need to be recursively decomposed to get the |
| 414 | // correct result. For example, |
| 415 | // u1EB1 -> \u0103 + \u0300 -> a + \u0306 + \u0300. |
| 416 | // |
| 417 | void CollationRegressionTest::Test4066189(/* char* par */) |
| 418 | { |
| 419 | static const UChar chars1[] = {0x1EB1, 0}; |
| 420 | static const UChar chars2[] = {0x61, 0x0306, 0x0300, 0}; |
| 421 | const UnicodeString test1(chars1); |
| 422 | const UnicodeString test2(chars2); |
| 423 | UErrorCode status = U_ZERO_ERROR; |
| 424 | |
| 425 | // NOTE: The java code used en_us to create the |
| 426 | // CollationElementIterator's. I'm pretty sure that |
| 427 | // was wrong, so I've change the code to use c1 and c2 |
| 428 | RuleBasedCollator *c1 = en_us->clone(); |
| 429 | c1->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); |
| 430 | CollationElementIterator *i1 = c1->createCollationElementIterator(test1); |
| 431 | |
| 432 | RuleBasedCollator *c2 = en_us->clone(); |
| 433 | c2->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, status); |
| 434 | CollationElementIterator *i2 = c2->createCollationElementIterator(test2); |
| 435 | |
| 436 | assertEqual(*i1, *i2); |
| 437 | |
| 438 | delete i2; |
| 439 | delete c2; |
| 440 | delete i1; |
| 441 | delete c1; |
| 442 | } |
| 443 | |
| 444 | // @bug 4066696 |
| 445 | // |
| 446 | // French secondary collation checking at the end of compare iteration fails |
| 447 | // |
| 448 | void CollationRegressionTest::Test4066696(/* char* par */) |
| 449 | { |
| 450 | UErrorCode status = U_ZERO_ERROR; |
| 451 | RuleBasedCollator *c = NULL; |
| 452 | |
| 453 | c = (RuleBasedCollator *)Collator::createInstance(Locale::getCanadaFrench(), status); |
| 454 | |
| 455 | if (c == NULL || U_FAILURE(status)) |
| 456 | { |
| 457 | errln("Failure creating collator for Locale::getCanadaFrench()"); |
| 458 | delete c; |
| 459 | return; |
| 460 | } |
| 461 | |
| 462 | c->setStrength(Collator::SECONDARY); |
| 463 | |
| 464 | /* |
| 465 | String[] tests = { |
| 466 | "\u00e0", "<", "\u01fa", // a-grave < A-ring-acute |
| 467 | }; |
| 468 | |
| 469 | should be: |
| 470 | |
| 471 | String[] tests = { |
| 472 | "\u00e0", ">", "\u01fa", // a-grave < A-ring-acute |
| 473 | }; |
| 474 | |
| 475 | */ |
| 476 | |
| 477 | static const UChar tests[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 478 | { |
| 479 | {0x00E0, 0}, {0x3e, 0}, {0x01FA, 0} |
| 480 | }; |
| 481 | |
| 482 | compareArray(*c, tests, UPRV_LENGTHOF(tests)); |
| 483 | |
| 484 | delete c; |
| 485 | } |
| 486 | |
| 487 | // @bug 4076676 |
| 488 | // |
| 489 | // Bad canonicalization of same-class combining characters |
| 490 | // |
| 491 | void CollationRegressionTest::Test4076676(/* char* par */) |
| 492 | { |
| 493 | // These combining characters are all in the same class, so they should not |
| 494 | // be reordered, and they should compare as unequal. |
| 495 | static const UChar s1[] = {0x41, 0x0301, 0x0302, 0x0300, 0}; |
| 496 | static const UChar s2[] = {0x41, 0x0302, 0x0300, 0x0301, 0}; |
| 497 | |
| 498 | RuleBasedCollator *c = en_us->clone(); |
| 499 | c->setStrength(Collator::TERTIARY); |
| 500 | |
| 501 | if (c->compare(s1,s2) == 0) |
| 502 | { |
| 503 | errln("Same-class combining chars were reordered"); |
| 504 | } |
| 505 | |
| 506 | delete c; |
| 507 | } |
| 508 | |
| 509 | // @bug 4079231 |
| 510 | // |
| 511 | // RuleBasedCollator::operator==(NULL) throws NullPointerException |
| 512 | // |
| 513 | void CollationRegressionTest::Test4079231(/* char* par */) |
| 514 | { |
| 515 | // I don't think there's any way to write this test |
| 516 | // in C++. The following is equivalent to the Java, |
| 517 | // but doesn't compile 'cause NULL can't be converted |
| 518 | // to Collator& |
| 519 | // |
| 520 | // if (en_us->operator==(NULL)) |
| 521 | // { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 522 | // errln("en_us->operator==(NULL) returned true"); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 523 | // } |
| 524 | |
| 525 | /* |
| 526 | try { |
| 527 | if (en_us->equals(null)) { |
| 528 | errln("en_us->equals(null) returned true"); |
| 529 | } |
| 530 | } |
| 531 | catch (Exception e) { |
| 532 | errln("en_us->equals(null) threw " + e.toString()); |
| 533 | } |
| 534 | */ |
| 535 | } |
| 536 | |
| 537 | // @bug 4078588 |
| 538 | // |
| 539 | // RuleBasedCollator breaks on "< a < bb" rule |
| 540 | // |
| 541 | void CollationRegressionTest::Test4078588(/* char *par */) |
| 542 | { |
| 543 | UErrorCode status = U_ZERO_ERROR; |
| 544 | RuleBasedCollator *rbc = new RuleBasedCollator("&9 < a < bb", status); |
| 545 | |
| 546 | if (rbc == NULL || U_FAILURE(status)) |
| 547 | { |
| 548 | errln("Failed to create RuleBasedCollator."); |
| 549 | delete rbc; |
| 550 | return; |
| 551 | } |
| 552 | |
| 553 | Collator::EComparisonResult result = rbc->compare("a","bb"); |
| 554 | |
| 555 | if (result != Collator::LESS) |
| 556 | { |
| 557 | errln((UnicodeString)"Compare(a,bb) returned " + (int)result |
| 558 | + (UnicodeString)"; expected -1"); |
| 559 | } |
| 560 | |
| 561 | delete rbc; |
| 562 | } |
| 563 | |
| 564 | // @bug 4081866 |
| 565 | // |
| 566 | // Combining characters in different classes not reordered properly. |
| 567 | // |
| 568 | void CollationRegressionTest::Test4081866(/* char* par */) |
| 569 | { |
| 570 | // These combining characters are all in different classes, |
| 571 | // so they should be reordered and the strings should compare as equal. |
| 572 | static const UChar s1[] = {0x41, 0x0300, 0x0316, 0x0327, 0x0315, 0}; |
| 573 | static const UChar s2[] = {0x41, 0x0327, 0x0316, 0x0315, 0x0300, 0}; |
| 574 | |
| 575 | UErrorCode status = U_ZERO_ERROR; |
| 576 | RuleBasedCollator *c = en_us->clone(); |
| 577 | c->setStrength(Collator::TERTIARY); |
| 578 | |
| 579 | // Now that the default collators are set to NO_DECOMPOSITION |
| 580 | // (as a result of fixing bug 4114077), we must set it explicitly |
| 581 | // when we're testing reordering behavior. -- lwerner, 5/5/98 |
| 582 | c->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); |
| 583 | |
| 584 | if (c->compare(s1,s2) != 0) |
| 585 | { |
| 586 | errln("Combining chars were not reordered"); |
| 587 | } |
| 588 | |
| 589 | delete c; |
| 590 | } |
| 591 | |
| 592 | // @bug 4087241 |
| 593 | // |
| 594 | // string comparison errors in Scandinavian collators |
| 595 | // |
| 596 | void CollationRegressionTest::Test4087241(/* char* par */) |
| 597 | { |
| 598 | UErrorCode status = U_ZERO_ERROR; |
| 599 | Locale da_DK("da", "DK"); |
| 600 | RuleBasedCollator *c = NULL; |
| 601 | |
| 602 | c = (RuleBasedCollator *) Collator::createInstance(da_DK, status); |
| 603 | |
| 604 | if (c == NULL || U_FAILURE(status)) |
| 605 | { |
| 606 | errln("Failed to create collator for da_DK locale"); |
| 607 | delete c; |
| 608 | return; |
| 609 | } |
| 610 | |
| 611 | c->setStrength(Collator::SECONDARY); |
| 612 | |
| 613 | static const UChar tests[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 614 | { |
| 615 | {0x7a, 0}, {0x3c, 0}, {0x00E6, 0}, // z < ae |
| 616 | {0x61, 0x0308, 0}, {0x3c, 0}, {0x61, 0x030A, 0}, // a-umlaut < a-ring |
| 617 | {0x59, 0}, {0x3c, 0}, {0x75, 0x0308, 0}, // Y < u-umlaut |
| 618 | }; |
| 619 | |
| 620 | compareArray(*c, tests, UPRV_LENGTHOF(tests)); |
| 621 | |
| 622 | delete c; |
| 623 | } |
| 624 | |
| 625 | // @bug 4087243 |
| 626 | // |
| 627 | // CollationKey takes ignorable strings into account when it shouldn't |
| 628 | // |
| 629 | void CollationRegressionTest::Test4087243(/* char* par */) |
| 630 | { |
| 631 | RuleBasedCollator *c = en_us->clone(); |
| 632 | c->setStrength(Collator::TERTIARY); |
| 633 | |
| 634 | static const UChar tests[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 635 | { |
| 636 | {0x31, 0x32, 0x33, 0}, {0x3d, 0}, {0x31, 0x32, 0x33, 0x0001, 0} // 1 2 3 = 1 2 3 ctrl-A |
| 637 | }; |
| 638 | |
| 639 | compareArray(*c, tests, UPRV_LENGTHOF(tests)); |
| 640 | |
| 641 | delete c; |
| 642 | } |
| 643 | |
| 644 | // @bug 4092260 |
| 645 | // |
| 646 | // Mu/micro conflict |
| 647 | // Micro symbol and greek lowercase letter Mu should sort identically |
| 648 | // |
| 649 | void CollationRegressionTest::Test4092260(/* char* par */) |
| 650 | { |
| 651 | UErrorCode status = U_ZERO_ERROR; |
| 652 | Locale el("el", ""); |
| 653 | Collator *c = NULL; |
| 654 | |
| 655 | c = Collator::createInstance(el, status); |
| 656 | |
| 657 | if (c == NULL || U_FAILURE(status)) |
| 658 | { |
| 659 | errln("Failed to create collator for el locale."); |
| 660 | delete c; |
| 661 | return; |
| 662 | } |
| 663 | |
| 664 | // These now have tertiary differences in UCA |
| 665 | c->setAttribute(UCOL_STRENGTH, UCOL_SECONDARY, status); |
| 666 | |
| 667 | static const UChar tests[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 668 | { |
| 669 | {0x00B5, 0}, {0x3d, 0}, {0x03BC, 0} |
| 670 | }; |
| 671 | |
| 672 | compareArray(*c, tests, UPRV_LENGTHOF(tests)); |
| 673 | |
| 674 | delete c; |
| 675 | } |
| 676 | |
| 677 | // @bug 4095316 |
| 678 | // |
| 679 | void CollationRegressionTest::Test4095316(/* char* par */) |
| 680 | { |
| 681 | UErrorCode status = U_ZERO_ERROR; |
| 682 | Locale el_GR("el", "GR"); |
| 683 | Collator *c = Collator::createInstance(el_GR, status); |
| 684 | |
| 685 | if (c == NULL || U_FAILURE(status)) |
| 686 | { |
| 687 | errln("Failed to create collator for el_GR locale"); |
| 688 | delete c; |
| 689 | return; |
| 690 | } |
| 691 | // These now have tertiary differences in UCA |
| 692 | //c->setStrength(Collator::TERTIARY); |
| 693 | c->setAttribute(UCOL_STRENGTH, UCOL_SECONDARY, status); |
| 694 | |
| 695 | static const UChar tests[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 696 | { |
| 697 | {0x03D4, 0}, {0x3d, 0}, {0x03AB, 0} |
| 698 | }; |
| 699 | |
| 700 | compareArray(*c, tests, UPRV_LENGTHOF(tests)); |
| 701 | |
| 702 | delete c; |
| 703 | } |
| 704 | |
| 705 | // @bug 4101940 |
| 706 | // |
| 707 | void CollationRegressionTest::Test4101940(/* char* par */) |
| 708 | { |
| 709 | UErrorCode status = U_ZERO_ERROR; |
| 710 | RuleBasedCollator *c = NULL; |
| 711 | UnicodeString rules = "&9 < a < b"; |
| 712 | UnicodeString nothing = ""; |
| 713 | |
| 714 | c = new RuleBasedCollator(rules, status); |
| 715 | |
| 716 | if (c == NULL || U_FAILURE(status)) |
| 717 | { |
| 718 | errln("Failed to create RuleBasedCollator"); |
| 719 | delete c; |
| 720 | return; |
| 721 | } |
| 722 | |
| 723 | CollationElementIterator *i = c->createCollationElementIterator(nothing); |
| 724 | i->reset(); |
| 725 | |
| 726 | if (i->next(status) != CollationElementIterator::NULLORDER) |
| 727 | { |
| 728 | errln("next did not return NULLORDER"); |
| 729 | } |
| 730 | |
| 731 | delete i; |
| 732 | delete c; |
| 733 | } |
| 734 | |
| 735 | // @bug 4103436 |
| 736 | // |
| 737 | // Collator::compare not handling spaces properly |
| 738 | // |
| 739 | void CollationRegressionTest::Test4103436(/* char* par */) |
| 740 | { |
| 741 | RuleBasedCollator *c = en_us->clone(); |
| 742 | c->setStrength(Collator::TERTIARY); |
| 743 | |
| 744 | static const UChar tests[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 745 | { |
| 746 | {0x66, 0x69, 0x6c, 0x65, 0}, {0x3c, 0}, {0x66, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x63, 0x63, 0x65, 0x73, 0x73, 0}, |
| 747 | {0x66, 0x69, 0x6c, 0x65, 0}, {0x3c, 0}, {0x66, 0x69, 0x6c, 0x65, 0x61, 0x63, 0x63, 0x65, 0x73, 0x73, 0} |
| 748 | }; |
| 749 | |
| 750 | compareArray(*c, tests, UPRV_LENGTHOF(tests)); |
| 751 | |
| 752 | delete c; |
| 753 | } |
| 754 | |
| 755 | // @bug 4114076 |
| 756 | // |
| 757 | // Collation not Unicode conformant with Hangul syllables |
| 758 | // |
| 759 | void CollationRegressionTest::Test4114076(/* char* par */) |
| 760 | { |
| 761 | UErrorCode status = U_ZERO_ERROR; |
| 762 | RuleBasedCollator *c = en_us->clone(); |
| 763 | c->setStrength(Collator::TERTIARY); |
| 764 | |
| 765 | // |
| 766 | // With Canonical decomposition, Hangul syllables should get decomposed |
| 767 | // into Jamo, but Jamo characters should not be decomposed into |
| 768 | // conjoining Jamo |
| 769 | // |
| 770 | static const UChar test1[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 771 | { |
| 772 | {0xd4db, 0}, {0x3d, 0}, {0x1111, 0x1171, 0x11b6, 0} |
| 773 | }; |
| 774 | |
| 775 | c->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); |
| 776 | compareArray(*c, test1, UPRV_LENGTHOF(test1)); |
| 777 | |
| 778 | // From UTR #15: |
| 779 | // *In earlier versions of Unicode, jamo characters like ksf |
| 780 | // had compatibility mappings to kf + sf. These mappings were |
| 781 | // removed in Unicode 2.1.9 to ensure that Hangul syllables are maintained.) |
| 782 | // That is, the following test is obsolete as of 2.1.9 |
| 783 | |
| 784 | //obsolete- // With Full decomposition, it should go all the way down to |
| 785 | //obsolete- // conjoining Jamo characters. |
| 786 | //obsolete- // |
| 787 | //obsolete- static const UChar test2[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 788 | //obsolete- { |
| 789 | //obsolete- {0xd4db, 0}, {0x3d, 0}, {0x1111, 0x116e, 0x1175, 0x11af, 0x11c2, 0} |
| 790 | //obsolete- }; |
| 791 | //obsolete- |
| 792 | //obsolete- c->setDecomposition(Normalizer::DECOMP_COMPAT); |
| 793 | //obsolete- compareArray(*c, test2, UPRV_LENGTHOF(test2)); |
| 794 | |
| 795 | delete c; |
| 796 | } |
| 797 | |
| 798 | |
| 799 | // @bug 4124632 |
| 800 | // |
| 801 | // Collator::getCollationKey was hanging on certain character sequences |
| 802 | // |
| 803 | void CollationRegressionTest::Test4124632(/* char* par */) |
| 804 | { |
| 805 | UErrorCode status = U_ZERO_ERROR; |
| 806 | Collator *coll = NULL; |
| 807 | |
| 808 | coll = Collator::createInstance(Locale::getJapan(), status); |
| 809 | |
| 810 | if (coll == NULL || U_FAILURE(status)) |
| 811 | { |
| 812 | errln("Failed to create collator for Locale::JAPAN"); |
| 813 | delete coll; |
| 814 | return; |
| 815 | } |
| 816 | |
| 817 | static const UChar test[] = {0x41, 0x0308, 0x62, 0x63, 0}; |
| 818 | CollationKey key; |
| 819 | |
| 820 | coll->getCollationKey(test, key, status); |
| 821 | |
| 822 | if (key.isBogus() || U_FAILURE(status)) |
| 823 | { |
| 824 | errln("CollationKey creation failed."); |
| 825 | } |
| 826 | |
| 827 | delete coll; |
| 828 | } |
| 829 | |
| 830 | // @bug 4132736 |
| 831 | // |
| 832 | // sort order of french words with multiple accents has errors |
| 833 | // |
| 834 | void CollationRegressionTest::Test4132736(/* char* par */) |
| 835 | { |
| 836 | UErrorCode status = U_ZERO_ERROR; |
| 837 | |
| 838 | Collator *c = NULL; |
| 839 | |
| 840 | c = Collator::createInstance(Locale::getCanadaFrench(), status); |
| 841 | c->setStrength(Collator::TERTIARY); |
| 842 | |
| 843 | if (c == NULL || U_FAILURE(status)) |
| 844 | { |
| 845 | errln("Failed to create a collator for Locale::getCanadaFrench()"); |
| 846 | delete c; |
| 847 | return; |
| 848 | } |
| 849 | |
| 850 | static const UChar test1[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 851 | { |
| 852 | {0x65, 0x0300, 0x65, 0x0301, 0}, {0x3c, 0}, {0x65, 0x0301, 0x65, 0x0300, 0}, |
| 853 | {0x65, 0x0300, 0x0301, 0}, {0x3c, 0}, {0x65, 0x0301, 0x0300, 0} |
| 854 | }; |
| 855 | |
| 856 | compareArray(*c, test1, UPRV_LENGTHOF(test1)); |
| 857 | |
| 858 | delete c; |
| 859 | } |
| 860 | |
| 861 | // @bug 4133509 |
| 862 | // |
| 863 | // The sorting using java.text.CollationKey is not in the exact order |
| 864 | // |
| 865 | void CollationRegressionTest::Test4133509(/* char* par */) |
| 866 | { |
| 867 | static const UChar test1[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 868 | { |
| 869 | {0x45, 0x78, 0x63, 0x65, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0}, {0x3c, 0}, {0x45, 0x78, 0x63, 0x65, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x49, 0x6e, 0x69, 0x74, 0x69, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x45, 0x72, 0x72, 0x6f, 0x72, 0}, |
| 870 | {0x47, 0x72, 0x61, 0x70, 0x68, 0x69, 0x63, 0x73, 0}, {0x3c, 0}, {0x47, 0x72, 0x61, 0x70, 0x68, 0x69, 0x63, 0x73, 0x45, 0x6e, 0x76, 0x69, 0x72, 0x6f, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0}, |
| 871 | {0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0}, {0x3c, 0}, {0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0} |
| 872 | }; |
| 873 | |
| 874 | compareArray(*en_us, test1, UPRV_LENGTHOF(test1)); |
| 875 | } |
| 876 | |
| 877 | // @bug 4114077 |
| 878 | // |
| 879 | // Collation with decomposition off doesn't work for Europe |
| 880 | // |
| 881 | void CollationRegressionTest::Test4114077(/* char* par */) |
| 882 | { |
| 883 | // Ensure that we get the same results with decomposition off |
| 884 | // as we do with it on.... |
| 885 | |
| 886 | UErrorCode status = U_ZERO_ERROR; |
| 887 | RuleBasedCollator *c = en_us->clone(); |
| 888 | c->setStrength(Collator::TERTIARY); |
| 889 | |
| 890 | static const UChar test1[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 891 | { |
| 892 | {0x00C0, 0}, {0x3d, 0}, {0x41, 0x0300, 0}, // Should be equivalent |
| 893 | {0x70, 0x00ea, 0x63, 0x68, 0x65, 0}, {0x3e, 0}, {0x70, 0x00e9, 0x63, 0x68, 0x00e9, 0}, |
| 894 | {0x0204, 0}, {0x3d, 0}, {0x45, 0x030F, 0}, |
| 895 | {0x01fa, 0}, {0x3d, 0}, {0x41, 0x030a, 0x0301, 0}, // a-ring-acute -> a-ring, acute |
| 896 | // -> a, ring, acute |
| 897 | {0x41, 0x0300, 0x0316, 0}, {0x3c, 0}, {0x41, 0x0316, 0x0300, 0} // No reordering --> unequal |
| 898 | }; |
| 899 | |
| 900 | c->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, status); |
| 901 | compareArray(*c, test1, UPRV_LENGTHOF(test1)); |
| 902 | |
| 903 | static const UChar test2[][CollationRegressionTest::MAX_TOKEN_LEN] = |
| 904 | { |
| 905 | {0x41, 0x0300, 0x0316, 0}, {0x3d, 0}, {0x41, 0x0316, 0x0300, 0} // Reordering --> equal |
| 906 | }; |
| 907 | |
| 908 | c->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); |
| 909 | compareArray(*c, test2, UPRV_LENGTHOF(test2)); |
| 910 | |
| 911 | delete c; |
| 912 | } |
| 913 | |
| 914 | // @bug 4141640 |
| 915 | // |
| 916 | // Support for Swedish gone in 1.1.6 (Can't create Swedish collator) |
| 917 | // |
| 918 | void CollationRegressionTest::Test4141640(/* char* par */) |
| 919 | { |
| 920 | // |
| 921 | // Rather than just creating a Swedish collator, we might as well |
| 922 | // try to instantiate one for every locale available on the system |
| 923 | // in order to prevent this sort of bug from cropping up in the future |
| 924 | // |
| 925 | UErrorCode status = U_ZERO_ERROR; |
| 926 | int32_t i, localeCount; |
| 927 | const Locale *locales = Locale::getAvailableLocales(localeCount); |
| 928 | |
| 929 | for (i = 0; i < localeCount; i += 1) |
| 930 | { |
| 931 | Collator *c = NULL; |
| 932 | |
| 933 | status = U_ZERO_ERROR; |
| 934 | c = Collator::createInstance(locales[i], status); |
| 935 | |
| 936 | if (c == NULL || U_FAILURE(status)) |
| 937 | { |
| 938 | UnicodeString msg, localeName; |
| 939 | |
| 940 | msg += "Could not create collator for locale "; |
| 941 | msg += locales[i].getName(); |
| 942 | |
| 943 | errln(msg); |
| 944 | } |
| 945 | |
| 946 | delete c; |
| 947 | } |
| 948 | } |
| 949 | |
| 950 | // @bug 4139572 |
| 951 | // |
| 952 | // getCollationKey throws exception for spanish text |
| 953 | // Cannot reproduce this bug on 1.2, however it DOES fail on 1.1.6 |
| 954 | // |
| 955 | void CollationRegressionTest::Test4139572(/* char* par */) |
| 956 | { |
| 957 | // |
| 958 | // Code pasted straight from the bug report |
| 959 | // (and then translated to C++ ;-) |
| 960 | // |
| 961 | // create spanish locale and collator |
| 962 | UErrorCode status = U_ZERO_ERROR; |
| 963 | Locale l("es", "es"); |
| 964 | Collator *col = NULL; |
| 965 | |
| 966 | col = Collator::createInstance(l, status); |
| 967 | |
| 968 | if (col == NULL || U_FAILURE(status)) |
| 969 | { |
| 970 | errln("Failed to create a collator for es_es locale."); |
| 971 | delete col; |
| 972 | return; |
| 973 | } |
| 974 | |
| 975 | CollationKey key; |
| 976 | |
| 977 | // this spanish phrase kills it! |
| 978 | col->getCollationKey("Nombre De Objeto", key, status); |
| 979 | |
| 980 | if (key.isBogus() || U_FAILURE(status)) |
| 981 | { |
| 982 | errln("Error creating CollationKey for \"Nombre De Ojbeto\""); |
| 983 | } |
| 984 | |
| 985 | delete col; |
| 986 | } |
| 987 | |
| 988 | void CollationRegressionTest::Test4179216() { |
| 989 | // you can position a CollationElementIterator in the middle of |
| 990 | // a contracting character sequence, yielding a bogus collation |
| 991 | // element |
| 992 | IcuTestErrorCode errorCode(*this, "Test4179216"); |
| 993 | RuleBasedCollator coll(en_us->getRules() + " & C < ch , cH , Ch , CH < cat < crunchy", errorCode); |
| 994 | UnicodeString testText = "church church catcatcher runcrunchynchy"; |
| 995 | CollationElementIterator *iter = coll.createCollationElementIterator(testText); |
| 996 | |
| 997 | // test that the "ch" combination works properly |
| 998 | iter->setOffset(4, errorCode); |
| 999 | int32_t elt4 = CollationElementIterator::primaryOrder(iter->next(errorCode)); |
| 1000 | |
| 1001 | iter->reset(); |
| 1002 | int32_t elt0 = CollationElementIterator::primaryOrder(iter->next(errorCode)); |
| 1003 | |
| 1004 | iter->setOffset(5, errorCode); |
| 1005 | int32_t elt5 = CollationElementIterator::primaryOrder(iter->next(errorCode)); |
| 1006 | |
| 1007 | // Compares and prints only 16-bit primary weights. |
| 1008 | if (elt4 != elt0 || elt5 != elt0) { |
| 1009 | errln("The collation elements at positions 0 (0x%04x), " |
| 1010 | "4 (0x%04x), and 5 (0x%04x) don't match.", |
| 1011 | elt0, elt4, elt5); |
| 1012 | } |
| 1013 | |
| 1014 | // test that the "cat" combination works properly |
| 1015 | iter->setOffset(14, errorCode); |
| 1016 | int32_t elt14 = CollationElementIterator::primaryOrder(iter->next(errorCode)); |
| 1017 | |
| 1018 | iter->setOffset(15, errorCode); |
| 1019 | int32_t elt15 = CollationElementIterator::primaryOrder(iter->next(errorCode)); |
| 1020 | |
| 1021 | iter->setOffset(16, errorCode); |
| 1022 | int32_t elt16 = CollationElementIterator::primaryOrder(iter->next(errorCode)); |
| 1023 | |
| 1024 | iter->setOffset(17, errorCode); |
| 1025 | int32_t elt17 = CollationElementIterator::primaryOrder(iter->next(errorCode)); |
| 1026 | |
| 1027 | iter->setOffset(18, errorCode); |
| 1028 | int32_t elt18 = CollationElementIterator::primaryOrder(iter->next(errorCode)); |
| 1029 | |
| 1030 | iter->setOffset(19, errorCode); |
| 1031 | int32_t elt19 = CollationElementIterator::primaryOrder(iter->next(errorCode)); |
| 1032 | |
| 1033 | // Compares and prints only 16-bit primary weights. |
| 1034 | if (elt14 != elt15 || elt14 != elt16 || elt14 != elt17 |
| 1035 | || elt14 != elt18 || elt14 != elt19) { |
| 1036 | errln("\"cat\" elements don't match: elt14 = 0x%04x, " |
| 1037 | "elt15 = 0x%04x, elt16 = 0x%04x, elt17 = 0x%04x, " |
| 1038 | "elt18 = 0x%04x, elt19 = 0x%04x", |
| 1039 | elt14, elt15, elt16, elt17, elt18, elt19); |
| 1040 | } |
| 1041 | |
| 1042 | // now generate a complete list of the collation elements, |
| 1043 | // first using next() and then using setOffset(), and |
| 1044 | // make sure both interfaces return the same set of elements |
| 1045 | iter->reset(); |
| 1046 | |
| 1047 | int32_t elt = iter->next(errorCode); |
| 1048 | int32_t count = 0; |
| 1049 | while (elt != CollationElementIterator::NULLORDER) { |
| 1050 | ++count; |
| 1051 | elt = iter->next(errorCode); |
| 1052 | } |
| 1053 | |
| 1054 | LocalArray<UnicodeString> nextElements(new UnicodeString[count]); |
| 1055 | LocalArray<UnicodeString> setOffsetElements(new UnicodeString[count]); |
| 1056 | int32_t lastPos = 0; |
| 1057 | |
| 1058 | iter->reset(); |
| 1059 | elt = iter->next(errorCode); |
| 1060 | count = 0; |
| 1061 | while (elt != CollationElementIterator::NULLORDER) { |
| 1062 | nextElements[count++] = testText.tempSubStringBetween(lastPos, iter->getOffset()); |
| 1063 | lastPos = iter->getOffset(); |
| 1064 | elt = iter->next(errorCode); |
| 1065 | } |
| 1066 | int32_t nextElementsLength = count; |
| 1067 | count = 0; |
| 1068 | for (int32_t i = 0; i < testText.length(); ) { |
| 1069 | iter->setOffset(i, errorCode); |
| 1070 | lastPos = iter->getOffset(); |
| 1071 | elt = iter->next(errorCode); |
| 1072 | setOffsetElements[count++] = testText.tempSubStringBetween(lastPos, iter->getOffset()); |
| 1073 | i = iter->getOffset(); |
| 1074 | } |
| 1075 | for (int32_t i = 0; i < nextElementsLength; i++) { |
| 1076 | if (nextElements[i] == setOffsetElements[i]) { |
| 1077 | logln(nextElements[i]); |
| 1078 | } else { |
| 1079 | errln(UnicodeString("Error: next() yielded ") + nextElements[i] + |
| 1080 | ", but setOffset() yielded " + setOffsetElements[i]); |
| 1081 | } |
| 1082 | } |
| 1083 | delete iter; |
| 1084 | } |
| 1085 | |
| 1086 | // Ticket 7189 |
| 1087 | // |
| 1088 | // nextSortKeyPart incorrect for EO_S1 collation |
| 1089 | static int32_t calcKeyIncremental(UCollator *coll, const UChar* text, int32_t len, uint8_t *keyBuf, int32_t /*keyBufLen*/, UErrorCode& status) { |
| 1090 | UCharIterator uiter; |
| 1091 | uint32_t state[2] = { 0, 0 }; |
| 1092 | int32_t keyLen; |
| 1093 | int32_t count = 8; |
| 1094 | |
| 1095 | uiter_setString(&uiter, text, len); |
| 1096 | keyLen = 0; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1097 | while (true) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1098 | int32_t keyPartLen = ucol_nextSortKeyPart(coll, &uiter, state, &keyBuf[keyLen], count, &status); |
| 1099 | if (U_FAILURE(status)) { |
| 1100 | return -1; |
| 1101 | } |
| 1102 | if (keyPartLen == 0) { |
| 1103 | break; |
| 1104 | } |
| 1105 | keyLen += keyPartLen; |
| 1106 | } |
| 1107 | return keyLen; |
| 1108 | } |
| 1109 | |
| 1110 | void CollationRegressionTest::TestT7189() { |
| 1111 | UErrorCode status = U_ZERO_ERROR; |
| 1112 | UCollator *coll; |
| 1113 | uint32_t i; |
| 1114 | |
| 1115 | static const UChar text1[][CollationRegressionTest::MAX_TOKEN_LEN] = { |
| 1116 | // "Achter De Hoven" |
| 1117 | { 0x41, 0x63, 0x68, 0x74, 0x65, 0x72, 0x20, 0x44, 0x65, 0x20, 0x48, 0x6F, 0x76, 0x65, 0x6E, 0x00 }, |
| 1118 | // "ABC" |
| 1119 | { 0x41, 0x42, 0x43, 0x00 }, |
| 1120 | // "HELLO world!" |
| 1121 | { 0x48, 0x45, 0x4C, 0x4C, 0x4F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, 0x00 } |
| 1122 | }; |
| 1123 | |
| 1124 | static const UChar text2[][CollationRegressionTest::MAX_TOKEN_LEN] = { |
| 1125 | // "Achter de Hoven" |
| 1126 | { 0x41, 0x63, 0x68, 0x74, 0x65, 0x72, 0x20, 0x64, 0x65, 0x20, 0x48, 0x6F, 0x76, 0x65, 0x6E, 0x00 }, |
| 1127 | // "abc" |
| 1128 | { 0x61, 0x62, 0x63, 0x00 }, |
| 1129 | // "hello world!" |
| 1130 | { 0x68, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, 0x00 } |
| 1131 | }; |
| 1132 | |
| 1133 | // Open the collator |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1134 | coll = ucol_openFromShortString("EO_S1", false, NULL, &status); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1135 | if (U_FAILURE(status)) { |
| 1136 | errln("Failed to create a collator for short string EO_S1"); |
| 1137 | return; |
| 1138 | } |
| 1139 | |
| 1140 | for (i = 0; i < UPRV_LENGTHOF(text1); i++) { |
| 1141 | uint8_t key1[100], key2[100]; |
| 1142 | int32_t len1, len2; |
| 1143 | |
| 1144 | len1 = calcKeyIncremental(coll, text1[i], -1, key1, sizeof(key1), status); |
| 1145 | if (U_FAILURE(status)) { |
| 1146 | errln(UnicodeString("Failed to get a partial collation key for ") + text1[i]); |
| 1147 | break; |
| 1148 | } |
| 1149 | len2 = calcKeyIncremental(coll, text2[i], -1, key2, sizeof(key2), status); |
| 1150 | if (U_FAILURE(status)) { |
| 1151 | errln(UnicodeString("Failed to get a partial collation key for ") + text2[i]); |
| 1152 | break; |
| 1153 | } |
| 1154 | |
| 1155 | if (len1 == len2 && uprv_memcmp(key1, key2, len1) == 0) { |
| 1156 | errln(UnicodeString("Failed: Identical key\n") + " text1: " + text1[i] + "\n" + " text2: " + text2[i] + "\n" + " key : " + TestUtility::hex(key1, len1)); |
| 1157 | } else { |
| 1158 | logln(UnicodeString("Keys produced -\n") + " text1: " + text1[i] + "\n" + " key1 : " + TestUtility::hex(key1, len1) + "\n" + " text2: " + text2[i] + "\n" + " key2 : " |
| 1159 | + TestUtility::hex(key2, len2)); |
| 1160 | } |
| 1161 | } |
| 1162 | ucol_close(coll); |
| 1163 | } |
| 1164 | |
| 1165 | void CollationRegressionTest::TestCaseFirstCompression() { |
| 1166 | RuleBasedCollator *col = en_us->clone(); |
| 1167 | UErrorCode status = U_ZERO_ERROR; |
| 1168 | |
| 1169 | // default |
| 1170 | caseFirstCompressionSub(col, "default"); |
| 1171 | |
| 1172 | // Upper first |
| 1173 | col->setAttribute(UCOL_CASE_FIRST, UCOL_UPPER_FIRST, status); |
| 1174 | if (U_FAILURE(status)) { |
| 1175 | errln("Failed to set UCOL_UPPER_FIRST"); |
| 1176 | return; |
| 1177 | } |
| 1178 | caseFirstCompressionSub(col, "upper first"); |
| 1179 | |
| 1180 | // Lower first |
| 1181 | col->setAttribute(UCOL_CASE_FIRST, UCOL_LOWER_FIRST, status); |
| 1182 | if (U_FAILURE(status)) { |
| 1183 | errln("Failed to set UCOL_LOWER_FIRST"); |
| 1184 | return; |
| 1185 | } |
| 1186 | caseFirstCompressionSub(col, "lower first"); |
| 1187 | |
| 1188 | delete col; |
| 1189 | } |
| 1190 | |
| 1191 | void CollationRegressionTest::caseFirstCompressionSub(Collator *col, UnicodeString opt) { |
| 1192 | const int32_t maxLength = 50; |
| 1193 | |
| 1194 | UChar str1[maxLength]; |
| 1195 | UChar str2[maxLength]; |
| 1196 | |
| 1197 | CollationKey key1, key2; |
| 1198 | |
| 1199 | for (int32_t len = 1; len <= maxLength; len++) { |
| 1200 | int32_t i = 0; |
| 1201 | for (; i < len - 1; i++) { |
| 1202 | str1[i] = str2[i] = (UChar)0x61; // 'a' |
| 1203 | } |
| 1204 | str1[i] = (UChar)0x41; // 'A' |
| 1205 | str2[i] = (UChar)0x61; // 'a' |
| 1206 | |
| 1207 | UErrorCode status = U_ZERO_ERROR; |
| 1208 | col->getCollationKey(str1, len, key1, status); |
| 1209 | col->getCollationKey(str2, len, key2, status); |
| 1210 | |
| 1211 | UCollationResult cmpKey = key1.compareTo(key2, status); |
| 1212 | UCollationResult cmpCol = col->compare(str1, len, str2, len, status); |
| 1213 | |
| 1214 | if (U_FAILURE(status)) { |
| 1215 | errln("Error in caseFirstCompressionSub"); |
| 1216 | } else if (cmpKey != cmpCol) { |
| 1217 | errln((UnicodeString)"Inconsistent comparison(" + opt |
| 1218 | + "): str1=" + UnicodeString(str1, len) + ", str2=" + UnicodeString(str2, len) |
| 1219 | + ", cmpKey=" + cmpKey + ", cmpCol=" + cmpCol); |
| 1220 | } |
| 1221 | } |
| 1222 | } |
| 1223 | |
| 1224 | void CollationRegressionTest::TestTrailingComment() { |
| 1225 | // ICU ticket #8070: |
| 1226 | // Check that the rule parser handles a comment without terminating end-of-line. |
| 1227 | IcuTestErrorCode errorCode(*this, "TestTrailingComment"); |
| 1228 | RuleBasedCollator coll(UNICODE_STRING_SIMPLE("&c<b#comment1\n<a#comment2"), errorCode); |
| 1229 | UnicodeString a((UChar)0x61), b((UChar)0x62), c((UChar)0x63); |
| 1230 | assertTrue("c<b", coll.compare(c, b) < 0); |
| 1231 | assertTrue("b<a", coll.compare(b, a) < 0); |
| 1232 | } |
| 1233 | |
| 1234 | void CollationRegressionTest::TestBeforeWithTooStrongAfter() { |
| 1235 | // ICU ticket #9959: |
| 1236 | // Forbid rules with a before-reset followed by a stronger relation. |
| 1237 | IcuTestErrorCode errorCode(*this, "TestBeforeWithTooStrongAfter"); |
| 1238 | RuleBasedCollator before2(UNICODE_STRING_SIMPLE("&[before 2]x<<q<p"), errorCode); |
| 1239 | if(errorCode.isSuccess()) { |
| 1240 | errln("should forbid before-2-reset followed by primary relation"); |
| 1241 | } else { |
| 1242 | errorCode.reset(); |
| 1243 | } |
| 1244 | RuleBasedCollator before3(UNICODE_STRING_SIMPLE("&[before 3]x<<<q<<s<p"), errorCode); |
| 1245 | if(errorCode.isSuccess()) { |
| 1246 | errln("should forbid before-3-reset followed by primary or secondary relation"); |
| 1247 | } else { |
| 1248 | errorCode.reset(); |
| 1249 | } |
| 1250 | } |
| 1251 | |
| 1252 | void CollationRegressionTest::compareArray(Collator &c, |
| 1253 | const UChar tests[][CollationRegressionTest::MAX_TOKEN_LEN], |
| 1254 | int32_t testCount) |
| 1255 | { |
| 1256 | int32_t i; |
| 1257 | Collator::EComparisonResult expectedResult = Collator::EQUAL; |
| 1258 | |
| 1259 | for (i = 0; i < testCount; i += 3) |
| 1260 | { |
| 1261 | UnicodeString source(tests[i]); |
| 1262 | UnicodeString comparison(tests[i + 1]); |
| 1263 | UnicodeString target(tests[i + 2]); |
| 1264 | |
| 1265 | if (comparison == "<") |
| 1266 | { |
| 1267 | expectedResult = Collator::LESS; |
| 1268 | } |
| 1269 | else if (comparison == ">") |
| 1270 | { |
| 1271 | expectedResult = Collator::GREATER; |
| 1272 | } |
| 1273 | else if (comparison == "=") |
| 1274 | { |
| 1275 | expectedResult = Collator::EQUAL; |
| 1276 | } |
| 1277 | else |
| 1278 | { |
| 1279 | UnicodeString bogus1("Bogus comparison string \""); |
| 1280 | UnicodeString bogus2("\""); |
| 1281 | errln(bogus1 + comparison + bogus2); |
| 1282 | } |
| 1283 | |
| 1284 | Collator::EComparisonResult compareResult = c.compare(source, target); |
| 1285 | |
| 1286 | CollationKey sourceKey, targetKey; |
| 1287 | UErrorCode status = U_ZERO_ERROR; |
| 1288 | |
| 1289 | c.getCollationKey(source, sourceKey, status); |
| 1290 | |
| 1291 | if (U_FAILURE(status)) |
| 1292 | { |
| 1293 | errln("Couldn't get collationKey for source"); |
| 1294 | continue; |
| 1295 | } |
| 1296 | |
| 1297 | c.getCollationKey(target, targetKey, status); |
| 1298 | |
| 1299 | if (U_FAILURE(status)) |
| 1300 | { |
| 1301 | errln("Couldn't get collationKey for target"); |
| 1302 | continue; |
| 1303 | } |
| 1304 | |
| 1305 | Collator::EComparisonResult keyResult = sourceKey.compareTo(targetKey); |
| 1306 | |
| 1307 | reportCResult( source, target, sourceKey, targetKey, compareResult, keyResult, compareResult, expectedResult ); |
| 1308 | |
| 1309 | } |
| 1310 | } |
| 1311 | |
| 1312 | void CollationRegressionTest::assertEqual(CollationElementIterator &i1, CollationElementIterator &i2) |
| 1313 | { |
| 1314 | int32_t c1, c2, count = 0; |
| 1315 | UErrorCode status = U_ZERO_ERROR; |
| 1316 | |
| 1317 | do |
| 1318 | { |
| 1319 | c1 = i1.next(status); |
| 1320 | c2 = i2.next(status); |
| 1321 | |
| 1322 | if (c1 != c2) |
| 1323 | { |
| 1324 | UnicodeString msg, msg1(" "); |
| 1325 | |
| 1326 | msg += msg1 + count; |
| 1327 | msg += ": strength(0x"; |
| 1328 | appendHex(c1, 8, msg); |
| 1329 | msg += ") != strength(0x"; |
| 1330 | appendHex(c2, 8, msg); |
| 1331 | msg += ")"; |
| 1332 | |
| 1333 | errln(msg); |
| 1334 | break; |
| 1335 | } |
| 1336 | |
| 1337 | count += 1; |
| 1338 | } |
| 1339 | while (c1 != CollationElementIterator::NULLORDER); |
| 1340 | } |
| 1341 | |
| 1342 | void CollationRegressionTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* /* par */) |
| 1343 | { |
| 1344 | if (exec) |
| 1345 | { |
| 1346 | logln("Collation Regression Tests: "); |
| 1347 | } |
| 1348 | |
| 1349 | if(en_us == NULL) { |
| 1350 | dataerrln("Class collator not instantiated"); |
| 1351 | name = ""; |
| 1352 | return; |
| 1353 | } |
| 1354 | TESTCASE_AUTO_BEGIN; |
| 1355 | TESTCASE_AUTO(Test4048446); |
| 1356 | TESTCASE_AUTO(Test4051866); |
| 1357 | TESTCASE_AUTO(Test4053636); |
| 1358 | TESTCASE_AUTO(Test4054238); |
| 1359 | TESTCASE_AUTO(Test4054734); |
| 1360 | TESTCASE_AUTO(Test4054736); |
| 1361 | TESTCASE_AUTO(Test4058613); |
| 1362 | TESTCASE_AUTO(Test4059820); |
| 1363 | TESTCASE_AUTO(Test4060154); |
| 1364 | TESTCASE_AUTO(Test4062418); |
| 1365 | TESTCASE_AUTO(Test4065540); |
| 1366 | TESTCASE_AUTO(Test4066189); |
| 1367 | TESTCASE_AUTO(Test4066696); |
| 1368 | TESTCASE_AUTO(Test4076676); |
| 1369 | TESTCASE_AUTO(Test4078588); |
| 1370 | TESTCASE_AUTO(Test4079231); |
| 1371 | TESTCASE_AUTO(Test4081866); |
| 1372 | TESTCASE_AUTO(Test4087241); |
| 1373 | TESTCASE_AUTO(Test4087243); |
| 1374 | TESTCASE_AUTO(Test4092260); |
| 1375 | TESTCASE_AUTO(Test4095316); |
| 1376 | TESTCASE_AUTO(Test4101940); |
| 1377 | TESTCASE_AUTO(Test4103436); |
| 1378 | TESTCASE_AUTO(Test4114076); |
| 1379 | TESTCASE_AUTO(Test4114077); |
| 1380 | TESTCASE_AUTO(Test4124632); |
| 1381 | TESTCASE_AUTO(Test4132736); |
| 1382 | TESTCASE_AUTO(Test4133509); |
| 1383 | TESTCASE_AUTO(Test4139572); |
| 1384 | TESTCASE_AUTO(Test4141640); |
| 1385 | TESTCASE_AUTO(Test4179216); |
| 1386 | TESTCASE_AUTO(TestT7189); |
| 1387 | TESTCASE_AUTO(TestCaseFirstCompression); |
| 1388 | TESTCASE_AUTO(TestTrailingComment); |
| 1389 | TESTCASE_AUTO(TestBeforeWithTooStrongAfter); |
| 1390 | TESTCASE_AUTO_END; |
| 1391 | } |
| 1392 | |
| 1393 | #endif /* #if !UCONFIG_NO_COLLATION */ |