Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | /******************************************************************** |
| 4 | * COPYRIGHT: |
| 5 | * Copyright (c) 1997-2016, International Business Machines Corporation and |
| 6 | * others. All Rights Reserved. |
| 7 | ********************************************************************/ |
| 8 | |
| 9 | #include "unicode/utypes.h" |
| 10 | |
| 11 | #if !UCONFIG_NO_NORMALIZATION |
| 12 | |
| 13 | #include "unicode/uchar.h" |
| 14 | #include "unicode/errorcode.h" |
| 15 | #include "unicode/normlzr.h" |
| 16 | #include "unicode/stringoptions.h" |
| 17 | #include "unicode/stringpiece.h" |
| 18 | #include "unicode/uniset.h" |
| 19 | #include "unicode/usetiter.h" |
| 20 | #include "unicode/schriter.h" |
| 21 | #include "unicode/utf16.h" |
| 22 | #include "cmemory.h" |
| 23 | #include "cstring.h" |
| 24 | #include "normalizer2impl.h" |
| 25 | #include "testutil.h" |
| 26 | #include "tstnorm.h" |
| 27 | |
| 28 | #define ARRAY_LENGTH(array) UPRV_LENGTHOF(array) |
| 29 | |
| 30 | void BasicNormalizerTest::runIndexedTest(int32_t index, UBool exec, |
| 31 | const char* &name, char* /*par*/) { |
| 32 | if(exec) { |
| 33 | logln("TestSuite BasicNormalizerTest: "); |
| 34 | } |
| 35 | TESTCASE_AUTO_BEGIN; |
| 36 | TESTCASE_AUTO(TestDecomp); |
| 37 | TESTCASE_AUTO(TestCompatDecomp); |
| 38 | TESTCASE_AUTO(TestCanonCompose); |
| 39 | TESTCASE_AUTO(TestCompatCompose); |
| 40 | TESTCASE_AUTO(TestPrevious); |
| 41 | TESTCASE_AUTO(TestHangulDecomp); |
| 42 | TESTCASE_AUTO(TestHangulCompose); |
| 43 | TESTCASE_AUTO(TestTibetan); |
| 44 | TESTCASE_AUTO(TestCompositionExclusion); |
| 45 | TESTCASE_AUTO(TestZeroIndex); |
| 46 | TESTCASE_AUTO(TestVerisign); |
| 47 | TESTCASE_AUTO(TestPreviousNext); |
| 48 | TESTCASE_AUTO(TestNormalizerAPI); |
| 49 | TESTCASE_AUTO(TestConcatenate); |
| 50 | TESTCASE_AUTO(FindFoldFCDExceptions); |
| 51 | TESTCASE_AUTO(TestCompare); |
| 52 | TESTCASE_AUTO(TestSkippable); |
| 53 | #if !UCONFIG_NO_FILE_IO && !UCONFIG_NO_LEGACY_CONVERSION |
| 54 | TESTCASE_AUTO(TestCustomComp); |
| 55 | TESTCASE_AUTO(TestCustomFCC); |
| 56 | #endif |
| 57 | TESTCASE_AUTO(TestFilteredNormalizer2Coverage); |
| 58 | TESTCASE_AUTO(TestComposeUTF8WithEdits); |
| 59 | TESTCASE_AUTO(TestDecomposeUTF8WithEdits); |
| 60 | TESTCASE_AUTO(TestLowMappingToEmpty_D); |
| 61 | TESTCASE_AUTO(TestLowMappingToEmpty_FCD); |
| 62 | TESTCASE_AUTO(TestNormalizeIllFormedText); |
| 63 | TESTCASE_AUTO(TestComposeJamoTBase); |
| 64 | TESTCASE_AUTO(TestComposeBoundaryAfter); |
| 65 | TESTCASE_AUTO_END; |
| 66 | } |
| 67 | |
| 68 | /** |
| 69 | * Convert Java-style strings with \u Unicode escapes into UnicodeString objects |
| 70 | */ |
| 71 | static UnicodeString str(const char *input) |
| 72 | { |
| 73 | UnicodeString str(input, ""); // Invariant conversion |
| 74 | return str.unescape(); |
| 75 | } |
| 76 | |
| 77 | |
| 78 | BasicNormalizerTest::BasicNormalizerTest() |
| 79 | { |
| 80 | // canonTest |
| 81 | // Input Decomposed Composed |
| 82 | |
| 83 | canonTests[0][0] = str("cat"); canonTests[0][1] = str("cat"); canonTests[0][2] = str("cat"); |
| 84 | |
| 85 | canonTests[1][0] = str("\\u00e0ardvark"); canonTests[1][1] = str("a\\u0300ardvark"); canonTests[1][2] = str("\\u00e0ardvark"); |
| 86 | |
| 87 | canonTests[2][0] = str("\\u1e0a"); canonTests[2][1] = str("D\\u0307"); canonTests[2][2] = str("\\u1e0a"); // D-dot_above |
| 88 | |
| 89 | canonTests[3][0] = str("D\\u0307"); canonTests[3][1] = str("D\\u0307"); canonTests[3][2] = str("\\u1e0a"); // D dot_above |
| 90 | |
| 91 | canonTests[4][0] = str("\\u1e0c\\u0307"); canonTests[4][1] = str("D\\u0323\\u0307"); canonTests[4][2] = str("\\u1e0c\\u0307"); // D-dot_below dot_above |
| 92 | |
| 93 | canonTests[5][0] = str("\\u1e0a\\u0323"); canonTests[5][1] = str("D\\u0323\\u0307"); canonTests[5][2] = str("\\u1e0c\\u0307"); // D-dot_above dot_below |
| 94 | |
| 95 | canonTests[6][0] = str("D\\u0307\\u0323"); canonTests[6][1] = str("D\\u0323\\u0307"); canonTests[6][2] = str("\\u1e0c\\u0307"); // D dot_below dot_above |
| 96 | |
| 97 | canonTests[7][0] = str("\\u1e10\\u0307\\u0323"); canonTests[7][1] = str("D\\u0327\\u0323\\u0307"); canonTests[7][2] = str("\\u1e10\\u0323\\u0307"); // D dot_below cedilla dot_above |
| 98 | |
| 99 | canonTests[8][0] = str("D\\u0307\\u0328\\u0323"); canonTests[8][1] = str("D\\u0328\\u0323\\u0307"); canonTests[8][2] = str("\\u1e0c\\u0328\\u0307"); // D dot_above ogonek dot_below |
| 100 | |
| 101 | canonTests[9][0] = str("\\u1E14"); canonTests[9][1] = str("E\\u0304\\u0300"); canonTests[9][2] = str("\\u1E14"); // E-macron-grave |
| 102 | |
| 103 | canonTests[10][0] = str("\\u0112\\u0300"); canonTests[10][1] = str("E\\u0304\\u0300"); canonTests[10][2] = str("\\u1E14"); // E-macron + grave |
| 104 | |
| 105 | canonTests[11][0] = str("\\u00c8\\u0304"); canonTests[11][1] = str("E\\u0300\\u0304"); canonTests[11][2] = str("\\u00c8\\u0304"); // E-grave + macron |
| 106 | |
| 107 | canonTests[12][0] = str("\\u212b"); canonTests[12][1] = str("A\\u030a"); canonTests[12][2] = str("\\u00c5"); // angstrom_sign |
| 108 | |
| 109 | canonTests[13][0] = str("\\u00c5"); canonTests[13][1] = str("A\\u030a"); canonTests[13][2] = str("\\u00c5"); // A-ring |
| 110 | |
| 111 | canonTests[14][0] = str("\\u00C4ffin"); canonTests[14][1] = str("A\\u0308ffin"); canonTests[14][2] = str("\\u00C4ffin"); |
| 112 | |
| 113 | canonTests[15][0] = str("\\u00C4\\uFB03n"); canonTests[15][1] = str("A\\u0308\\uFB03n"); canonTests[15][2] = str("\\u00C4\\uFB03n"); |
| 114 | |
| 115 | canonTests[16][0] = str("Henry IV"); canonTests[16][1] = str("Henry IV"); canonTests[16][2] = str("Henry IV"); |
| 116 | |
| 117 | canonTests[17][0] = str("Henry \\u2163"); canonTests[17][1] = str("Henry \\u2163"); canonTests[17][2] = str("Henry \\u2163"); |
| 118 | |
| 119 | canonTests[18][0] = str("\\u30AC"); canonTests[18][1] = str("\\u30AB\\u3099"); canonTests[18][2] = str("\\u30AC"); // ga (Katakana) |
| 120 | |
| 121 | canonTests[19][0] = str("\\u30AB\\u3099"); canonTests[19][1] = str("\\u30AB\\u3099"); canonTests[19][2] = str("\\u30AC"); // ka + ten |
| 122 | |
| 123 | canonTests[20][0] = str("\\uFF76\\uFF9E"); canonTests[20][1] = str("\\uFF76\\uFF9E"); canonTests[20][2] = str("\\uFF76\\uFF9E"); // hw_ka + hw_ten |
| 124 | |
| 125 | canonTests[21][0] = str("\\u30AB\\uFF9E"); canonTests[21][1] = str("\\u30AB\\uFF9E"); canonTests[21][2] = str("\\u30AB\\uFF9E"); // ka + hw_ten |
| 126 | |
| 127 | canonTests[22][0] = str("\\uFF76\\u3099"); canonTests[22][1] = str("\\uFF76\\u3099"); canonTests[22][2] = str("\\uFF76\\u3099"); // hw_ka + ten |
| 128 | |
| 129 | canonTests[23][0] = str("A\\u0300\\u0316"); canonTests[23][1] = str("A\\u0316\\u0300"); canonTests[23][2] = str("\\u00C0\\u0316"); |
| 130 | |
| 131 | /* compatTest */ |
| 132 | // Input Decomposed Composed |
| 133 | compatTests[0][0] = str("cat"); compatTests[0][1] = str("cat"); compatTests[0][2] = str("cat") ; |
| 134 | |
| 135 | compatTests[1][0] = str("\\uFB4f"); compatTests[1][1] = str("\\u05D0\\u05DC"); compatTests[1][2] = str("\\u05D0\\u05DC"); // Alef-Lamed vs. Alef, Lamed |
| 136 | |
| 137 | compatTests[2][0] = str("\\u00C4ffin"); compatTests[2][1] = str("A\\u0308ffin"); compatTests[2][2] = str("\\u00C4ffin") ; |
| 138 | |
| 139 | compatTests[3][0] = str("\\u00C4\\uFB03n"); compatTests[3][1] = str("A\\u0308ffin"); compatTests[3][2] = str("\\u00C4ffin") ; // ffi ligature -> f + f + i |
| 140 | |
| 141 | compatTests[4][0] = str("Henry IV"); compatTests[4][1] = str("Henry IV"); compatTests[4][2] = str("Henry IV") ; |
| 142 | |
| 143 | compatTests[5][0] = str("Henry \\u2163"); compatTests[5][1] = str("Henry IV"); compatTests[5][2] = str("Henry IV") ; |
| 144 | |
| 145 | compatTests[6][0] = str("\\u30AC"); compatTests[6][1] = str("\\u30AB\\u3099"); compatTests[6][2] = str("\\u30AC") ; // ga (Katakana) |
| 146 | |
| 147 | compatTests[7][0] = str("\\u30AB\\u3099"); compatTests[7][1] = str("\\u30AB\\u3099"); compatTests[7][2] = str("\\u30AC") ; // ka + ten |
| 148 | |
| 149 | compatTests[8][0] = str("\\uFF76\\u3099"); compatTests[8][1] = str("\\u30AB\\u3099"); compatTests[8][2] = str("\\u30AC") ; // hw_ka + ten |
| 150 | |
| 151 | /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later */ |
| 152 | compatTests[9][0] = str("\\uFF76\\uFF9E"); compatTests[9][1] = str("\\u30AB\\u3099"); compatTests[9][2] = str("\\u30AC") ; // hw_ka + hw_ten |
| 153 | |
| 154 | compatTests[10][0] = str("\\u30AB\\uFF9E"); compatTests[10][1] = str("\\u30AB\\u3099"); compatTests[10][2] = str("\\u30AC") ; // ka + hw_ten |
| 155 | |
| 156 | /* Hangul Canonical */ |
| 157 | // Input Decomposed Composed |
| 158 | hangulCanon[0][0] = str("\\ud4db"); hangulCanon[0][1] = str("\\u1111\\u1171\\u11b6"); hangulCanon[0][2] = str("\\ud4db") ; |
| 159 | |
| 160 | hangulCanon[1][0] = str("\\u1111\\u1171\\u11b6"), hangulCanon[1][1] = str("\\u1111\\u1171\\u11b6"), hangulCanon[1][2] = str("\\ud4db"); |
| 161 | } |
| 162 | |
| 163 | BasicNormalizerTest::~BasicNormalizerTest() |
| 164 | { |
| 165 | } |
| 166 | |
| 167 | void BasicNormalizerTest::TestPrevious() |
| 168 | { |
| 169 | Normalizer* norm = new Normalizer("", UNORM_NFD); |
| 170 | |
| 171 | logln("testing decomp..."); |
| 172 | uint32_t i; |
| 173 | for (i = 0; i < ARRAY_LENGTH(canonTests); i++) { |
| 174 | backAndForth(norm, canonTests[i][0]); |
| 175 | } |
| 176 | |
| 177 | logln("testing compose..."); |
| 178 | norm->setMode(UNORM_NFC); |
| 179 | for (i = 0; i < ARRAY_LENGTH(canonTests); i++) { |
| 180 | backAndForth(norm, canonTests[i][0]); |
| 181 | } |
| 182 | |
| 183 | delete norm; |
| 184 | } |
| 185 | |
| 186 | void BasicNormalizerTest::TestDecomp() |
| 187 | { |
| 188 | Normalizer* norm = new Normalizer("", UNORM_NFD); |
| 189 | iterateTest(norm, canonTests, ARRAY_LENGTH(canonTests), 1); |
| 190 | staticTest(UNORM_NFD, 0, canonTests, ARRAY_LENGTH(canonTests), 1); |
| 191 | delete norm; |
| 192 | } |
| 193 | |
| 194 | void BasicNormalizerTest::TestCompatDecomp() |
| 195 | { |
| 196 | Normalizer* norm = new Normalizer("", UNORM_NFKD); |
| 197 | iterateTest(norm, compatTests, ARRAY_LENGTH(compatTests), 1); |
| 198 | |
| 199 | staticTest(UNORM_NFKD, 0, |
| 200 | compatTests, ARRAY_LENGTH(compatTests), 1); |
| 201 | delete norm; |
| 202 | } |
| 203 | |
| 204 | void BasicNormalizerTest::TestCanonCompose() |
| 205 | { |
| 206 | Normalizer* norm = new Normalizer("", UNORM_NFC); |
| 207 | iterateTest(norm, canonTests, ARRAY_LENGTH(canonTests), 2); |
| 208 | |
| 209 | staticTest(UNORM_NFC, 0, canonTests, |
| 210 | ARRAY_LENGTH(canonTests), 2); |
| 211 | delete norm; |
| 212 | } |
| 213 | |
| 214 | void BasicNormalizerTest::TestCompatCompose() |
| 215 | { |
| 216 | Normalizer* norm = new Normalizer("", UNORM_NFKC); |
| 217 | iterateTest(norm, compatTests, ARRAY_LENGTH(compatTests), 2); |
| 218 | |
| 219 | staticTest(UNORM_NFKC, 0, |
| 220 | compatTests, ARRAY_LENGTH(compatTests), 2); |
| 221 | delete norm; |
| 222 | } |
| 223 | |
| 224 | |
| 225 | //------------------------------------------------------------------------------- |
| 226 | |
| 227 | void BasicNormalizerTest::TestHangulCompose() |
| 228 | { |
| 229 | // Make sure that the static composition methods work |
| 230 | logln("Canonical composition..."); |
| 231 | staticTest(UNORM_NFC, 0, hangulCanon, ARRAY_LENGTH(hangulCanon), 2); |
| 232 | logln("Compatibility composition..."); |
| 233 | |
| 234 | // Now try iterative composition.... |
| 235 | logln("Static composition..."); |
| 236 | Normalizer* norm = new Normalizer("", UNORM_NFC); |
| 237 | iterateTest(norm, hangulCanon, ARRAY_LENGTH(hangulCanon), 2); |
| 238 | norm->setMode(UNORM_NFKC); |
| 239 | |
| 240 | // And finally, make sure you can do it in reverse too |
| 241 | logln("Reverse iteration..."); |
| 242 | norm->setMode(UNORM_NFC); |
| 243 | for (uint32_t i = 0; i < ARRAY_LENGTH(hangulCanon); i++) { |
| 244 | backAndForth(norm, hangulCanon[i][0]); |
| 245 | } |
| 246 | delete norm; |
| 247 | } |
| 248 | |
| 249 | void BasicNormalizerTest::TestHangulDecomp() |
| 250 | { |
| 251 | // Make sure that the static decomposition methods work |
| 252 | logln("Canonical decomposition..."); |
| 253 | staticTest(UNORM_NFD, 0, hangulCanon, ARRAY_LENGTH(hangulCanon), 1); |
| 254 | logln("Compatibility decomposition..."); |
| 255 | |
| 256 | // Now the iterative decomposition methods... |
| 257 | logln("Iterative decomposition..."); |
| 258 | Normalizer* norm = new Normalizer("", UNORM_NFD); |
| 259 | iterateTest(norm, hangulCanon, ARRAY_LENGTH(hangulCanon), 1); |
| 260 | norm->setMode(UNORM_NFKD); |
| 261 | |
| 262 | // And finally, make sure you can do it in reverse too |
| 263 | logln("Reverse iteration..."); |
| 264 | norm->setMode(UNORM_NFD); |
| 265 | for (uint32_t i = 0; i < ARRAY_LENGTH(hangulCanon); i++) { |
| 266 | backAndForth(norm, hangulCanon[i][0]); |
| 267 | } |
| 268 | delete norm; |
| 269 | } |
| 270 | |
| 271 | /** |
| 272 | * The Tibetan vowel sign AA, 0f71, was messed up prior to Unicode version 2.1.9. |
| 273 | */ |
| 274 | void BasicNormalizerTest::TestTibetan(void) { |
| 275 | UnicodeString decomp[1][3]; |
| 276 | decomp[0][0] = str("\\u0f77"); |
| 277 | decomp[0][1] = str("\\u0f77"); |
| 278 | decomp[0][2] = str("\\u0fb2\\u0f71\\u0f80"); |
| 279 | |
| 280 | UnicodeString compose[1][3]; |
| 281 | compose[0][0] = str("\\u0fb2\\u0f71\\u0f80"); |
| 282 | compose[0][1] = str("\\u0fb2\\u0f71\\u0f80"); |
| 283 | compose[0][2] = str("\\u0fb2\\u0f71\\u0f80"); |
| 284 | |
| 285 | staticTest(UNORM_NFD, 0, decomp, ARRAY_LENGTH(decomp), 1); |
| 286 | staticTest(UNORM_NFKD, 0, decomp, ARRAY_LENGTH(decomp), 2); |
| 287 | staticTest(UNORM_NFC, 0, compose, ARRAY_LENGTH(compose), 1); |
| 288 | staticTest(UNORM_NFKC, 0, compose, ARRAY_LENGTH(compose), 2); |
| 289 | } |
| 290 | |
| 291 | /** |
| 292 | * Make sure characters in the CompositionExclusion.txt list do not get |
| 293 | * composed to. |
| 294 | */ |
| 295 | void BasicNormalizerTest::TestCompositionExclusion(void) { |
| 296 | // This list is generated from CompositionExclusion.txt. |
| 297 | // Update whenever the normalizer tables are updated. Note |
| 298 | // that we test all characters listed, even those that can be |
| 299 | // derived from the Unicode DB and are therefore commented |
| 300 | // out. |
| 301 | // ### TODO read composition exclusion from source/data/unidata file |
| 302 | // and test against that |
| 303 | UnicodeString EXCLUDED = str( |
| 304 | "\\u0340\\u0341\\u0343\\u0344\\u0374\\u037E\\u0387\\u0958" |
| 305 | "\\u0959\\u095A\\u095B\\u095C\\u095D\\u095E\\u095F\\u09DC" |
| 306 | "\\u09DD\\u09DF\\u0A33\\u0A36\\u0A59\\u0A5A\\u0A5B\\u0A5E" |
| 307 | "\\u0B5C\\u0B5D\\u0F43\\u0F4D\\u0F52\\u0F57\\u0F5C\\u0F69" |
| 308 | "\\u0F73\\u0F75\\u0F76\\u0F78\\u0F81\\u0F93\\u0F9D\\u0FA2" |
| 309 | "\\u0FA7\\u0FAC\\u0FB9\\u1F71\\u1F73\\u1F75\\u1F77\\u1F79" |
| 310 | "\\u1F7B\\u1F7D\\u1FBB\\u1FBE\\u1FC9\\u1FCB\\u1FD3\\u1FDB" |
| 311 | "\\u1FE3\\u1FEB\\u1FEE\\u1FEF\\u1FF9\\u1FFB\\u1FFD\\u2000" |
| 312 | "\\u2001\\u2126\\u212A\\u212B\\u2329\\u232A\\uF900\\uFA10" |
| 313 | "\\uFA12\\uFA15\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A\\uFB1F" |
| 314 | "\\uFB2A\\uFB2B\\uFB2C\\uFB2D\\uFB2E\\uFB2F\\uFB30\\uFB31" |
| 315 | "\\uFB32\\uFB33\\uFB34\\uFB35\\uFB36\\uFB38\\uFB39\\uFB3A" |
| 316 | "\\uFB3B\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46" |
| 317 | "\\uFB47\\uFB48\\uFB49\\uFB4A\\uFB4B\\uFB4C\\uFB4D\\uFB4E" |
| 318 | ); |
| 319 | UErrorCode status = U_ZERO_ERROR; |
| 320 | for (int32_t i=0; i<EXCLUDED.length(); ++i) { |
| 321 | UnicodeString a(EXCLUDED.charAt(i)); |
| 322 | UnicodeString b; |
| 323 | UnicodeString c; |
| 324 | Normalizer::normalize(a, UNORM_NFKD, 0, b, status); |
| 325 | Normalizer::normalize(b, UNORM_NFC, 0, c, status); |
| 326 | if (c == a) { |
| 327 | errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " + |
| 328 | hex(b) + " x COMPOSE => " + |
| 329 | hex(c)); |
| 330 | } else if (verbose) { |
| 331 | logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " + |
| 332 | hex(b) + " x COMPOSE => " + |
| 333 | hex(c)); |
| 334 | } |
| 335 | } |
| 336 | } |
| 337 | |
| 338 | /** |
| 339 | * Test for a problem that showed up just before ICU 1.6 release |
| 340 | * having to do with combining characters with an index of zero. |
| 341 | * Such characters do not participate in any canonical |
| 342 | * decompositions. However, having an index of zero means that |
| 343 | * they all share one typeMask[] entry, that is, they all have to |
| 344 | * map to the same canonical class, which is not the case, in |
| 345 | * reality. |
| 346 | */ |
| 347 | void BasicNormalizerTest::TestZeroIndex(void) { |
| 348 | const char* DATA[] = { |
| 349 | // Expect col1 x COMPOSE_COMPAT => col2 |
| 350 | // Expect col2 x DECOMP => col3 |
| 351 | "A\\u0316\\u0300", "\\u00C0\\u0316", "A\\u0316\\u0300", |
| 352 | "A\\u0300\\u0316", "\\u00C0\\u0316", "A\\u0316\\u0300", |
| 353 | "A\\u0327\\u0300", "\\u00C0\\u0327", "A\\u0327\\u0300", |
| 354 | "c\\u0321\\u0327", "c\\u0321\\u0327", "c\\u0321\\u0327", |
| 355 | "c\\u0327\\u0321", "\\u00E7\\u0321", "c\\u0327\\u0321", |
| 356 | }; |
| 357 | int32_t DATA_length = UPRV_LENGTHOF(DATA); |
| 358 | |
| 359 | for (int32_t i=0; i<DATA_length; i+=3) { |
| 360 | UErrorCode status = U_ZERO_ERROR; |
| 361 | UnicodeString a(DATA[i], ""); |
| 362 | a = a.unescape(); |
| 363 | UnicodeString b; |
| 364 | Normalizer::normalize(a, UNORM_NFKC, 0, b, status); |
| 365 | if (U_FAILURE(status)) { |
| 366 | dataerrln("Error calling normalize UNORM_NFKC: %s", u_errorName(status)); |
| 367 | } else { |
| 368 | UnicodeString exp(DATA[i+1], ""); |
| 369 | exp = exp.unescape(); |
| 370 | if (b == exp) { |
| 371 | logln((UnicodeString)"Ok: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b)); |
| 372 | } else { |
| 373 | errln((UnicodeString)"FAIL: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b) + |
| 374 | ", expect " + hex(exp)); |
| 375 | } |
| 376 | } |
| 377 | Normalizer::normalize(b, UNORM_NFD, 0, a, status); |
| 378 | if (U_FAILURE(status)) { |
| 379 | dataerrln("Error calling normalize UNORM_NFD: %s", u_errorName(status)); |
| 380 | } else { |
| 381 | UnicodeString exp = UnicodeString(DATA[i+2], "").unescape(); |
| 382 | if (a == exp) { |
| 383 | logln((UnicodeString)"Ok: " + hex(b) + " x DECOMP => " + hex(a)); |
| 384 | } else { |
| 385 | errln((UnicodeString)"FAIL: " + hex(b) + " x DECOMP => " + hex(a) + |
| 386 | ", expect " + hex(exp)); |
| 387 | } |
| 388 | } |
| 389 | } |
| 390 | } |
| 391 | |
| 392 | /** |
| 393 | * Run a few specific cases that are failing for Verisign. |
| 394 | */ |
| 395 | void BasicNormalizerTest::TestVerisign(void) { |
| 396 | /* |
| 397 | > Their input: |
| 398 | > 05B8 05B9 05B1 0591 05C3 05B0 05AC 059F |
| 399 | > Their output (supposedly from ICU): |
| 400 | > 05B8 05B1 05B9 0591 05C3 05B0 05AC 059F |
| 401 | > My output from charlint: |
| 402 | > 05B1 05B8 05B9 0591 05C3 05B0 05AC 059F |
| 403 | |
| 404 | 05B8 05B9 05B1 0591 05C3 05B0 05AC 059F => 05B1 05B8 05B9 0591 05C3 05B0 |
| 405 | 05AC 059F |
| 406 | |
| 407 | U+05B8 18 E HEBREW POINT QAMATS |
| 408 | U+05B9 19 F HEBREW POINT HOLAM |
| 409 | U+05B1 11 HEBREW POINT HATAF SEGOL |
| 410 | U+0591 220 HEBREW ACCENT ETNAHTA |
| 411 | U+05C3 0 HEBREW PUNCTUATION SOF PASUQ |
| 412 | U+05B0 10 HEBREW POINT SHEVA |
| 413 | U+05AC 230 HEBREW ACCENT ILUY |
| 414 | U+059F 230 HEBREW ACCENT QARNEY PARA |
| 415 | |
| 416 | U+05B1 11 HEBREW POINT HATAF SEGOL |
| 417 | U+05B8 18 HEBREW POINT QAMATS |
| 418 | U+05B9 19 HEBREW POINT HOLAM |
| 419 | U+0591 220 HEBREW ACCENT ETNAHTA |
| 420 | U+05C3 0 HEBREW PUNCTUATION SOF PASUQ |
| 421 | U+05B0 10 HEBREW POINT SHEVA |
| 422 | U+05AC 230 HEBREW ACCENT ILUY |
| 423 | U+059F 230 HEBREW ACCENT QARNEY PARA |
| 424 | |
| 425 | Wrong result: |
| 426 | U+05B8 18 HEBREW POINT QAMATS |
| 427 | U+05B1 11 HEBREW POINT HATAF SEGOL |
| 428 | U+05B9 19 HEBREW POINT HOLAM |
| 429 | U+0591 220 HEBREW ACCENT ETNAHTA |
| 430 | U+05C3 0 HEBREW PUNCTUATION SOF PASUQ |
| 431 | U+05B0 10 HEBREW POINT SHEVA |
| 432 | U+05AC 230 HEBREW ACCENT ILUY |
| 433 | U+059F 230 HEBREW ACCENT QARNEY PARA |
| 434 | |
| 435 | |
| 436 | > Their input: |
| 437 | >0592 05B7 05BC 05A5 05B0 05C0 05C4 05AD |
| 438 | >Their output (supposedly from ICU): |
| 439 | >0592 05B0 05B7 05BC 05A5 05C0 05AD 05C4 |
| 440 | >My output from charlint: |
| 441 | >05B0 05B7 05BC 05A5 0592 05C0 05AD 05C4 |
| 442 | |
| 443 | 0592 05B7 05BC 05A5 05B0 05C0 05C4 05AD => 05B0 05B7 05BC 05A5 0592 05C0 |
| 444 | 05AD 05C4 |
| 445 | |
| 446 | U+0592 230 HEBREW ACCENT SEGOL |
| 447 | U+05B7 17 HEBREW POINT PATAH |
| 448 | U+05BC 21 HEBREW POINT DAGESH OR MAPIQ |
| 449 | U+05A5 220 HEBREW ACCENT MERKHA |
| 450 | U+05B0 10 HEBREW POINT SHEVA |
| 451 | U+05C0 0 HEBREW PUNCTUATION PASEQ |
| 452 | U+05C4 230 HEBREW MARK UPPER DOT |
| 453 | U+05AD 222 HEBREW ACCENT DEHI |
| 454 | |
| 455 | U+05B0 10 HEBREW POINT SHEVA |
| 456 | U+05B7 17 HEBREW POINT PATAH |
| 457 | U+05BC 21 HEBREW POINT DAGESH OR MAPIQ |
| 458 | U+05A5 220 HEBREW ACCENT MERKHA |
| 459 | U+0592 230 HEBREW ACCENT SEGOL |
| 460 | U+05C0 0 HEBREW PUNCTUATION PASEQ |
| 461 | U+05AD 222 HEBREW ACCENT DEHI |
| 462 | U+05C4 230 HEBREW MARK UPPER DOT |
| 463 | |
| 464 | Wrong result: |
| 465 | U+0592 230 HEBREW ACCENT SEGOL |
| 466 | U+05B0 10 HEBREW POINT SHEVA |
| 467 | U+05B7 17 HEBREW POINT PATAH |
| 468 | U+05BC 21 HEBREW POINT DAGESH OR MAPIQ |
| 469 | U+05A5 220 HEBREW ACCENT MERKHA |
| 470 | U+05C0 0 HEBREW PUNCTUATION PASEQ |
| 471 | U+05AD 222 HEBREW ACCENT DEHI |
| 472 | U+05C4 230 HEBREW MARK UPPER DOT |
| 473 | */ |
| 474 | UnicodeString data[2][3]; |
| 475 | data[0][0] = str("\\u05B8\\u05B9\\u05B1\\u0591\\u05C3\\u05B0\\u05AC\\u059F"); |
| 476 | data[0][1] = str("\\u05B1\\u05B8\\u05B9\\u0591\\u05C3\\u05B0\\u05AC\\u059F"); |
| 477 | data[0][2] = str(""); |
| 478 | data[1][0] = str("\\u0592\\u05B7\\u05BC\\u05A5\\u05B0\\u05C0\\u05C4\\u05AD"); |
| 479 | data[1][1] = str("\\u05B0\\u05B7\\u05BC\\u05A5\\u0592\\u05C0\\u05AD\\u05C4"); |
| 480 | data[1][2] = str(""); |
| 481 | |
| 482 | staticTest(UNORM_NFD, 0, data, ARRAY_LENGTH(data), 1); |
| 483 | staticTest(UNORM_NFC, 0, data, ARRAY_LENGTH(data), 1); |
| 484 | } |
| 485 | |
| 486 | //------------------------------------------------------------------------ |
| 487 | // Internal utilities |
| 488 | // |
| 489 | |
| 490 | UnicodeString BasicNormalizerTest::hex(UChar ch) { |
| 491 | UnicodeString result; |
| 492 | return appendHex(ch, 4, result); |
| 493 | } |
| 494 | |
| 495 | UnicodeString BasicNormalizerTest::hex(const UnicodeString& s) { |
| 496 | UnicodeString result; |
| 497 | for (int i = 0; i < s.length(); ++i) { |
| 498 | if (i != 0) result += (UChar)0x2c/*,*/; |
| 499 | appendHex(s[i], 4, result); |
| 500 | } |
| 501 | return result; |
| 502 | } |
| 503 | |
| 504 | |
| 505 | inline static void insert(UnicodeString& dest, int pos, UChar32 ch) |
| 506 | { |
| 507 | dest.replace(pos, 0, ch); |
| 508 | } |
| 509 | |
| 510 | void BasicNormalizerTest::backAndForth(Normalizer* iter, const UnicodeString& input) |
| 511 | { |
| 512 | UChar32 ch; |
| 513 | UErrorCode status = U_ZERO_ERROR; |
| 514 | iter->setText(input, status); |
| 515 | |
| 516 | // Run through the iterator forwards and stick it into a StringBuffer |
| 517 | UnicodeString forward; |
| 518 | for (ch = iter->first(); ch != iter->DONE; ch = iter->next()) { |
| 519 | forward += ch; |
| 520 | } |
| 521 | |
| 522 | // Now do it backwards |
| 523 | UnicodeString reverse; |
| 524 | for (ch = iter->last(); ch != iter->DONE; ch = iter->previous()) { |
| 525 | insert(reverse, 0, ch); |
| 526 | } |
| 527 | |
| 528 | if (forward != reverse) { |
| 529 | errln("Forward/reverse mismatch for input " + hex(input) |
| 530 | + ", forward: " + hex(forward) + ", backward: " + hex(reverse)); |
| 531 | } |
| 532 | } |
| 533 | |
| 534 | void BasicNormalizerTest::staticTest(UNormalizationMode mode, int options, |
| 535 | UnicodeString tests[][3], int length, |
| 536 | int outCol) |
| 537 | { |
| 538 | UErrorCode status = U_ZERO_ERROR; |
| 539 | for (int i = 0; i < length; i++) |
| 540 | { |
| 541 | UnicodeString& input = tests[i][0]; |
| 542 | UnicodeString& expect = tests[i][outCol]; |
| 543 | |
| 544 | logln("Normalizing '" + input + "' (" + hex(input) + ")" ); |
| 545 | |
| 546 | UnicodeString output; |
| 547 | Normalizer::normalize(input, mode, options, output, status); |
| 548 | |
| 549 | if (output != expect) { |
| 550 | dataerrln(UnicodeString("ERROR: case ") + i + " normalized " + hex(input) + "\n" |
| 551 | + " expected " + hex(expect) + "\n" |
| 552 | + " static got " + hex(output) ); |
| 553 | } |
| 554 | } |
| 555 | } |
| 556 | |
| 557 | void BasicNormalizerTest::iterateTest(Normalizer* iter, |
| 558 | UnicodeString tests[][3], int length, |
| 559 | int outCol) |
| 560 | { |
| 561 | UErrorCode status = U_ZERO_ERROR; |
| 562 | for (int i = 0; i < length; i++) |
| 563 | { |
| 564 | UnicodeString& input = tests[i][0]; |
| 565 | UnicodeString& expect = tests[i][outCol]; |
| 566 | |
| 567 | logln("Normalizing '" + input + "' (" + hex(input) + ")" ); |
| 568 | |
| 569 | iter->setText(input, status); |
| 570 | assertEqual(input, expect, iter, UnicodeString("ERROR: case ") + i + " "); |
| 571 | } |
| 572 | } |
| 573 | |
| 574 | void BasicNormalizerTest::assertEqual(const UnicodeString& input, |
| 575 | const UnicodeString& expected, |
| 576 | Normalizer* iter, |
| 577 | const UnicodeString& errPrefix) |
| 578 | { |
| 579 | UnicodeString result; |
| 580 | |
| 581 | for (UChar32 ch = iter->first(); ch != iter->DONE; ch = iter->next()) { |
| 582 | result += ch; |
| 583 | } |
| 584 | if (result != expected) { |
| 585 | dataerrln(errPrefix + "normalized " + hex(input) + "\n" |
| 586 | + " expected " + hex(expected) + "\n" |
| 587 | + " iterate got " + hex(result) ); |
| 588 | } |
| 589 | } |
| 590 | |
| 591 | // helper class for TestPreviousNext() |
| 592 | // simple UTF-32 character iterator |
| 593 | class UChar32Iterator { |
| 594 | public: |
| 595 | UChar32Iterator(const UChar32 *text, int32_t len, int32_t index) : |
| 596 | s(text), length(len), i(index) {} |
| 597 | |
| 598 | UChar32 current() { |
| 599 | if(i<length) { |
| 600 | return s[i]; |
| 601 | } else { |
| 602 | return 0xffff; |
| 603 | } |
| 604 | } |
| 605 | |
| 606 | UChar32 next() { |
| 607 | if(i<length) { |
| 608 | return s[i++]; |
| 609 | } else { |
| 610 | return 0xffff; |
| 611 | } |
| 612 | } |
| 613 | |
| 614 | UChar32 previous() { |
| 615 | if(i>0) { |
| 616 | return s[--i]; |
| 617 | } else { |
| 618 | return 0xffff; |
| 619 | } |
| 620 | } |
| 621 | |
| 622 | int32_t getIndex() { |
| 623 | return i; |
| 624 | } |
| 625 | private: |
| 626 | const UChar32 *s; |
| 627 | int32_t length, i; |
| 628 | }; |
| 629 | |
| 630 | void |
| 631 | BasicNormalizerTest::TestPreviousNext(const UChar *src, int32_t srcLength, |
| 632 | const UChar32 *expect, int32_t expectLength, |
| 633 | const int32_t *expectIndex, // its length=expectLength+1 |
| 634 | int32_t srcMiddle, int32_t expectMiddle, |
| 635 | const char *moves, |
| 636 | UNormalizationMode mode, |
| 637 | const char *name) { |
| 638 | // Sanity check non-iterative normalization. |
| 639 | { |
| 640 | IcuTestErrorCode errorCode(*this, "TestPreviousNext"); |
| 641 | UnicodeString result; |
| 642 | Normalizer::normalize(UnicodeString(src, srcLength), mode, 0, result, errorCode); |
| 643 | if (errorCode.isFailure()) { |
| 644 | dataerrln("error: non-iterative normalization of %s failed: %s", |
| 645 | name, errorCode.errorName()); |
| 646 | errorCode.reset(); |
| 647 | return; |
| 648 | } |
| 649 | // UnicodeString::fromUTF32(expect, expectLength) |
| 650 | // would turn unpaired surrogates into U+FFFD. |
| 651 | for (int32_t i = 0, j = 0; i < result.length(); ++j) { |
| 652 | UChar32 c = result.char32At(i); |
| 653 | if (c != expect[j]) { |
| 654 | errln("error: non-iterative normalization of %s did not yield the expected result", |
| 655 | name); |
| 656 | } |
| 657 | i += U16_LENGTH(c); |
| 658 | } |
| 659 | } |
| 660 | |
| 661 | // iterators |
| 662 | Normalizer iter(src, srcLength, mode); |
| 663 | |
| 664 | // test getStaticClassID and getDynamicClassID |
| 665 | if(iter.getDynamicClassID() != Normalizer::getStaticClassID()) { |
| 666 | errln("getStaticClassID != getDynamicClassID for Normalizer."); |
| 667 | } |
| 668 | |
| 669 | UChar32Iterator iter32(expect, expectLength, expectMiddle); |
| 670 | |
| 671 | UChar32 c1, c2; |
| 672 | char m; |
| 673 | |
| 674 | // initially set the indexes into the middle of the strings |
| 675 | iter.setIndexOnly(srcMiddle); |
| 676 | |
| 677 | // move around and compare the iteration code points with |
| 678 | // the expected ones |
| 679 | const char *move=moves; |
| 680 | while((m=*move++)!=0) { |
| 681 | if(m=='-') { |
| 682 | c1=iter.previous(); |
| 683 | c2=iter32.previous(); |
| 684 | } else if(m=='0') { |
| 685 | c1=iter.current(); |
| 686 | c2=iter32.current(); |
| 687 | } else /* m=='+' */ { |
| 688 | c1=iter.next(); |
| 689 | c2=iter32.next(); |
| 690 | } |
| 691 | |
| 692 | // compare results |
| 693 | if(c1!=c2) { |
| 694 | // copy the moves until the current (m) move, and terminate |
| 695 | char history[64]; |
| 696 | uprv_strcpy(history, moves); |
| 697 | history[move-moves]=0; |
| 698 | dataerrln("error: mismatch in Normalizer iteration (%s) at %s: " |
| 699 | "got c1=U+%04lx != expected c2=U+%04lx", |
| 700 | name, history, c1, c2); |
| 701 | break; |
| 702 | } |
| 703 | |
| 704 | // compare indexes |
| 705 | if(iter.getIndex()!=expectIndex[iter32.getIndex()]) { |
| 706 | // copy the moves until the current (m) move, and terminate |
| 707 | char history[64]; |
| 708 | uprv_strcpy(history, moves); |
| 709 | history[move-moves]=0; |
| 710 | errln("error: index mismatch in Normalizer iteration (%s) at %s: " |
| 711 | "Normalizer index %ld expected %ld\n", |
| 712 | name, history, iter.getIndex(), expectIndex[iter32.getIndex()]); |
| 713 | break; |
| 714 | } |
| 715 | } |
| 716 | } |
| 717 | |
| 718 | void |
| 719 | BasicNormalizerTest::TestPreviousNext() { |
| 720 | // src and expect strings |
| 721 | static const UChar src[]={ |
| 722 | U16_LEAD(0x2f999), U16_TRAIL(0x2f999), |
| 723 | U16_LEAD(0x1d15f), U16_TRAIL(0x1d15f), |
| 724 | 0xc4, |
| 725 | 0x1ed0 |
| 726 | }; |
| 727 | static const UChar32 expect[]={ |
| 728 | 0x831d, |
| 729 | 0x1d158, 0x1d165, |
| 730 | 0x41, 0x308, |
| 731 | 0x4f, 0x302, 0x301 |
| 732 | }; |
| 733 | |
| 734 | // expected src indexes corresponding to expect indexes |
| 735 | static const int32_t expectIndex[]={ |
| 736 | 0, |
| 737 | 2, 2, |
| 738 | 4, 4, |
| 739 | 5, 5, 5, |
| 740 | 6 // behind last character |
| 741 | }; |
| 742 | |
| 743 | // src and expect strings for regression test for j2911 |
| 744 | static const UChar src_j2911[]={ |
| 745 | U16_LEAD(0x2f999), U16_TRAIL(0x2f999), |
| 746 | 0xdd00, 0xd900, // unpaired surrogates - regression test for j2911 |
| 747 | 0xc4, |
| 748 | 0x4f, 0x302, 0x301 |
| 749 | }; |
| 750 | static const UChar32 expect_j2911[]={ |
| 751 | 0x831d, |
| 752 | 0xdd00, 0xd900, // unpaired surrogates - regression test for j2911 |
| 753 | 0xc4, |
| 754 | 0x1ed0 |
| 755 | }; |
| 756 | |
| 757 | // expected src indexes corresponding to expect indexes |
| 758 | static const int32_t expectIndex_j2911[]={ |
| 759 | 0, |
| 760 | 2, 3, |
| 761 | 4, |
| 762 | 5, |
| 763 | 8 // behind last character |
| 764 | }; |
| 765 | |
| 766 | // initial indexes into the src and expect strings |
| 767 | // for both sets of test data |
| 768 | enum { |
| 769 | SRC_MIDDLE=4, |
| 770 | EXPECT_MIDDLE=3, |
| 771 | SRC_MIDDLE_2=2, |
| 772 | EXPECT_MIDDLE_2=1 |
| 773 | }; |
| 774 | |
| 775 | // movement vector |
| 776 | // - for previous(), 0 for current(), + for next() |
| 777 | // for both sets of test data |
| 778 | static const char *const moves="0+0+0--0-0-+++0--+++++++0--------"; |
| 779 | |
| 780 | TestPreviousNext(src, UPRV_LENGTHOF(src), |
| 781 | expect, UPRV_LENGTHOF(expect), |
| 782 | expectIndex, |
| 783 | SRC_MIDDLE, EXPECT_MIDDLE, |
| 784 | moves, UNORM_NFD, "basic"); |
| 785 | |
| 786 | TestPreviousNext(src_j2911, UPRV_LENGTHOF(src_j2911), |
| 787 | expect_j2911, UPRV_LENGTHOF(expect_j2911), |
| 788 | expectIndex_j2911, |
| 789 | SRC_MIDDLE, EXPECT_MIDDLE, |
| 790 | moves, UNORM_NFKC, "j2911"); |
| 791 | |
| 792 | // try again from different "middle" indexes |
| 793 | TestPreviousNext(src, UPRV_LENGTHOF(src), |
| 794 | expect, UPRV_LENGTHOF(expect), |
| 795 | expectIndex, |
| 796 | SRC_MIDDLE_2, EXPECT_MIDDLE_2, |
| 797 | moves, UNORM_NFD, "basic_2"); |
| 798 | |
| 799 | TestPreviousNext(src_j2911, UPRV_LENGTHOF(src_j2911), |
| 800 | expect_j2911, UPRV_LENGTHOF(expect_j2911), |
| 801 | expectIndex_j2911, |
| 802 | SRC_MIDDLE_2, EXPECT_MIDDLE_2, |
| 803 | moves, UNORM_NFKC, "j2911_2"); |
| 804 | } |
| 805 | |
| 806 | void BasicNormalizerTest::TestConcatenate() { |
| 807 | static const char *const |
| 808 | cases[][4]={ |
| 809 | /* mode, left, right, result */ |
| 810 | { |
| 811 | "C", |
| 812 | "re", |
| 813 | "\\u0301sum\\u00e9", |
| 814 | "r\\u00e9sum\\u00e9" |
| 815 | }, |
| 816 | { |
| 817 | "C", |
| 818 | "a\\u1100", |
| 819 | "\\u1161bcdefghijk", |
| 820 | "a\\uac00bcdefghijk" |
| 821 | }, |
| 822 | /* ### TODO: add more interesting cases */ |
| 823 | { |
| 824 | "D", |
| 825 | "\\u03B1\\u0345", |
| 826 | "\\u0C4D\\U000110BA\\U0001D169", |
| 827 | "\\u03B1\\U0001D169\\U000110BA\\u0C4D\\u0345" |
| 828 | } |
| 829 | }; |
| 830 | |
| 831 | UnicodeString left, right, expect, result, r; |
| 832 | UErrorCode errorCode; |
| 833 | UNormalizationMode mode; |
| 834 | int32_t i; |
| 835 | |
| 836 | /* test concatenation */ |
| 837 | for(i=0; i<UPRV_LENGTHOF(cases); ++i) { |
| 838 | switch(*cases[i][0]) { |
| 839 | case 'C': mode=UNORM_NFC; break; |
| 840 | case 'D': mode=UNORM_NFD; break; |
| 841 | case 'c': mode=UNORM_NFKC; break; |
| 842 | case 'd': mode=UNORM_NFKD; break; |
| 843 | default: mode=UNORM_NONE; break; |
| 844 | } |
| 845 | |
| 846 | left=UnicodeString(cases[i][1], "").unescape(); |
| 847 | right=UnicodeString(cases[i][2], "").unescape(); |
| 848 | expect=UnicodeString(cases[i][3], "").unescape(); |
| 849 | |
| 850 | //result=r=UnicodeString(); |
| 851 | errorCode=U_ZERO_ERROR; |
| 852 | |
| 853 | r=Normalizer::concatenate(left, right, result, mode, 0, errorCode); |
| 854 | if(U_FAILURE(errorCode) || /*result!=r ||*/ result!=expect) { |
| 855 | dataerrln("error in Normalizer::concatenate(), cases[] fails with "+ |
| 856 | UnicodeString(u_errorName(errorCode))+", result==expect: expected: "+ |
| 857 | hex(expect)+" =========> got: " + hex(result)); |
| 858 | } |
| 859 | } |
| 860 | |
| 861 | /* test error cases */ |
| 862 | |
| 863 | /* left.getBuffer()==result.getBuffer() */ |
| 864 | result=r=expect=UnicodeString("zz", ""); |
| 865 | errorCode=U_UNEXPECTED_TOKEN; |
| 866 | r=Normalizer::concatenate(left, right, result, mode, 0, errorCode); |
| 867 | if(errorCode!=U_UNEXPECTED_TOKEN || result!=r || !result.isBogus()) { |
| 868 | errln("error in Normalizer::concatenate(), violates UErrorCode protocol"); |
| 869 | } |
| 870 | |
| 871 | left.setToBogus(); |
| 872 | errorCode=U_ZERO_ERROR; |
| 873 | r=Normalizer::concatenate(left, right, result, mode, 0, errorCode); |
| 874 | if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || result!=r || !result.isBogus()) { |
| 875 | errln("error in Normalizer::concatenate(), does not detect left.isBogus()"); |
| 876 | } |
| 877 | } |
| 878 | |
| 879 | // reference implementation of Normalizer::compare |
| 880 | static int32_t |
| 881 | ref_norm_compare(const UnicodeString &s1, const UnicodeString &s2, uint32_t options, UErrorCode &errorCode) { |
| 882 | UnicodeString r1, r2, t1, t2; |
| 883 | int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT); |
| 884 | |
| 885 | if(options&U_COMPARE_IGNORE_CASE) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 886 | Normalizer::decompose(s1, false, normOptions, r1, errorCode); |
| 887 | Normalizer::decompose(s2, false, normOptions, r2, errorCode); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 888 | |
| 889 | r1.foldCase(options); |
| 890 | r2.foldCase(options); |
| 891 | } else { |
| 892 | r1=s1; |
| 893 | r2=s2; |
| 894 | } |
| 895 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 896 | Normalizer::decompose(r1, false, normOptions, t1, errorCode); |
| 897 | Normalizer::decompose(r2, false, normOptions, t2, errorCode); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 898 | |
| 899 | if(options&U_COMPARE_CODE_POINT_ORDER) { |
| 900 | return t1.compareCodePointOrder(t2); |
| 901 | } else { |
| 902 | return t1.compare(t2); |
| 903 | } |
| 904 | } |
| 905 | |
| 906 | // test wrapper for Normalizer::compare, sets UNORM_INPUT_IS_FCD appropriately |
| 907 | static int32_t |
| 908 | _norm_compare(const UnicodeString &s1, const UnicodeString &s2, uint32_t options, UErrorCode &errorCode) { |
| 909 | int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT); |
| 910 | |
| 911 | if( UNORM_YES==Normalizer::quickCheck(s1, UNORM_FCD, normOptions, errorCode) && |
| 912 | UNORM_YES==Normalizer::quickCheck(s2, UNORM_FCD, normOptions, errorCode)) { |
| 913 | options|=UNORM_INPUT_IS_FCD; |
| 914 | } |
| 915 | |
| 916 | return Normalizer::compare(s1, s2, options, errorCode); |
| 917 | } |
| 918 | |
| 919 | // reference implementation of UnicodeString::caseCompare |
| 920 | static int32_t |
| 921 | ref_case_compare(const UnicodeString &s1, const UnicodeString &s2, uint32_t options) { |
| 922 | UnicodeString t1, t2; |
| 923 | |
| 924 | t1=s1; |
| 925 | t2=s2; |
| 926 | |
| 927 | t1.foldCase(options); |
| 928 | t2.foldCase(options); |
| 929 | |
| 930 | if(options&U_COMPARE_CODE_POINT_ORDER) { |
| 931 | return t1.compareCodePointOrder(t2); |
| 932 | } else { |
| 933 | return t1.compare(t2); |
| 934 | } |
| 935 | } |
| 936 | |
// Reduce an integer to -1/0/1.
// Returns 0 for zero, -1 for any negative value and 1 for any positive value.
// Written with plain comparisons: right-shifting a negative signed integer
// is implementation-defined before C++20, so the sign is not derived from
// a shift.
static inline int32_t
_sign(int32_t value) {
    if(value==0) {
        return 0;
    }
    return value<0 ? -1 : 1;
}
| 946 | |
// Human-readable form of a comparison result's sign for error messages:
// "<0" for negative values, "=0" for zero, ">0" for positive values.
static const char *
_signString(int32_t value) {
    if(value==0) {
        return "=0";
    }
    return value<0 ? "<0" : ">0";
}
| 957 | |
| 958 | void |
| 959 | BasicNormalizerTest::TestCompare() { |
| 960 | // test Normalizer::compare and unorm_compare (thinly wrapped by the former) |
| 961 | // by comparing it with its semantic equivalent |
| 962 | // since we trust the pieces, this is sufficient |
| 963 | |
| 964 | // test each string with itself and each other |
| 965 | // each time with all options |
| 966 | static const char *const |
| 967 | strings[]={ |
| 968 | // some cases from NormalizationTest.txt |
| 969 | // 0..3 |
| 970 | "D\\u031B\\u0307\\u0323", |
| 971 | "\\u1E0C\\u031B\\u0307", |
| 972 | "D\\u031B\\u0323\\u0307", |
| 973 | "d\\u031B\\u0323\\u0307", |
| 974 | |
| 975 | // 4..6 |
| 976 | "\\u00E4", |
| 977 | "a\\u0308", |
| 978 | "A\\u0308", |
| 979 | |
| 980 | // Angstrom sign = A ring |
| 981 | // 7..10 |
| 982 | "\\u212B", |
| 983 | "\\u00C5", |
| 984 | "A\\u030A", |
| 985 | "a\\u030A", |
| 986 | |
| 987 | // 11.14 |
| 988 | "a\\u059A\\u0316\\u302A\\u032Fb", |
| 989 | "a\\u302A\\u0316\\u032F\\u059Ab", |
| 990 | "a\\u302A\\u0316\\u032F\\u059Ab", |
| 991 | "A\\u059A\\u0316\\u302A\\u032Fb", |
| 992 | |
| 993 | // from ICU case folding tests |
| 994 | // 15..20 |
| 995 | "A\\u00df\\u00b5\\ufb03\\U0001040c\\u0131", |
| 996 | "ass\\u03bcffi\\U00010434i", |
| 997 | "\\u0061\\u0042\\u0131\\u03a3\\u00df\\ufb03\\ud93f\\udfff", |
| 998 | "\\u0041\\u0062\\u0069\\u03c3\\u0073\\u0053\\u0046\\u0066\\u0049\\ud93f\\udfff", |
| 999 | "\\u0041\\u0062\\u0131\\u03c3\\u0053\\u0073\\u0066\\u0046\\u0069\\ud93f\\udfff", |
| 1000 | "\\u0041\\u0062\\u0069\\u03c3\\u0073\\u0053\\u0046\\u0066\\u0049\\ud93f\\udffd", |
| 1001 | |
| 1002 | // U+d800 U+10001 see implementation comment in unorm_cmpEquivFold |
| 1003 | // vs. U+10000 at bottom - code point order |
| 1004 | // 21..22 |
| 1005 | "\\ud800\\ud800\\udc01", |
| 1006 | "\\ud800\\udc00", |
| 1007 | |
| 1008 | // other code point order tests from ustrtest.cpp |
| 1009 | // 23..31 |
| 1010 | "\\u20ac\\ud801", |
| 1011 | "\\u20ac\\ud800\\udc00", |
| 1012 | "\\ud800", |
| 1013 | "\\ud800\\uff61", |
| 1014 | "\\udfff", |
| 1015 | "\\uff61\\udfff", |
| 1016 | "\\uff61\\ud800\\udc02", |
| 1017 | "\\ud800\\udc02", |
| 1018 | "\\ud84d\\udc56", |
| 1019 | |
| 1020 | // long strings, see cnormtst.c/TestNormCoverage() |
| 1021 | // equivalent if case-insensitive |
| 1022 | // 32..33 |
| 1023 | "\\uAD8B\\uAD8B\\uAD8B\\uAD8B" |
| 1024 | "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e" |
| 1025 | "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e" |
| 1026 | "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e" |
| 1027 | "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e" |
| 1028 | "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e" |
| 1029 | "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz" |
| 1030 | "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" |
| 1031 | "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc" |
| 1032 | "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd" |
| 1033 | "\\uAD8B\\uAD8B\\uAD8B\\uAD8B" |
| 1034 | "d\\u031B\\u0307\\u0323", |
| 1035 | |
| 1036 | "\\u1100\\u116f\\u11aa\\uAD8B\\uAD8B\\u1100\\u116f\\u11aa" |
| 1037 | "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e" |
| 1038 | "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e" |
| 1039 | "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e" |
| 1040 | "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e" |
| 1041 | "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e" |
| 1042 | "aaaaaaaaaaAAAAAAAAZZZZZZZZZZZZZZZZzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz" |
| 1043 | "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" |
| 1044 | "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc" |
| 1045 | "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd" |
| 1046 | "\\u1100\\u116f\\u11aa\\uAD8B\\uAD8B\\u1100\\u116f\\u11aa" |
| 1047 | "\\u1E0C\\u031B\\u0307", |
| 1048 | |
| 1049 | // some strings that may make a difference whether the compare function |
| 1050 | // case-folds or decomposes first |
| 1051 | // 34..41 |
| 1052 | "\\u0360\\u0345\\u0334", |
| 1053 | "\\u0360\\u03b9\\u0334", |
| 1054 | |
| 1055 | "\\u0360\\u1f80\\u0334", |
| 1056 | "\\u0360\\u03b1\\u0313\\u03b9\\u0334", |
| 1057 | |
| 1058 | "\\u0360\\u1ffc\\u0334", |
| 1059 | "\\u0360\\u03c9\\u03b9\\u0334", |
| 1060 | |
| 1061 | "a\\u0360\\u0345\\u0360\\u0345b", |
| 1062 | "a\\u0345\\u0360\\u0345\\u0360b", |
| 1063 | |
| 1064 | // interesting cases for canonical caseless match with turkic i handling |
| 1065 | // 42..43 |
| 1066 | "\\u00cc", |
| 1067 | "\\u0069\\u0300", |
| 1068 | |
| 1069 | // strings with post-Unicode 3.2 normalization or normalization corrections |
| 1070 | // 44..45 |
| 1071 | "\\u00e4\\u193b\\U0002f868", |
| 1072 | "\\u0061\\u193b\\u0308\\u36fc", |
| 1073 | |
| 1074 | // empty string |
| 1075 | // 46 |
| 1076 | "" |
| 1077 | }; |
| 1078 | |
| 1079 | UnicodeString s[100]; // at least as many items as in strings[] ! |
| 1080 | |
| 1081 | // all combinations of options |
| 1082 | // UNORM_INPUT_IS_FCD is set automatically if both input strings fulfill FCD conditions |
| 1083 | // set UNORM_UNICODE_3_2 in one additional combination |
| 1084 | static const struct { |
| 1085 | uint32_t options; |
| 1086 | const char *name; |
| 1087 | } opt[]={ |
| 1088 | { 0, "default" }, |
| 1089 | { U_COMPARE_CODE_POINT_ORDER, "c.p. order" }, |
| 1090 | { U_COMPARE_IGNORE_CASE, "ignore case" }, |
| 1091 | { U_COMPARE_CODE_POINT_ORDER|U_COMPARE_IGNORE_CASE, "c.p. order & ignore case" }, |
| 1092 | { U_COMPARE_IGNORE_CASE|U_FOLD_CASE_EXCLUDE_SPECIAL_I, "ignore case & special i" }, |
| 1093 | { U_COMPARE_CODE_POINT_ORDER|U_COMPARE_IGNORE_CASE|U_FOLD_CASE_EXCLUDE_SPECIAL_I, "c.p. order & ignore case & special i" }, |
| 1094 | { UNORM_UNICODE_3_2<<UNORM_COMPARE_NORM_OPTIONS_SHIFT, "Unicode 3.2" } |
| 1095 | }; |
| 1096 | |
| 1097 | int32_t i, j, k, count=UPRV_LENGTHOF(strings); |
| 1098 | int32_t result, refResult; |
| 1099 | |
| 1100 | UErrorCode errorCode; |
| 1101 | |
| 1102 | // create the UnicodeStrings |
| 1103 | for(i=0; i<count; ++i) { |
| 1104 | s[i]=UnicodeString(strings[i], "").unescape(); |
| 1105 | } |
| 1106 | |
| 1107 | // test them each with each other |
| 1108 | for(i=0; i<count; ++i) { |
| 1109 | for(j=i; j<count; ++j) { |
| 1110 | for(k=0; k<UPRV_LENGTHOF(opt); ++k) { |
| 1111 | // test Normalizer::compare |
| 1112 | errorCode=U_ZERO_ERROR; |
| 1113 | result=_norm_compare(s[i], s[j], opt[k].options, errorCode); |
| 1114 | refResult=ref_norm_compare(s[i], s[j], opt[k].options, errorCode); |
| 1115 | if(_sign(result)!=_sign(refResult)) { |
| 1116 | errln("Normalizer::compare(%d, %d, %s)%s should be %s %s", |
| 1117 | i, j, opt[k].name, _signString(result), _signString(refResult), |
| 1118 | U_SUCCESS(errorCode) ? "" : u_errorName(errorCode)); |
| 1119 | } |
| 1120 | |
| 1121 | // test UnicodeString::caseCompare - same internal implementation function |
| 1122 | if(opt[k].options&U_COMPARE_IGNORE_CASE) { |
| 1123 | errorCode=U_ZERO_ERROR; |
| 1124 | result=s[i].caseCompare(s[j], opt[k].options); |
| 1125 | refResult=ref_case_compare(s[i], s[j], opt[k].options); |
| 1126 | if(_sign(result)!=_sign(refResult)) { |
| 1127 | errln("UniStr::caseCompare(%d, %d, %s)%s should be %s %s", |
| 1128 | i, j, opt[k].name, _signString(result), _signString(refResult), |
| 1129 | U_SUCCESS(errorCode) ? "" : u_errorName(errorCode)); |
| 1130 | } |
| 1131 | } |
| 1132 | } |
| 1133 | } |
| 1134 | } |
| 1135 | |
| 1136 | // test cases with i and I to make sure Turkic works |
| 1137 | static const UChar iI[]={ 0x49, 0x69, 0x130, 0x131 }; |
| 1138 | UnicodeSet iSet, set; |
| 1139 | |
| 1140 | UnicodeString s1, s2; |
| 1141 | |
| 1142 | const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode); |
| 1143 | if(U_FAILURE(errorCode) || !nfcImpl->ensureCanonIterData(errorCode)) { |
| 1144 | dataerrln("Normalizer2Factory::getNFCImpl().ensureCanonIterData() failed: %s", |
| 1145 | u_errorName(errorCode)); |
| 1146 | return; |
| 1147 | } |
| 1148 | |
| 1149 | // collect all sets into one for contiguous output |
| 1150 | for(i=0; i<UPRV_LENGTHOF(iI); ++i) { |
| 1151 | if(nfcImpl->getCanonStartSet(iI[i], iSet)) { |
| 1152 | set.addAll(iSet); |
| 1153 | } |
| 1154 | } |
| 1155 | |
| 1156 | // test all of these precomposed characters |
| 1157 | const Normalizer2 *nfcNorm2=Normalizer2::getNFCInstance(errorCode); |
| 1158 | UnicodeSetIterator it(set); |
| 1159 | while(it.next() && !it.isString()) { |
| 1160 | UChar32 c=it.getCodepoint(); |
| 1161 | if(!nfcNorm2->getDecomposition(c, s2)) { |
| 1162 | dataerrln("NFC.getDecomposition(i-composite U+%04lx) failed", (long)c); |
| 1163 | return; |
| 1164 | } |
| 1165 | |
| 1166 | s1.setTo(c); |
| 1167 | for(k=0; k<UPRV_LENGTHOF(opt); ++k) { |
| 1168 | // test Normalizer::compare |
| 1169 | errorCode=U_ZERO_ERROR; |
| 1170 | result=_norm_compare(s1, s2, opt[k].options, errorCode); |
| 1171 | refResult=ref_norm_compare(s1, s2, opt[k].options, errorCode); |
| 1172 | if(_sign(result)!=_sign(refResult)) { |
| 1173 | errln("Normalizer::compare(U+%04x with its NFD, %s)%s should be %s %s", |
| 1174 | c, opt[k].name, _signString(result), _signString(refResult), |
| 1175 | U_SUCCESS(errorCode) ? "" : u_errorName(errorCode)); |
| 1176 | } |
| 1177 | |
| 1178 | // test UnicodeString::caseCompare - same internal implementation function |
| 1179 | if(opt[k].options&U_COMPARE_IGNORE_CASE) { |
| 1180 | errorCode=U_ZERO_ERROR; |
| 1181 | result=s1.caseCompare(s2, opt[k].options); |
| 1182 | refResult=ref_case_compare(s1, s2, opt[k].options); |
| 1183 | if(_sign(result)!=_sign(refResult)) { |
| 1184 | errln("UniStr::caseCompare(U+%04x with its NFD, %s)%s should be %s %s", |
| 1185 | c, opt[k].name, _signString(result), _signString(refResult), |
| 1186 | U_SUCCESS(errorCode) ? "" : u_errorName(errorCode)); |
| 1187 | } |
| 1188 | } |
| 1189 | } |
| 1190 | } |
| 1191 | |
| 1192 | // test getDecomposition() for some characters that do not decompose |
| 1193 | if( nfcNorm2->getDecomposition(0x20, s2) || |
| 1194 | nfcNorm2->getDecomposition(0x4e00, s2) || |
| 1195 | nfcNorm2->getDecomposition(0x20002, s2) |
| 1196 | ) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1197 | errln("NFC.getDecomposition() returns true for characters which do not have decompositions"); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1198 | } |
| 1199 | |
| 1200 | // test getRawDecomposition() for some characters that do not decompose |
| 1201 | if( nfcNorm2->getRawDecomposition(0x20, s2) || |
| 1202 | nfcNorm2->getRawDecomposition(0x4e00, s2) || |
| 1203 | nfcNorm2->getRawDecomposition(0x20002, s2) |
| 1204 | ) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1205 | errln("NFC.getRawDecomposition() returns true for characters which do not have decompositions"); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1206 | } |
| 1207 | |
| 1208 | // test composePair() for some pairs of characters that do not compose |
| 1209 | if( nfcNorm2->composePair(0x20, 0x301)>=0 || |
| 1210 | nfcNorm2->composePair(0x61, 0x305)>=0 || |
| 1211 | nfcNorm2->composePair(0x1100, 0x1160)>=0 || |
| 1212 | nfcNorm2->composePair(0xac00, 0x11a7)>=0 |
| 1213 | ) { |
| 1214 | errln("NFC.composePair() incorrectly composes some pairs of characters"); |
| 1215 | } |
| 1216 | |
| 1217 | // test FilteredNormalizer2::getDecomposition() |
| 1218 | UnicodeSet filter(UNICODE_STRING_SIMPLE("[^\\u00a0-\\u00ff]"), errorCode); |
| 1219 | FilteredNormalizer2 fn2(*nfcNorm2, filter); |
| 1220 | if( fn2.getDecomposition(0xe4, s1) || !fn2.getDecomposition(0x100, s2) || |
| 1221 | s2.length()!=2 || s2[0]!=0x41 || s2[1]!=0x304 |
| 1222 | ) { |
| 1223 | errln("FilteredNormalizer2(NFC, ^A0-FF).getDecomposition() failed"); |
| 1224 | } |
| 1225 | |
| 1226 | // test FilteredNormalizer2::getRawDecomposition() |
| 1227 | if( fn2.getRawDecomposition(0xe4, s1) || !fn2.getRawDecomposition(0x100, s2) || |
| 1228 | s2.length()!=2 || s2[0]!=0x41 || s2[1]!=0x304 |
| 1229 | ) { |
| 1230 | errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed"); |
| 1231 | } |
| 1232 | |
| 1233 | // test FilteredNormalizer2::composePair() |
| 1234 | if( 0x100!=fn2.composePair(0x41, 0x304) || |
| 1235 | fn2.composePair(0xc7, 0x301)>=0 // unfiltered result: U+1E08 |
| 1236 | ) { |
| 1237 | errln("FilteredNormalizer2(NFC, ^A0-FF).composePair() failed"); |
| 1238 | } |
| 1239 | } |
| 1240 | |
| 1241 | // verify that case-folding does not un-FCD strings |
| 1242 | int32_t |
| 1243 | BasicNormalizerTest::countFoldFCDExceptions(uint32_t foldingOptions) { |
| 1244 | UnicodeString s, fold, d; |
| 1245 | UChar32 c; |
| 1246 | int32_t count; |
| 1247 | uint8_t cc, trailCC, foldCC, foldTrailCC; |
| 1248 | UNormalizationCheckResult qcResult; |
| 1249 | int8_t category; |
| 1250 | UBool isNFD; |
| 1251 | UErrorCode errorCode; |
| 1252 | |
| 1253 | logln("Test if case folding may un-FCD a string (folding options %04lx)", foldingOptions); |
| 1254 | |
| 1255 | count=0; |
| 1256 | for(c=0; c<=0x10ffff; ++c) { |
| 1257 | errorCode = U_ZERO_ERROR; |
| 1258 | category=u_charType(c); |
| 1259 | if(category==U_UNASSIGNED) { |
| 1260 | continue; // skip unassigned code points |
| 1261 | } |
| 1262 | if(c==0xac00) { |
| 1263 | c=0xd7a3; // skip Hangul - no case folding there |
| 1264 | continue; |
| 1265 | } |
| 1266 | // skip Han blocks - no case folding there either |
| 1267 | if(c==0x3400) { |
| 1268 | c=0x4db5; |
| 1269 | continue; |
| 1270 | } |
| 1271 | if(c==0x4e00) { |
| 1272 | c=0x9fa5; |
| 1273 | continue; |
| 1274 | } |
| 1275 | if(c==0x20000) { |
| 1276 | c=0x2a6d6; |
| 1277 | continue; |
| 1278 | } |
| 1279 | |
| 1280 | s.setTo(c); |
| 1281 | |
| 1282 | // get leading and trailing cc for c |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1283 | Normalizer::decompose(s, false, 0, d, errorCode); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1284 | isNFD= s==d; |
| 1285 | cc=u_getCombiningClass(d.char32At(0)); |
| 1286 | trailCC=u_getCombiningClass(d.char32At(d.length()-1)); |
| 1287 | |
| 1288 | // get leading and trailing cc for the case-folding of c |
| 1289 | s.foldCase(foldingOptions); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1290 | Normalizer::decompose(s, false, 0, d, errorCode); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1291 | foldCC=u_getCombiningClass(d.char32At(0)); |
| 1292 | foldTrailCC=u_getCombiningClass(d.char32At(d.length()-1)); |
| 1293 | |
| 1294 | qcResult=Normalizer::quickCheck(s, UNORM_FCD, errorCode); |
| 1295 | |
| 1296 | if (U_FAILURE(errorCode)) { |
| 1297 | ++count; |
| 1298 | dataerrln("U+%04lx: Failed with error %s", u_errorName(errorCode)); |
| 1299 | } |
| 1300 | |
| 1301 | // bad: |
| 1302 | // - character maps to empty string: adjacent characters may then need reordering |
| 1303 | // - folding has different leading/trailing cc's, and they don't become just 0 |
| 1304 | // - folding itself is not FCD |
| 1305 | if( qcResult!=UNORM_YES || |
| 1306 | s.isEmpty() || |
| 1307 | (cc!=foldCC && foldCC!=0) || (trailCC!=foldTrailCC && foldTrailCC!=0) |
| 1308 | ) { |
| 1309 | ++count; |
| 1310 | dataerrln("U+%04lx: case-folding may un-FCD a string (folding options %04lx)", c, foldingOptions); |
| 1311 | dataerrln(" cc %02x trailCC %02x foldCC(U+%04lx) %02x foldTrailCC(U+%04lx) %02x quickCheck(folded)=%d", cc, trailCC, d.char32At(0), foldCC, d.char32At(d.length()-1), foldTrailCC, qcResult); |
| 1312 | continue; |
| 1313 | } |
| 1314 | |
| 1315 | // also bad: |
| 1316 | // if a code point is in NFD but its case folding is not, then |
| 1317 | // unorm_compare will also fail |
| 1318 | if(isNFD && UNORM_YES!=Normalizer::quickCheck(s, UNORM_NFD, errorCode)) { |
| 1319 | ++count; |
| 1320 | errln("U+%04lx: case-folding un-NFDs this character (folding options %04lx)", c, foldingOptions); |
| 1321 | } |
| 1322 | } |
| 1323 | |
| 1324 | logln("There are %ld code points for which case-folding may un-FCD a string (folding options %04lx)", count, foldingOptions); |
| 1325 | return count; |
| 1326 | } |
| 1327 | |
| 1328 | void |
| 1329 | BasicNormalizerTest::FindFoldFCDExceptions() { |
| 1330 | int32_t count; |
| 1331 | |
| 1332 | count=countFoldFCDExceptions(0); |
| 1333 | count+=countFoldFCDExceptions(U_FOLD_CASE_EXCLUDE_SPECIAL_I); |
| 1334 | if(count>0) { |
| 1335 | /* |
| 1336 | * If case-folding un-FCDs any strings, then unorm_compare() must be |
| 1337 | * re-implemented. |
| 1338 | * It currently assumes that one can check for FCD then case-fold |
| 1339 | * and then still have FCD strings for raw decomposition without reordering. |
| 1340 | */ |
| 1341 | dataerrln("error: There are %ld code points for which case-folding may un-FCD a string for all folding options.\n" |
| 1342 | "See comment in BasicNormalizerTest::FindFoldFCDExceptions()!", count); |
| 1343 | } |
| 1344 | } |
| 1345 | |
| 1346 | static void |
| 1347 | initExpectedSkippables(UnicodeSet skipSets[UNORM_MODE_COUNT], UErrorCode &errorCode) { |
| 1348 | skipSets[UNORM_NFD].applyPattern( |
| 1349 | UNICODE_STRING_SIMPLE("[[:NFD_QC=Yes:]&[:ccc=0:]]"), errorCode); |
| 1350 | skipSets[UNORM_NFC].applyPattern( |
| 1351 | UNICODE_STRING_SIMPLE("[[:NFC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]"), errorCode); |
| 1352 | skipSets[UNORM_NFKD].applyPattern( |
| 1353 | UNICODE_STRING_SIMPLE("[[:NFKD_QC=Yes:]&[:ccc=0:]]"), errorCode); |
| 1354 | skipSets[UNORM_NFKC].applyPattern( |
| 1355 | UNICODE_STRING_SIMPLE("[[:NFKC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]"), errorCode); |
| 1356 | |
| 1357 | // Remove from the NFC and NFKC sets all those characters that change |
| 1358 | // when a back-combining character is added. |
| 1359 | // First, get all of the back-combining characters and their combining classes. |
| 1360 | UnicodeSet combineBack("[:NFC_QC=Maybe:]", errorCode); |
| 1361 | int32_t numCombineBack=combineBack.size(); |
| 1362 | int32_t *combineBackCharsAndCc=new int32_t[numCombineBack*2]; |
| 1363 | UnicodeSetIterator iter(combineBack); |
| 1364 | for(int32_t i=0; i<numCombineBack; ++i) { |
| 1365 | iter.next(); |
| 1366 | UChar32 c=iter.getCodepoint(); |
| 1367 | combineBackCharsAndCc[2*i]=c; |
| 1368 | combineBackCharsAndCc[2*i+1]=u_getCombiningClass(c); |
| 1369 | } |
| 1370 | |
| 1371 | // We need not look at control codes, Han characters nor Hangul LVT syllables because they |
| 1372 | // do not combine forward. LV syllables are already removed. |
| 1373 | UnicodeSet notInteresting("[[:C:][:Unified_Ideograph:][:HST=LVT:]]", errorCode); |
| 1374 | LocalPointer<UnicodeSet> unsure(&(skipSets[UNORM_NFC].clone())->removeAll(notInteresting)); |
| 1375 | // System.out.format("unsure.size()=%d\n", unsure.size()); |
| 1376 | |
| 1377 | // For each character about which we are unsure, see if it changes when we add |
| 1378 | // one of the back-combining characters. |
| 1379 | const Normalizer2 *norm2=Normalizer2::getNFCInstance(errorCode); |
| 1380 | UnicodeString s; |
| 1381 | iter.reset(*unsure); |
| 1382 | while(iter.next()) { |
| 1383 | UChar32 c=iter.getCodepoint(); |
| 1384 | s.setTo(c); |
| 1385 | int32_t cLength=s.length(); |
| 1386 | int32_t tccc=u_getIntPropertyValue(c, UCHAR_TRAIL_CANONICAL_COMBINING_CLASS); |
| 1387 | for(int32_t i=0; i<numCombineBack; ++i) { |
| 1388 | // If c's decomposition ends with a character with non-zero combining class, then |
| 1389 | // c can only change if it combines with a character with a non-zero combining class. |
| 1390 | int32_t cc2=combineBackCharsAndCc[2*i+1]; |
| 1391 | if(tccc==0 || cc2!=0) { |
| 1392 | UChar32 c2=combineBackCharsAndCc[2*i]; |
| 1393 | s.append(c2); |
| 1394 | if(!norm2->isNormalized(s, errorCode)) { |
| 1395 | // System.out.format("remove U+%04x (tccc=%d) + U+%04x (cc=%d)\n", c, tccc, c2, cc2); |
| 1396 | skipSets[UNORM_NFC].remove(c); |
| 1397 | skipSets[UNORM_NFKC].remove(c); |
| 1398 | break; |
| 1399 | } |
| 1400 | s.truncate(cLength); |
| 1401 | } |
| 1402 | } |
| 1403 | } |
| 1404 | delete [] combineBackCharsAndCc; |
| 1405 | } |
| 1406 | |
| 1407 | static const char *const kModeStrings[UNORM_MODE_COUNT] = { |
| 1408 | "?", "none", "D", "KD", "C", "KC", "FCD" |
| 1409 | }; |
| 1410 | |
| 1411 | void |
| 1412 | BasicNormalizerTest::TestSkippable() { |
| 1413 | UnicodeSet diff, skipSets[UNORM_MODE_COUNT], expectSets[UNORM_MODE_COUNT]; |
| 1414 | UnicodeString s, pattern; |
| 1415 | |
| 1416 | /* build NF*Skippable sets from runtime data */ |
| 1417 | IcuTestErrorCode errorCode(*this, "TestSkippable"); |
| 1418 | skipSets[UNORM_NFD].applyPattern(UNICODE_STRING_SIMPLE("[:NFD_Inert:]"), errorCode); |
| 1419 | skipSets[UNORM_NFKD].applyPattern(UNICODE_STRING_SIMPLE("[:NFKD_Inert:]"), errorCode); |
| 1420 | skipSets[UNORM_NFC].applyPattern(UNICODE_STRING_SIMPLE("[:NFC_Inert:]"), errorCode); |
| 1421 | skipSets[UNORM_NFKC].applyPattern(UNICODE_STRING_SIMPLE("[:NFKC_Inert:]"), errorCode); |
| 1422 | if(errorCode.errDataIfFailureAndReset("UnicodeSet(NF..._Inert) failed")) { |
| 1423 | return; |
| 1424 | } |
| 1425 | |
| 1426 | /* get expected sets from hardcoded patterns */ |
| 1427 | initExpectedSkippables(expectSets, errorCode); |
| 1428 | errorCode.assertSuccess(); |
| 1429 | |
| 1430 | for(int32_t i=UNORM_NONE; i<UNORM_MODE_COUNT; ++i) { |
| 1431 | if(skipSets[i]!=expectSets[i]) { |
| 1432 | const char *ms=kModeStrings[i]; |
| 1433 | errln("error: TestSkippable skipSets[%s]!=expectedSets[%s]\n", ms, ms); |
| 1434 | // Note: This used to depend on hardcoded UnicodeSet patterns generated by |
| 1435 | // Mark's unicodetools.com.ibm.text.UCD.NFSkippable, by |
| 1436 | // running com.ibm.text.UCD.Main with the option NFSkippable. |
| 1437 | // Since ICU 4.6/Unicode 6, we are generating the |
| 1438 | // expectSets ourselves in initSkippables(). |
| 1439 | |
| 1440 | s=UNICODE_STRING_SIMPLE("skip-expect="); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1441 | (diff=skipSets[i]).removeAll(expectSets[i]).toPattern(pattern, true); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1442 | s.append(pattern); |
| 1443 | |
| 1444 | pattern.remove(); |
| 1445 | s.append(UNICODE_STRING_SIMPLE("\n\nexpect-skip=")); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1446 | (diff=expectSets[i]).removeAll(skipSets[i]).toPattern(pattern, true); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1447 | s.append(pattern); |
| 1448 | s.append(UNICODE_STRING_SIMPLE("\n\n")); |
| 1449 | |
| 1450 | errln(s); |
| 1451 | } |
| 1452 | } |
| 1453 | } |
| 1454 | |
// One test row: an escaped input string and the escaped result expected for it.
struct StringPair {
    const char *input;
    const char *expected;
};
| 1456 | |
| 1457 | void |
| 1458 | BasicNormalizerTest::TestCustomComp() { |
| 1459 | static const StringPair pairs[]={ |
| 1460 | // ICU 63 normalization with UCPTrie requires inert surrogate code points. |
| 1461 | // { "\\uD801\\uE000\\uDFFE", "" }, |
| 1462 | // { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" }, |
| 1463 | // { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" }, |
| 1464 | { "\\uD801\\uE000\\uDFFE", "\\uD801\\uDFFE" }, |
| 1465 | { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD800\\uD801\\uDFFE\\uDFFF" }, |
| 1466 | { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD800\\U000107FE\\uDFFF" }, |
| 1467 | |
| 1468 | { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE002\\U000110B9\\u0327\\u0345" }, |
| 1469 | { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" }, |
| 1470 | { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" }, |
| 1471 | { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" }, |
| 1472 | { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" } |
| 1473 | }; |
| 1474 | IcuTestErrorCode errorCode(*this, "BasicNormalizerTest/TestCustomComp"); |
| 1475 | const Normalizer2 *customNorm2= |
| 1476 | Normalizer2::getInstance(loadTestData(errorCode), "testnorm", |
| 1477 | UNORM2_COMPOSE, errorCode); |
| 1478 | if(errorCode.errDataIfFailureAndReset("unable to load testdata/testnorm.nrm")) { |
| 1479 | return; |
| 1480 | } |
| 1481 | for(int32_t i=0; i<UPRV_LENGTHOF(pairs); ++i) { |
| 1482 | const StringPair &pair=pairs[i]; |
| 1483 | UnicodeString input=UnicodeString(pair.input, -1, US_INV).unescape(); |
| 1484 | UnicodeString expected=UnicodeString(pair.expected, -1, US_INV).unescape(); |
| 1485 | UnicodeString result=customNorm2->normalize(input, errorCode); |
| 1486 | if(result!=expected) { |
| 1487 | errln("custom compose Normalizer2 did not normalize input %d as expected", i); |
| 1488 | } |
| 1489 | } |
| 1490 | } |
| 1491 | |
| 1492 | void |
| 1493 | BasicNormalizerTest::TestCustomFCC() { |
| 1494 | static const StringPair pairs[]={ |
| 1495 | // ICU 63 normalization with UCPTrie requires inert surrogate code points. |
| 1496 | // { "\\uD801\\uE000\\uDFFE", "" }, |
| 1497 | // { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" }, |
| 1498 | // { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" }, |
| 1499 | { "\\uD801\\uE000\\uDFFE", "\\uD801\\uDFFE" }, |
| 1500 | { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD800\\uD801\\uDFFE\\uDFFF" }, |
| 1501 | { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD800\\U000107FE\\uDFFF" }, |
| 1502 | |
| 1503 | // The following expected result is different from CustomComp |
| 1504 | // because of only-contiguous composition. |
| 1505 | { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE001\\U000110B9\\u0327\\u0308\\u0345" }, |
| 1506 | { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" }, |
| 1507 | { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" }, |
| 1508 | { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" }, |
| 1509 | { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" } |
| 1510 | }; |
| 1511 | IcuTestErrorCode errorCode(*this, "BasicNormalizerTest/TestCustomFCC"); |
| 1512 | const Normalizer2 *customNorm2= |
| 1513 | Normalizer2::getInstance(loadTestData(errorCode), "testnorm", |
| 1514 | UNORM2_COMPOSE_CONTIGUOUS, errorCode); |
| 1515 | if(errorCode.errDataIfFailureAndReset("unable to load testdata/testnorm.nrm")) { |
| 1516 | return; |
| 1517 | } |
| 1518 | for(int32_t i=0; i<UPRV_LENGTHOF(pairs); ++i) { |
| 1519 | const StringPair &pair=pairs[i]; |
| 1520 | UnicodeString input=UnicodeString(pair.input, -1, US_INV).unescape(); |
| 1521 | UnicodeString expected=UnicodeString(pair.expected, -1, US_INV).unescape(); |
| 1522 | UnicodeString result=customNorm2->normalize(input, errorCode); |
| 1523 | if(result!=expected) { |
| 1524 | errln("custom FCC Normalizer2 did not normalize input %d as expected", i); |
| 1525 | } |
| 1526 | } |
| 1527 | } |
| 1528 | |
| 1529 | /* Improve code coverage of Normalizer2 */ |
| 1530 | void |
| 1531 | BasicNormalizerTest::TestFilteredNormalizer2Coverage() { |
| 1532 | UErrorCode errorCode = U_ZERO_ERROR; |
| 1533 | const Normalizer2 *nfcNorm2=Normalizer2::getNFCInstance(errorCode); |
| 1534 | if (U_FAILURE(errorCode)) { |
| 1535 | dataerrln("Normalizer2::getNFCInstance() call failed - %s", u_errorName(errorCode)); |
| 1536 | return; |
| 1537 | } |
| 1538 | UnicodeSet filter(UNICODE_STRING_SIMPLE("[^\\u00a0-\\u00ff\\u0310-\\u031f]"), errorCode); |
| 1539 | FilteredNormalizer2 fn2(*nfcNorm2, filter); |
| 1540 | |
| 1541 | UChar32 char32 = 0x0054; |
| 1542 | |
| 1543 | if (fn2.isInert(char32)) { |
| 1544 | errln("FilteredNormalizer2.isInert() failed."); |
| 1545 | } |
| 1546 | |
| 1547 | if (fn2.hasBoundaryAfter(char32)) { |
| 1548 | errln("FilteredNormalizer2.hasBoundaryAfter() failed."); |
| 1549 | } |
| 1550 | |
| 1551 | UChar32 c; |
| 1552 | for(c=0; c<=0x3ff; ++c) { |
| 1553 | uint8_t expectedCC= filter.contains(c) ? nfcNorm2->getCombiningClass(c) : 0; |
| 1554 | uint8_t cc=fn2.getCombiningClass(c); |
| 1555 | if(cc!=expectedCC) { |
| 1556 | errln( |
| 1557 | UnicodeString("FilteredNormalizer2(NFC, ^A0-FF,310-31F).getCombiningClass(U+")+ |
| 1558 | hex(c)+ |
| 1559 | ")==filtered NFC.getCC()"); |
| 1560 | } |
| 1561 | } |
| 1562 | |
| 1563 | UnicodeString newString1 = UNICODE_STRING_SIMPLE("[^\\u0100-\\u01ff]"); |
| 1564 | UnicodeString newString2 = UNICODE_STRING_SIMPLE("[^\\u0200-\\u02ff]"); |
| 1565 | fn2.append(newString1, newString2, errorCode); |
| 1566 | if (U_FAILURE(errorCode)) { |
| 1567 | errln("FilteredNormalizer2.append() failed."); |
| 1568 | } |
| 1569 | } |
| 1570 | |
| 1571 | void |
| 1572 | BasicNormalizerTest::TestComposeUTF8WithEdits() { |
| 1573 | IcuTestErrorCode errorCode(*this, "TestComposeUTF8WithEdits"); |
| 1574 | const Normalizer2 *nfkc_cf=Normalizer2::getNFKCCasefoldInstance(errorCode); |
| 1575 | if(errorCode.errDataIfFailureAndReset("Normalizer2::getNFKCCasefoldInstance() call failed")) { |
| 1576 | return; |
| 1577 | } |
| 1578 | static const StringPiece src = |
| 1579 | u8" AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161가\u11A8가\u3133 "; |
| 1580 | StringPiece expected = u8" aääạ\u0308ạ\u0308,가각갃 "; |
| 1581 | std::string result; |
| 1582 | StringByteSink<std::string> sink(&result, static_cast<int32_t>(expected.length())); |
| 1583 | Edits edits; |
| 1584 | nfkc_cf->normalizeUTF8(0, src, sink, &edits, errorCode); |
| 1585 | assertSuccess("normalizeUTF8 with Edits", errorCode.get()); |
| 1586 | assertEquals("normalizeUTF8 with Edits", expected.data(), result.c_str()); |
| 1587 | static const EditChange expectedChanges[] = { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1588 | { false, 2, 2 }, // 2 spaces |
| 1589 | { true, 1, 1 }, // A→a |
| 1590 | { true, 2, 2 }, // Ä→ä |
| 1591 | { true, 3, 2 }, // A\u0308→ä |
| 1592 | { true, 7, 5 }, // A\u0308\u00ad\u0323→ạ\u0308 removes the soft hyphen |
| 1593 | { true, 4, 5 }, // Ä\u0323→ạ\u0308 |
| 1594 | { false, 1, 1 }, // comma |
| 1595 | { true, 2, 0 }, // U+00AD soft hyphen maps to empty |
| 1596 | { true, 6, 3 }, // \u1100\u1161→가 |
| 1597 | { true, 6, 3 }, // 가\u11A8→각 |
| 1598 | { true, 6, 3 }, // 가\u3133→갃 |
| 1599 | { false, 2, 2 } // 2 spaces |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1600 | }; |
| 1601 | assertTrue("normalizeUTF8 with Edits hasChanges", edits.hasChanges()); |
| 1602 | assertEquals("normalizeUTF8 with Edits numberOfChanges", 9, edits.numberOfChanges()); |
| 1603 | TestUtility::checkEditsIter(*this, u"normalizeUTF8 with Edits", |
| 1604 | edits.getFineIterator(), edits.getFineIterator(), |
| 1605 | expectedChanges, UPRV_LENGTHOF(expectedChanges), |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1606 | true, errorCode); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1607 | |
| 1608 | assertFalse("isNormalizedUTF8(source)", nfkc_cf->isNormalizedUTF8(src, errorCode)); |
| 1609 | assertTrue("isNormalizedUTF8(normalized)", nfkc_cf->isNormalizedUTF8(result, errorCode)); |
| 1610 | |
| 1611 | // Omit unchanged text. |
| 1612 | expected = u8"aääạ\u0308ạ\u0308가각갃"; |
| 1613 | result.clear(); |
| 1614 | edits.reset(); |
| 1615 | nfkc_cf->normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode); |
| 1616 | assertSuccess("normalizeUTF8 omit unchanged", errorCode.get()); |
| 1617 | assertEquals("normalizeUTF8 omit unchanged", expected.data(), result.c_str()); |
| 1618 | assertTrue("normalizeUTF8 omit unchanged hasChanges", edits.hasChanges()); |
| 1619 | assertEquals("normalizeUTF8 omit unchanged numberOfChanges", 9, edits.numberOfChanges()); |
| 1620 | TestUtility::checkEditsIter(*this, u"normalizeUTF8 omit unchanged", |
| 1621 | edits.getFineIterator(), edits.getFineIterator(), |
| 1622 | expectedChanges, UPRV_LENGTHOF(expectedChanges), |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1623 | true, errorCode); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1624 | |
| 1625 | // With filter: The normalization code does not see the "A" substrings. |
| 1626 | UnicodeSet filter(u"[^A]", errorCode); |
| 1627 | FilteredNormalizer2 fn2(*nfkc_cf, filter); |
| 1628 | expected = u8" AäA\u0308A\u0323\u0308ạ\u0308,가각갃 "; |
| 1629 | result.clear(); |
| 1630 | edits.reset(); |
| 1631 | fn2.normalizeUTF8(0, src, sink, &edits, errorCode); |
| 1632 | assertSuccess("filtered normalizeUTF8", errorCode.get()); |
| 1633 | assertEquals("filtered normalizeUTF8", expected.data(), result.c_str()); |
| 1634 | static const EditChange filteredChanges[] = { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1635 | { false, 3, 3 }, // 2 spaces + A |
| 1636 | { true, 2, 2 }, // Ä→ä |
| 1637 | { false, 4, 4 }, // A\u0308A |
| 1638 | { true, 6, 4 }, // \u0308\u00ad\u0323→\u0323\u0308 removes the soft hyphen |
| 1639 | { true, 4, 5 }, // Ä\u0323→ạ\u0308 |
| 1640 | { false, 1, 1 }, // comma |
| 1641 | { true, 2, 0 }, // U+00AD soft hyphen maps to empty |
| 1642 | { true, 6, 3 }, // \u1100\u1161→가 |
| 1643 | { true, 6, 3 }, // 가\u11A8→각 |
| 1644 | { true, 6, 3 }, // 가\u3133→갃 |
| 1645 | { false, 2, 2 } // 2 spaces |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1646 | }; |
| 1647 | assertTrue("filtered normalizeUTF8 hasChanges", edits.hasChanges()); |
| 1648 | assertEquals("filtered normalizeUTF8 numberOfChanges", 7, edits.numberOfChanges()); |
| 1649 | TestUtility::checkEditsIter(*this, u"filtered normalizeUTF8", |
| 1650 | edits.getFineIterator(), edits.getFineIterator(), |
| 1651 | filteredChanges, UPRV_LENGTHOF(filteredChanges), |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1652 | true, errorCode); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1653 | |
| 1654 | assertFalse("filtered isNormalizedUTF8(source)", fn2.isNormalizedUTF8(src, errorCode)); |
| 1655 | assertTrue("filtered isNormalizedUTF8(normalized)", fn2.isNormalizedUTF8(result, errorCode)); |
| 1656 | |
| 1657 | // Omit unchanged text. |
| 1658 | // Note that the result is not normalized because the inner normalizer |
| 1659 | // does not see text across filter spans. |
| 1660 | expected = u8"ä\u0323\u0308ạ\u0308가각갃"; |
| 1661 | result.clear(); |
| 1662 | edits.reset(); |
| 1663 | fn2.normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode); |
| 1664 | assertSuccess("filtered normalizeUTF8 omit unchanged", errorCode.get()); |
| 1665 | assertEquals("filtered normalizeUTF8 omit unchanged", expected.data(), result.c_str()); |
| 1666 | assertTrue("filtered normalizeUTF8 omit unchanged hasChanges", edits.hasChanges()); |
| 1667 | assertEquals("filtered normalizeUTF8 omit unchanged numberOfChanges", 7, edits.numberOfChanges()); |
| 1668 | TestUtility::checkEditsIter(*this, u"filtered normalizeUTF8 omit unchanged", |
| 1669 | edits.getFineIterator(), edits.getFineIterator(), |
| 1670 | filteredChanges, UPRV_LENGTHOF(filteredChanges), |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1671 | true, errorCode); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1672 | } |
| 1673 | |
| 1674 | void |
| 1675 | BasicNormalizerTest::TestDecomposeUTF8WithEdits() { |
| 1676 | IcuTestErrorCode errorCode(*this, "TestDecomposeUTF8WithEdits"); |
| 1677 | const Normalizer2 *nfkd_cf = |
| 1678 | Normalizer2::getInstance(nullptr, "nfkc_cf", UNORM2_DECOMPOSE, errorCode); |
| 1679 | if(errorCode.errDataIfFailureAndReset("Normalizer2::getInstance(nfkc_cf/decompose) call failed")) { |
| 1680 | return; |
| 1681 | } |
| 1682 | static const StringPiece src = |
| 1683 | u8" AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161가\u11A8가\u3133 "; |
| 1684 | StringPiece expected = |
| 1685 | u8" aa\u0308a\u0308a\u0323\u0308a\u0323\u0308," |
| 1686 | u8"\u1100\u1161\u1100\u1161\u11A8\u1100\u1161\u11AA "; |
| 1687 | std::string result; |
| 1688 | StringByteSink<std::string> sink(&result, static_cast<int32_t>(expected.length())); |
| 1689 | Edits edits; |
| 1690 | nfkd_cf->normalizeUTF8(0, src, sink, &edits, errorCode); |
| 1691 | assertSuccess("normalizeUTF8 with Edits", errorCode.get()); |
| 1692 | assertEquals("normalizeUTF8 with Edits", expected.data(), result.c_str()); |
| 1693 | static const EditChange expectedChanges[] = { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1694 | { false, 2, 2 }, // 2 spaces |
| 1695 | { true, 1, 1 }, // A→a |
| 1696 | { true, 2, 3 }, // Ä→a\u0308 |
| 1697 | { true, 1, 1 }, // A→a |
| 1698 | { false, 2, 2 }, // \u0308→\u0308 unchanged |
| 1699 | { true, 1, 1 }, // A→a |
| 1700 | { true, 6, 4 }, // \u0308\u00ad\u0323→\u0323\u0308 removes the soft hyphen |
| 1701 | { true, 4, 5 }, // Ä\u0323→a\u0323\u0308 |
| 1702 | { false, 1, 1 }, // comma |
| 1703 | { true, 2, 0 }, // U+00AD soft hyphen maps to empty |
| 1704 | { false, 6, 6 }, // \u1100\u1161 unchanged |
| 1705 | { true, 3, 6 }, // 가→\u1100\u1161 |
| 1706 | { false, 3, 3 }, // \u11A8 unchanged |
| 1707 | { true, 3, 6 }, // 가→\u1100\u1161 |
| 1708 | { true, 3, 3 }, // \u3133→\u11AA |
| 1709 | { false, 2, 2 } // 2 spaces |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1710 | }; |
| 1711 | assertTrue("normalizeUTF8 with Edits hasChanges", edits.hasChanges()); |
| 1712 | assertEquals("normalizeUTF8 with Edits numberOfChanges", 10, edits.numberOfChanges()); |
| 1713 | TestUtility::checkEditsIter(*this, u"normalizeUTF8 with Edits", |
| 1714 | edits.getFineIterator(), edits.getFineIterator(), |
| 1715 | expectedChanges, UPRV_LENGTHOF(expectedChanges), |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1716 | true, errorCode); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1717 | |
| 1718 | assertFalse("isNormalizedUTF8(source)", nfkd_cf->isNormalizedUTF8(src, errorCode)); |
| 1719 | assertTrue("isNormalizedUTF8(normalized)", nfkd_cf->isNormalizedUTF8(result, errorCode)); |
| 1720 | |
| 1721 | // Omit unchanged text. |
| 1722 | expected = u8"aa\u0308aa\u0323\u0308a\u0323\u0308\u1100\u1161\u1100\u1161\u11AA"; |
| 1723 | result.clear(); |
| 1724 | edits.reset(); |
| 1725 | nfkd_cf->normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode); |
| 1726 | assertSuccess("normalizeUTF8 omit unchanged", errorCode.get()); |
| 1727 | assertEquals("normalizeUTF8 omit unchanged", expected.data(), result.c_str()); |
| 1728 | assertTrue("normalizeUTF8 omit unchanged hasChanges", edits.hasChanges()); |
| 1729 | assertEquals("normalizeUTF8 omit unchanged numberOfChanges", 10, edits.numberOfChanges()); |
| 1730 | TestUtility::checkEditsIter(*this, u"normalizeUTF8 omit unchanged", |
| 1731 | edits.getFineIterator(), edits.getFineIterator(), |
| 1732 | expectedChanges, UPRV_LENGTHOF(expectedChanges), |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1733 | true, errorCode); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1734 | |
| 1735 | // Not testing FilteredNormalizer2: |
| 1736 | // The code there is the same for all normalization modes, and |
| 1737 | // TestComposeUTF8WithEdits() covers it well. |
| 1738 | } |
| 1739 | |
| 1740 | void |
| 1741 | BasicNormalizerTest::TestLowMappingToEmpty_D() { |
| 1742 | IcuTestErrorCode errorCode(*this, "TestLowMappingToEmpty_D"); |
| 1743 | const Normalizer2 *n2 = Normalizer2::getInstance( |
| 1744 | nullptr, "nfkc_cf", UNORM2_DECOMPOSE, errorCode); |
| 1745 | if (errorCode.errDataIfFailureAndReset("Normalizer2::getInstance() call failed")) { |
| 1746 | return; |
| 1747 | } |
| 1748 | checkLowMappingToEmpty(*n2); |
| 1749 | |
| 1750 | UnicodeString sh(u'\u00AD'); |
| 1751 | assertFalse("soft hyphen is not normalized", n2->isNormalized(sh, errorCode)); |
| 1752 | UnicodeString result = n2->normalize(sh, errorCode); |
| 1753 | assertTrue("soft hyphen normalizes to empty", result.isEmpty()); |
| 1754 | assertEquals("soft hyphen QC=No", UNORM_NO, n2->quickCheck(sh, errorCode)); |
| 1755 | assertEquals("soft hyphen spanQuickCheckYes", 0, n2->spanQuickCheckYes(sh, errorCode)); |
| 1756 | |
| 1757 | UnicodeString s(u"\u00ADÄ\u00AD\u0323"); |
| 1758 | result = n2->normalize(s, errorCode); |
| 1759 | assertEquals("normalize string with soft hyphens", u"a\u0323\u0308", result); |
| 1760 | } |
| 1761 | |
| 1762 | void |
| 1763 | BasicNormalizerTest::TestLowMappingToEmpty_FCD() { |
| 1764 | IcuTestErrorCode errorCode(*this, "TestLowMappingToEmpty_FCD"); |
| 1765 | const Normalizer2 *n2 = Normalizer2::getInstance( |
| 1766 | nullptr, "nfkc_cf", UNORM2_FCD, errorCode); |
| 1767 | if (errorCode.errDataIfFailureAndReset("Normalizer2::getInstance() call failed")) { |
| 1768 | return; |
| 1769 | } |
| 1770 | checkLowMappingToEmpty(*n2); |
| 1771 | |
| 1772 | UnicodeString sh(u'\u00AD'); |
| 1773 | assertTrue("soft hyphen is FCD", n2->isNormalized(sh, errorCode)); |
| 1774 | |
| 1775 | UnicodeString s(u"\u00ADÄ\u00AD\u0323"); |
| 1776 | UnicodeString result = n2->normalize(s, errorCode); |
| 1777 | assertEquals("normalize string with soft hyphens", u"\u00ADa\u0323\u0308", result); |
| 1778 | } |
| 1779 | |
| 1780 | void |
| 1781 | BasicNormalizerTest::checkLowMappingToEmpty(const Normalizer2 &n2) { |
| 1782 | UnicodeString mapping; |
| 1783 | assertTrue("getDecomposition(soft hyphen)", n2.getDecomposition(0xad, mapping)); |
| 1784 | assertTrue("soft hyphen maps to empty", mapping.isEmpty()); |
| 1785 | assertFalse("soft hyphen has no boundary before", n2.hasBoundaryBefore(0xad)); |
| 1786 | assertFalse("soft hyphen has no boundary after", n2.hasBoundaryAfter(0xad)); |
| 1787 | assertFalse("soft hyphen is not inert", n2.isInert(0xad)); |
| 1788 | } |
| 1789 | |
| 1790 | void |
| 1791 | BasicNormalizerTest::TestNormalizeIllFormedText() { |
| 1792 | IcuTestErrorCode errorCode(*this, "TestNormalizeIllFormedText"); |
| 1793 | const Normalizer2 *nfkc_cf = Normalizer2::getNFKCCasefoldInstance(errorCode); |
| 1794 | if(errorCode.errDataIfFailureAndReset("Normalizer2::getNFKCCasefoldInstance() call failed")) { |
| 1795 | return; |
| 1796 | } |
| 1797 | // Normalization behavior for ill-formed text is not defined. |
| 1798 | // ICU currently treats ill-formed sequences as normalization-inert |
| 1799 | // and copies them unchanged. |
| 1800 | UnicodeString src(u" A"); |
| 1801 | src.append((char16_t)0xD800).append(u"ÄA\u0308").append((char16_t)0xD900). |
| 1802 | append(u"A\u0308\u00ad\u0323").append((char16_t)0xDBFF). |
| 1803 | append(u"Ä\u0323,\u00ad").append((char16_t)0xDC00). |
| 1804 | append(u"\u1100\u1161가\u11A8가\u3133 ").append((char16_t)0xDFFF); |
| 1805 | UnicodeString expected(u" a"); |
| 1806 | expected.append((char16_t)0xD800).append(u"ää").append((char16_t)0xD900). |
| 1807 | append(u"ạ\u0308").append((char16_t)0xDBFF). |
| 1808 | append(u"ạ\u0308,").append((char16_t)0xDC00). |
| 1809 | append(u"가각갃 ").append((char16_t)0xDFFF); |
| 1810 | UnicodeString result = nfkc_cf->normalize(src, errorCode); |
| 1811 | assertSuccess("normalize", errorCode.get()); |
| 1812 | assertEquals("normalize", expected, result); |
| 1813 | |
| 1814 | std::string src8(reinterpret_cast<const char*>(u8" A")); |
| 1815 | src8.append("\x80").append(reinterpret_cast<const char*>(u8"ÄA\u0308")).append("\xC0\x80"). |
| 1816 | append(reinterpret_cast<const char*>(u8"A\u0308\u00ad\u0323")).append("\xED\xA0\x80"). |
| 1817 | append(reinterpret_cast<const char*>(u8"Ä\u0323,\u00ad")).append("\xF4\x90\x80\x80"). |
| 1818 | append(reinterpret_cast<const char*>(u8"\u1100\u1161가\u11A8가\u3133 ")).append("\xF0"); |
| 1819 | std::string expected8(reinterpret_cast<const char*>(u8" a")); |
| 1820 | expected8.append("\x80").append(reinterpret_cast<const char*>(u8"ää")).append("\xC0\x80"). |
| 1821 | append(reinterpret_cast<const char*>(u8"ạ\u0308")).append("\xED\xA0\x80"). |
| 1822 | append(reinterpret_cast<const char*>(u8"ạ\u0308,")).append("\xF4\x90\x80\x80"). |
| 1823 | append(reinterpret_cast<const char*>(u8"가각갃 ")).append("\xF0"); |
| 1824 | std::string result8; |
| 1825 | StringByteSink<std::string> sink(&result8); |
| 1826 | nfkc_cf->normalizeUTF8(0, src8, sink, nullptr, errorCode); |
| 1827 | assertSuccess("normalizeUTF8", errorCode.get()); |
| 1828 | assertEquals("normalizeUTF8", expected8.c_str(), result8.c_str()); |
| 1829 | } |
| 1830 | |
| 1831 | void |
| 1832 | BasicNormalizerTest::TestComposeJamoTBase() { |
| 1833 | // Algorithmic composition of Hangul syllables must not combine with JAMO_T_BASE = U+11A7 |
| 1834 | // which is not a conjoining Jamo Trailing consonant. |
| 1835 | IcuTestErrorCode errorCode(*this, "TestComposeJamoTBase"); |
| 1836 | const Normalizer2 *nfkc = Normalizer2::getNFKCInstance(errorCode); |
| 1837 | if(errorCode.errDataIfFailureAndReset("Normalizer2::getNFKCInstance() call failed")) { |
| 1838 | return; |
| 1839 | } |
| 1840 | UnicodeString s(u"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7"); |
| 1841 | UnicodeString expected(u"가\u11A7가\u11A7가\u11A7"); |
| 1842 | UnicodeString result = nfkc->normalize(s, errorCode); |
| 1843 | assertSuccess("normalize(LV+11A7)", errorCode.get()); |
| 1844 | assertEquals("normalize(LV+11A7)", expected, result); |
| 1845 | assertFalse("isNormalized(LV+11A7)", nfkc->isNormalized(s, errorCode)); |
| 1846 | assertTrue("isNormalized(normalized)", nfkc->isNormalized(result, errorCode)); |
| 1847 | |
| 1848 | StringPiece s8(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7"); |
| 1849 | StringPiece expected8(u8"가\u11A7가\u11A7가\u11A7"); |
| 1850 | std::string result8; |
| 1851 | StringByteSink<std::string> sink(&result8, expected8.length()); |
| 1852 | nfkc->normalizeUTF8(0, s8, sink, nullptr, errorCode); |
| 1853 | assertSuccess("normalizeUTF8(LV+11A7)", errorCode.get()); |
| 1854 | assertEquals("normalizeUTF8(LV+11A7)", expected8.data(), result8.c_str()); |
| 1855 | assertFalse("isNormalizedUTF8(LV+11A7)", nfkc->isNormalizedUTF8(s8, errorCode)); |
| 1856 | assertTrue("isNormalizedUTF8(normalized)", nfkc->isNormalizedUTF8(result8, errorCode)); |
| 1857 | } |
| 1858 | |
| 1859 | void |
| 1860 | BasicNormalizerTest::TestComposeBoundaryAfter() { |
| 1861 | IcuTestErrorCode errorCode(*this, "TestComposeBoundaryAfter"); |
| 1862 | const Normalizer2 *nfkc = Normalizer2::getNFKCInstance(errorCode); |
| 1863 | if(errorCode.errDataIfFailureAndReset("Normalizer2::getNFKCInstance() call failed")) { |
| 1864 | return; |
| 1865 | } |
| 1866 | // U+02DA and U+FB2C do not have compose-boundaries-after. |
| 1867 | UnicodeString s(u"\u02DA\u0339 \uFB2C\u05B6"); |
| 1868 | UnicodeString expected(u" \u0339\u030A \u05E9\u05B6\u05BC\u05C1"); |
| 1869 | UnicodeString result = nfkc->normalize(s, errorCode); |
| 1870 | assertSuccess("nfkc", errorCode.get()); |
| 1871 | assertEquals("nfkc", expected, result); |
| 1872 | assertFalse("U+02DA boundary-after", nfkc->hasBoundaryAfter(0x2DA)); |
| 1873 | assertFalse("U+FB2C boundary-after", nfkc->hasBoundaryAfter(0xFB2C)); |
| 1874 | } |
| 1875 | |
| 1876 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ |