// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/********************************************************************
 * COPYRIGHT:
 * Copyright (c) 2002-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************
 *
 * @author Mark E. Davis
 * @author Vladimir Weinstein
 */
| 12 | |
| 13 | #include "unicode/utypes.h" |
| 14 | |
| 15 | #if !UCONFIG_NO_NORMALIZATION |
| 16 | |
| 17 | #include "intltest.h" |
| 18 | #include "cmemory.h" |
| 19 | #include "cstring.h" |
| 20 | #include "canittst.h" |
| 21 | #include "unicode/caniter.h" |
| 22 | #include "unicode/normlzr.h" |
| 23 | #include "unicode/uchar.h" |
| 24 | #include "hash.h" |
| 25 | |
// Expands to one `case` of the runIndexedTest() switch: it always reports the
// test's name and, only when `exec` is set, logs a banner and runs the test.
// The expansion ends in `break` without a semicolon; the call site supplies it.
#define CASE(id,test)                 \
    case id:                          \
        name = #test;                 \
        if (exec) {                   \
            logln(#test "---");       \
            logln((UnicodeString)""); \
            test();                   \
        }                             \
        break
| 34 | |
| 35 | void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec, |
| 36 | const char* &name, char* /*par*/) { |
| 37 | switch (index) { |
| 38 | CASE(0, TestBasic); |
| 39 | CASE(1, TestExhaustive); |
| 40 | CASE(2, TestAPI); |
| 41 | default: name = ""; break; |
| 42 | } |
| 43 | } |
| 44 | |
/*
 * Disabled helper, kept for reference only: converts Java-style strings with
 * backslash-u Unicode escapes into UnicodeString objects.
 *
 * static UnicodeString str(const char *input)
 * {
 *     UnicodeString str(input, ""); // Invariant conversion
 *     return str.unescape();
 * }
 */
| 53 | |
| 54 | |
| 55 | CanonicalIteratorTest::CanonicalIteratorTest() : |
| 56 | nameTrans(NULL), hexTrans(NULL) |
| 57 | { |
| 58 | } |
| 59 | |
| 60 | CanonicalIteratorTest::~CanonicalIteratorTest() |
| 61 | { |
| 62 | #if !UCONFIG_NO_TRANSLITERATION |
| 63 | if(nameTrans != NULL) { |
| 64 | delete(nameTrans); |
| 65 | } |
| 66 | if(hexTrans != NULL) { |
| 67 | delete(hexTrans); |
| 68 | } |
| 69 | #endif |
| 70 | } |
| 71 | |
| 72 | void CanonicalIteratorTest::TestExhaustive() { |
| 73 | UErrorCode status = U_ZERO_ERROR; |
| 74 | CanonicalIterator it("", status); |
| 75 | if (U_FAILURE(status)) { |
| 76 | dataerrln("Error creating CanonicalIterator: %s", u_errorName(status)); |
| 77 | return; |
| 78 | } |
| 79 | UChar32 i = 0; |
| 80 | UnicodeString s; |
| 81 | // Test static and dynamic class IDs |
| 82 | if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){ |
| 83 | errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID"); |
| 84 | } |
| 85 | for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) { |
| 86 | //for (i = 0xae00; i < 0xaf00; ++i) { |
| 87 | |
| 88 | if ((i % 0x100) == 0) { |
| 89 | logln("Testing U+%06X", i); |
| 90 | } |
| 91 | |
| 92 | // skip characters we know don't have decomps |
| 93 | int8_t type = u_charType(i); |
| 94 | if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR |
| 95 | || type == U_SURROGATE) continue; |
| 96 | |
| 97 | s = i; |
| 98 | characterTest(s, i, it); |
| 99 | |
| 100 | s += (UChar32)0x0345; //"\\u0345"; |
| 101 | characterTest(s, i, it); |
| 102 | } |
| 103 | } |
| 104 | |
| 105 | void CanonicalIteratorTest::TestBasic() { |
| 106 | |
| 107 | UErrorCode status = U_ZERO_ERROR; |
| 108 | |
| 109 | static const char * const testArray[][2] = { |
| 110 | {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, " |
| 111 | "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, " |
| 112 | "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, " |
| 113 | "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"}, |
| 114 | {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"}, |
| 115 | {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"}, |
| 116 | }; |
| 117 | |
| 118 | #if 0 |
| 119 | // This is not interesting for C/C++ as the data is already built beforehand |
| 120 | // check build |
| 121 | UnicodeSet ss = CanonicalIterator.getSafeStart(); |
| 122 | logln("Safe Start: " + ss.toPattern(true)); |
| 123 | ss = CanonicalIterator.getStarts('a'); |
| 124 | expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'), |
| 125 | new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB" |
| 126 | + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]") |
| 127 | ); |
| 128 | #endif |
| 129 | |
| 130 | // check permute |
| 131 | // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted! |
| 132 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 133 | Hashtable *permutations = new Hashtable(false, status); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 134 | permutations->setValueDeleter(uprv_deleteUObject); |
| 135 | UnicodeString toPermute("ABC"); |
| 136 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 137 | CanonicalIterator::permute(toPermute, false, permutations, status); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 138 | |
| 139 | logln("testing permutation"); |
| 140 | |
| 141 | expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA"); |
| 142 | |
| 143 | delete permutations; |
| 144 | |
| 145 | // try samples |
| 146 | logln("testing samples"); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 147 | Hashtable *set = new Hashtable(false, status); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 148 | set->setValueDeleter(uprv_deleteUObject); |
| 149 | int32_t i = 0; |
| 150 | CanonicalIterator it("", status); |
| 151 | if(U_SUCCESS(status)) { |
| 152 | for (i = 0; i < UPRV_LENGTHOF(testArray); ++i) { |
| 153 | //logln("Results for: " + name.transliterate(testArray[i])); |
| 154 | UnicodeString testStr = CharsToUnicodeString(testArray[i][0]); |
| 155 | it.setSource(testStr, status); |
| 156 | set->removeAll(); |
| 157 | for (;;) { |
| 158 | //UnicodeString *result = new UnicodeString(it.next()); |
| 159 | UnicodeString result(it.next()); |
| 160 | if (result.isBogus()) { |
| 161 | break; |
| 162 | } |
| 163 | set->put(result, new UnicodeString(result), status); // Add result to the table |
| 164 | //logln(++counter + ": " + hex.transliterate(result)); |
| 165 | //logln(" = " + name.transliterate(result)); |
| 166 | } |
| 167 | expectEqual(i + UnicodeString(": "), testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1])); |
| 168 | |
| 169 | } |
| 170 | } else { |
| 171 | dataerrln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status)); |
| 172 | } |
| 173 | delete set; |
| 174 | } |
| 175 | |
| 176 | void CanonicalIteratorTest::characterTest(UnicodeString &s, UChar32 ch, CanonicalIterator &it) |
| 177 | { |
| 178 | UErrorCode status = U_ZERO_ERROR; |
| 179 | UnicodeString decomp, comp; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 180 | UBool gotDecomp = false; |
| 181 | UBool gotComp = false; |
| 182 | UBool gotSource = false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 183 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 184 | Normalizer::decompose(s, false, 0, decomp, status); |
| 185 | Normalizer::compose(s, false, 0, comp, status); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 186 | |
| 187 | // skip characters that don't have either decomp. |
| 188 | // need quick test for this! |
| 189 | if (s == decomp && s == comp) { |
| 190 | return; |
| 191 | } |
| 192 | |
| 193 | it.setSource(s, status); |
| 194 | |
| 195 | for (;;) { |
| 196 | UnicodeString item = it.next(); |
| 197 | if (item.isBogus()) break; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 198 | if (item == s) gotSource = true; |
| 199 | if (item == decomp) gotDecomp = true; |
| 200 | if (item == comp) gotComp = true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 201 | } |
| 202 | |
| 203 | if (!gotSource || !gotDecomp || !gotComp) { |
| 204 | errln("FAIL CanonicalIterator: " + s + (int)ch); |
| 205 | } |
| 206 | } |
| 207 | |
| 208 | void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) { |
| 209 | if (!(a==b)) { |
| 210 | errln("FAIL: " + message + getReadable(item)); |
| 211 | errln("\t" + getReadable(a)); |
| 212 | errln("\t" + getReadable(b)); |
| 213 | } else { |
| 214 | logln("Checked: " + message + getReadable(item)); |
| 215 | logln("\t" + getReadable(a)); |
| 216 | logln("\t" + getReadable(b)); |
| 217 | } |
| 218 | } |
| 219 | |
| 220 | UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) { |
| 221 | UErrorCode status = U_ZERO_ERROR; |
| 222 | UnicodeString result = "["; |
| 223 | if (s.length() == 0) return ""; |
| 224 | // set up for readable display |
| 225 | #if !UCONFIG_NO_TRANSLITERATION |
| 226 | if(verbose) { |
| 227 | if (nameTrans == NULL) |
| 228 | nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, status); |
| 229 | UnicodeString sName = s; |
| 230 | nameTrans->transliterate(sName); |
| 231 | result += sName; |
| 232 | result += ";"; |
| 233 | } |
| 234 | if (hexTrans == NULL) |
| 235 | hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, status); |
| 236 | #endif |
| 237 | UnicodeString sHex = s; |
| 238 | #if !UCONFIG_NO_TRANSLITERATION |
| 239 | if(hexTrans) { // maybe there is no data and transliterator cannot be instantiated |
| 240 | hexTrans->transliterate(sHex); |
| 241 | } |
| 242 | #endif |
| 243 | result += sHex; |
| 244 | result += "]"; |
| 245 | return result; |
| 246 | //return "[" + (verbose ? name->transliterate(s) + "; " : "") + hex->transliterate(s) + "]"; |
| 247 | } |
| 248 | |
| 249 | U_CFUNC int U_CALLCONV |
| 250 | compareUnicodeStrings(const void *s1, const void *s2) { |
| 251 | UnicodeString **st1 = (UnicodeString **)s1; |
| 252 | UnicodeString **st2 = (UnicodeString **)s2; |
| 253 | |
| 254 | return (*st1)->compare(**st2); |
| 255 | } |
| 256 | |
| 257 | |
| 258 | UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) { |
| 259 | UnicodeString result; |
| 260 | |
| 261 | // Iterate over the Hashtable, then qsort. |
| 262 | |
| 263 | UnicodeString **resArray = new UnicodeString*[col->count()]; |
| 264 | int32_t i = 0; |
| 265 | |
| 266 | const UHashElement *ne = NULL; |
| 267 | int32_t el = UHASH_FIRST; |
| 268 | //Iterator it = basic.iterator(); |
| 269 | ne = col->nextElement(el); |
| 270 | //while (it.hasNext()) |
| 271 | while (ne != NULL) { |
| 272 | //String item = (String) it.next(); |
| 273 | UnicodeString *item = (UnicodeString *)(ne->value.pointer); |
| 274 | resArray[i++] = item; |
| 275 | ne = col->nextElement(el); |
| 276 | } |
| 277 | |
| 278 | for(i = 0; i<col->count(); ++i) { |
| 279 | logln(*resArray[i]); |
| 280 | } |
| 281 | |
| 282 | qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings); |
| 283 | |
| 284 | result = *resArray[0]; |
| 285 | |
| 286 | for(i = 1; i<col->count(); ++i) { |
| 287 | result += ", "; |
| 288 | result += *resArray[i]; |
| 289 | } |
| 290 | |
| 291 | /* |
| 292 | Iterator it = col.iterator(); |
| 293 | while (it.hasNext()) { |
| 294 | if (result.length() != 0) result.append(", "); |
| 295 | result.append(it.next().toString()); |
| 296 | } |
| 297 | */ |
| 298 | |
| 299 | delete [] resArray; |
| 300 | |
| 301 | return result; |
| 302 | } |
| 303 | |
| 304 | void CanonicalIteratorTest::TestAPI() { |
| 305 | UErrorCode status = U_ZERO_ERROR; |
| 306 | // Test reset and getSource |
| 307 | UnicodeString start("ljubav"); |
| 308 | logln("Testing CanonicalIterator::getSource"); |
| 309 | logln("Instantiating canonical iterator with string "+start); |
| 310 | CanonicalIterator can(start, status); |
| 311 | if (U_FAILURE(status)) { |
| 312 | dataerrln("Error creating CanonicalIterator: %s", u_errorName(status)); |
| 313 | return; |
| 314 | } |
| 315 | UnicodeString source = can.getSource(); |
| 316 | logln("CanonicalIterator::getSource returned "+source); |
| 317 | if(start != source) { |
| 318 | errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source); |
| 319 | } |
| 320 | logln("Testing CanonicalIterator::reset"); |
| 321 | UnicodeString next = can.next(); |
| 322 | logln("CanonicalIterator::next returned "+next); |
| 323 | |
| 324 | can.reset(); |
| 325 | |
| 326 | UnicodeString afterReset = can.next(); |
| 327 | logln("After reset, CanonicalIterator::next returned "+afterReset); |
| 328 | |
| 329 | if(next != afterReset) { |
| 330 | errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+")."); |
| 331 | } |
| 332 | |
| 333 | logln("Testing getStaticClassID and getDynamicClassID"); |
| 334 | if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){ |
| 335 | errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID"); |
| 336 | } |
| 337 | } |
| 338 | |
| 339 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ |