// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/********************************************************************
 * Copyright (c) 1997-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
| 7 | |
| 8 | #include "unicode/ustring.h" |
| 9 | #include "unicode/uchar.h" |
| 10 | #include "unicode/ucpmap.h" |
| 11 | #include "unicode/uniset.h" |
| 12 | #include "unicode/putil.h" |
| 13 | #include "unicode/uscript.h" |
| 14 | #include "unicode/uset.h" |
| 15 | #include "cstring.h" |
| 16 | #include "hash.h" |
| 17 | #include "patternprops.h" |
| 18 | #include "normalizer2impl.h" |
| 19 | #include "testutil.h" |
| 20 | #include "uparse.h" |
| 21 | #include "ucdtest.h" |
| 22 | |
// Property names that the constructor pre-registers in unknownPropertyNames,
// so that the line parser does not report them as unknown.
// The pointers themselves are const, like the other name tables in this file.
static const char *const ignorePropNames[]={
    "FC_NFKC",
    "NFD_QC",
    "NFC_QC",
    "NFKD_QC",
    "NFKC_QC",
    "Expands_On_NFD",
    "Expands_On_NFC",
    "Expands_On_NFKD",
    "Expands_On_NFKC",
    "NFKC_CF"
};
| 35 | |
| 36 | UnicodeTest::UnicodeTest() |
| 37 | { |
| 38 | UErrorCode errorCode=U_ZERO_ERROR; |
| 39 | unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode); |
| 40 | if(U_FAILURE(errorCode)) { |
| 41 | delete unknownPropertyNames; |
| 42 | unknownPropertyNames=NULL; |
| 43 | } |
| 44 | // Ignore some property names altogether. |
| 45 | for(int32_t i=0; i<UPRV_LENGTHOF(ignorePropNames); ++i) { |
| 46 | unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode); |
| 47 | } |
| 48 | } |
| 49 | |
| 50 | UnicodeTest::~UnicodeTest() |
| 51 | { |
| 52 | delete unknownPropertyNames; |
| 53 | } |
| 54 | |
| 55 | void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) |
| 56 | { |
| 57 | if(exec) { |
| 58 | logln("TestSuite UnicodeTest: "); |
| 59 | } |
| 60 | TESTCASE_AUTO_BEGIN; |
| 61 | TESTCASE_AUTO(TestAdditionalProperties); |
| 62 | TESTCASE_AUTO(TestBinaryValues); |
| 63 | TESTCASE_AUTO(TestConsistency); |
| 64 | TESTCASE_AUTO(TestPatternProperties); |
| 65 | TESTCASE_AUTO(TestScriptMetadata); |
| 66 | TESTCASE_AUTO(TestBidiPairedBracketType); |
| 67 | TESTCASE_AUTO(TestEmojiProperties); |
| 68 | TESTCASE_AUTO(TestEmojiPropertiesOfStrings); |
| 69 | TESTCASE_AUTO(TestIndicPositionalCategory); |
| 70 | TESTCASE_AUTO(TestIndicSyllabicCategory); |
| 71 | TESTCASE_AUTO(TestVerticalOrientation); |
| 72 | TESTCASE_AUTO(TestDefaultScriptExtensions); |
| 73 | TESTCASE_AUTO(TestInvalidCodePointFolding); |
| 74 | #if !UCONFIG_NO_NORMALIZATION |
| 75 | TESTCASE_AUTO(TestBinaryCharacterProperties); |
| 76 | TESTCASE_AUTO(TestIntCharacterProperties); |
| 77 | #endif |
| 78 | TESTCASE_AUTO_END; |
| 79 | } |
| 80 | |
//====================================================
// private data used by the tests
//====================================================

// test DerivedCoreProperties.txt -------------------------------------------
| 86 | |
| 87 | // copied from genprops.c |
| 88 | static int32_t |
| 89 | getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) { |
| 90 | const char *t, *z; |
| 91 | int32_t i, j; |
| 92 | |
| 93 | s=u_skipWhitespace(s); |
| 94 | for(i=0; i<countTokens; ++i) { |
| 95 | t=tokens[i]; |
| 96 | if(t!=NULL) { |
| 97 | for(j=0;; ++j) { |
| 98 | if(t[j]!=0) { |
| 99 | if(s[j]!=t[j]) { |
| 100 | break; |
| 101 | } |
| 102 | } else { |
| 103 | z=u_skipWhitespace(s+j); |
| 104 | if(*z==';' || *z==0) { |
| 105 | return i; |
| 106 | } else { |
| 107 | break; |
| 108 | } |
| 109 | } |
| 110 | } |
| 111 | } |
| 112 | } |
| 113 | return -1; |
| 114 | } |
| 115 | |
// Names of the derived binary properties that are parsed from the UCD files.
// Entry i here corresponds to entry i of derivedPropsIndex[]
// (TestAdditionalProperties checks that both arrays have the same length).
static const char *const derivedPropsNames[]={
    "Math",
    "Alphabetic",
    "Lowercase",
    "Uppercase",
    "ID_Start",
    "ID_Continue",
    "XID_Start",
    "XID_Continue",
    "Default_Ignorable_Code_Point",
    "Full_Composition_Exclusion",
    "Grapheme_Extend",
    // Unicode 5 moves this property here from PropList.txt
    "Grapheme_Link",
    "Grapheme_Base",
    "Cased",
    "Case_Ignorable",
    "Changes_When_Lowercased",
    "Changes_When_Uppercased",
    "Changes_When_Titlecased",
    "Changes_When_Casefolded",
    "Changes_When_Casemapped",
    "Changes_When_NFKC_Casefolded"
};
| 140 | |
| 141 | static const UProperty |
| 142 | derivedPropsIndex[]={ |
| 143 | UCHAR_MATH, |
| 144 | UCHAR_ALPHABETIC, |
| 145 | UCHAR_LOWERCASE, |
| 146 | UCHAR_UPPERCASE, |
| 147 | UCHAR_ID_START, |
| 148 | UCHAR_ID_CONTINUE, |
| 149 | UCHAR_XID_START, |
| 150 | UCHAR_XID_CONTINUE, |
| 151 | UCHAR_DEFAULT_IGNORABLE_CODE_POINT, |
| 152 | UCHAR_FULL_COMPOSITION_EXCLUSION, |
| 153 | UCHAR_GRAPHEME_EXTEND, |
| 154 | UCHAR_GRAPHEME_LINK, |
| 155 | UCHAR_GRAPHEME_BASE, |
| 156 | UCHAR_CASED, |
| 157 | UCHAR_CASE_IGNORABLE, |
| 158 | UCHAR_CHANGES_WHEN_LOWERCASED, |
| 159 | UCHAR_CHANGES_WHEN_UPPERCASED, |
| 160 | UCHAR_CHANGES_WHEN_TITLECASED, |
| 161 | UCHAR_CHANGES_WHEN_CASEFOLDED, |
| 162 | UCHAR_CHANGES_WHEN_CASEMAPPED, |
| 163 | UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED |
| 164 | }; |
| 165 | |
| 166 | static int32_t numErrors[UPRV_LENGTHOF(derivedPropsIndex)]={ 0 }; |
| 167 | |
| 168 | enum { MAX_ERRORS=50 }; |
| 169 | |
| 170 | U_CFUNC void U_CALLCONV |
| 171 | derivedPropsLineFn(void *context, |
| 172 | char *fields[][2], int32_t /* fieldCount */, |
| 173 | UErrorCode *pErrorCode) |
| 174 | { |
| 175 | UnicodeTest *me=(UnicodeTest *)context; |
| 176 | uint32_t start, end; |
| 177 | int32_t i; |
| 178 | |
| 179 | u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); |
| 180 | if(U_FAILURE(*pErrorCode)) { |
| 181 | me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]); |
| 182 | return; |
| 183 | } |
| 184 | |
| 185 | /* parse derived binary property name, ignore unknown names */ |
| 186 | i=getTokenIndex(derivedPropsNames, UPRV_LENGTHOF(derivedPropsNames), fields[1][0]); |
| 187 | if(i<0) { |
| 188 | UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0])); |
| 189 | propName.trim(); |
| 190 | if(me->unknownPropertyNames->find(propName)==NULL) { |
| 191 | UErrorCode errorCode=U_ZERO_ERROR; |
| 192 | me->unknownPropertyNames->puti(propName, 1, errorCode); |
| 193 | me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]); |
| 194 | } |
| 195 | return; |
| 196 | } |
| 197 | |
| 198 | me->derivedProps[i].add(start, end); |
| 199 | } |
| 200 | |
| 201 | void UnicodeTest::TestAdditionalProperties() { |
| 202 | #if !UCONFIG_NO_NORMALIZATION |
| 203 | // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt |
| 204 | if(UPRV_LENGTHOF(derivedProps)<UPRV_LENGTHOF(derivedPropsNames)) { |
| 205 | errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n", |
| 206 | UPRV_LENGTHOF(derivedPropsNames)); |
| 207 | return; |
| 208 | } |
| 209 | if(UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)) { |
| 210 | errln("error in ucdtest.cpp: UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)\n"); |
| 211 | return; |
| 212 | } |
| 213 | |
| 214 | char path[500]; |
| 215 | if(getUnidataPath(path) == NULL) { |
| 216 | errln("unable to find path to source/data/unidata/"); |
| 217 | return; |
| 218 | } |
| 219 | char *basename=strchr(path, 0); |
| 220 | strcpy(basename, "DerivedCoreProperties.txt"); |
| 221 | |
| 222 | char *fields[2][2]; |
| 223 | UErrorCode errorCode=U_ZERO_ERROR; |
| 224 | u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode); |
| 225 | if(U_FAILURE(errorCode)) { |
| 226 | errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode)); |
| 227 | return; |
| 228 | } |
| 229 | |
| 230 | strcpy(basename, "DerivedNormalizationProps.txt"); |
| 231 | u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode); |
| 232 | if(U_FAILURE(errorCode)) { |
| 233 | errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode)); |
| 234 | return; |
| 235 | } |
| 236 | |
| 237 | // now we have all derived core properties in the UnicodeSets |
| 238 | // run them all through the API |
| 239 | int32_t rangeCount, range; |
| 240 | uint32_t i; |
| 241 | UChar32 start, end; |
| 242 | |
| 243 | // test all TRUE properties |
| 244 | for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) { |
| 245 | rangeCount=derivedProps[i].getRangeCount(); |
| 246 | for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) { |
| 247 | start=derivedProps[i].getRangeStart(range); |
| 248 | end=derivedProps[i].getRangeEnd(range); |
| 249 | for(; start<=end; ++start) { |
| 250 | if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) { |
| 251 | dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==FALSE is wrong", start, derivedPropsNames[i]); |
| 252 | if(++numErrors[i]>=MAX_ERRORS) { |
| 253 | dataerrln("Too many errors, moving to the next test"); |
| 254 | break; |
| 255 | } |
| 256 | } |
| 257 | } |
| 258 | } |
| 259 | } |
| 260 | |
| 261 | // invert all properties |
| 262 | for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) { |
| 263 | derivedProps[i].complement(); |
| 264 | } |
| 265 | |
| 266 | // test all FALSE properties |
| 267 | for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) { |
| 268 | rangeCount=derivedProps[i].getRangeCount(); |
| 269 | for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) { |
| 270 | start=derivedProps[i].getRangeStart(range); |
| 271 | end=derivedProps[i].getRangeEnd(range); |
| 272 | for(; start<=end; ++start) { |
| 273 | if(u_hasBinaryProperty(start, derivedPropsIndex[i])) { |
| 274 | errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==TRUE is wrong\n", start, derivedPropsNames[i]); |
| 275 | if(++numErrors[i]>=MAX_ERRORS) { |
| 276 | errln("Too many errors, moving to the next test"); |
| 277 | break; |
| 278 | } |
| 279 | } |
| 280 | } |
| 281 | } |
| 282 | } |
| 283 | #endif /* !UCONFIG_NO_NORMALIZATION */ |
| 284 | } |
| 285 | |
| 286 | void UnicodeTest::TestBinaryValues() { |
| 287 | /* |
| 288 | * Unicode 5.1 explicitly defines binary property value aliases. |
| 289 | * Verify that they are all recognized. |
| 290 | */ |
| 291 | UErrorCode errorCode=U_ZERO_ERROR; |
| 292 | UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode); |
| 293 | if(U_FAILURE(errorCode)) { |
| 294 | dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode)); |
| 295 | return; |
| 296 | } |
| 297 | |
| 298 | static const char *const falseValues[]={ "N", "No", "F", "False" }; |
| 299 | static const char *const trueValues[]={ "Y", "Yes", "T", "True" }; |
| 300 | int32_t i; |
| 301 | for(i=0; i<UPRV_LENGTHOF(falseValues); ++i) { |
| 302 | UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]"); |
| 303 | pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV)); |
| 304 | errorCode=U_ZERO_ERROR; |
| 305 | UnicodeSet set(pattern, errorCode); |
| 306 | if(U_FAILURE(errorCode)) { |
| 307 | errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode)); |
| 308 | continue; |
| 309 | } |
| 310 | set.complement(); |
| 311 | if(set!=alpha) { |
| 312 | errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]); |
| 313 | } |
| 314 | } |
| 315 | for(i=0; i<UPRV_LENGTHOF(trueValues); ++i) { |
| 316 | UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]"); |
| 317 | pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV)); |
| 318 | errorCode=U_ZERO_ERROR; |
| 319 | UnicodeSet set(pattern, errorCode); |
| 320 | if(U_FAILURE(errorCode)) { |
| 321 | errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode)); |
| 322 | continue; |
| 323 | } |
| 324 | if(set!=alpha) { |
| 325 | errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]); |
| 326 | } |
| 327 | } |
| 328 | } |
| 329 | |
| 330 | void UnicodeTest::TestConsistency() { |
| 331 | #if !UCONFIG_NO_NORMALIZATION |
| 332 | /* |
| 333 | * Test for an example that getCanonStartSet() delivers |
| 334 | * all characters that compose from the input one, |
| 335 | * even in multiple steps. |
| 336 | * For example, the set for "I" (0049) should contain both |
| 337 | * I-diaeresis (00CF) and I-diaeresis-acute (1E2E). |
| 338 | * In general, the set for the middle such character should be a subset |
| 339 | * of the set for the first. |
| 340 | */ |
| 341 | IcuTestErrorCode errorCode(*this, "TestConsistency"); |
| 342 | const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode); |
| 343 | const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode); |
| 344 | if(!nfcImpl->ensureCanonIterData(errorCode) || errorCode.isFailure()) { |
| 345 | dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n", |
| 346 | errorCode.errorName()); |
| 347 | errorCode.reset(); |
| 348 | return; |
| 349 | } |
| 350 | |
| 351 | UnicodeSet set1, set2; |
| 352 | if (nfcImpl->getCanonStartSet(0x49, set1)) { |
| 353 | /* enumerate all characters that are plausible to be latin letters */ |
| 354 | for(UChar start=0xa0; start<0x2000; ++start) { |
| 355 | UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode); |
| 356 | if(decomp.length()>1 && decomp[0]==0x49) { |
| 357 | set2.add(start); |
| 358 | } |
| 359 | } |
| 360 | |
| 361 | if (set1!=set2) { |
| 362 | errln("[canon start set of 0049] != [all c with canon decomp with 0049]"); |
| 363 | } |
| 364 | // This was available in cucdtst.c but the test had to move to intltest |
| 365 | // because the new internal normalization functions are in C++. |
| 366 | //compareUSets(set1, set2, |
| 367 | // "[canon start set of 0049]", "[all c with canon decomp with 0049]", |
| 368 | // TRUE); |
| 369 | } else { |
| 370 | errln("NFC.getCanonStartSet() returned FALSE"); |
| 371 | } |
| 372 | #endif |
| 373 | } |
| 374 | |
| 375 | /** |
| 376 | * Test various implementations of Pattern_Syntax & Pattern_White_Space. |
| 377 | */ |
| 378 | void UnicodeTest::TestPatternProperties() { |
| 379 | IcuTestErrorCode errorCode(*this, "TestPatternProperties()"); |
| 380 | UnicodeSet syn_pp; |
| 381 | UnicodeSet syn_prop(UNICODE_STRING_SIMPLE("[:Pattern_Syntax:]"), errorCode); |
| 382 | UnicodeSet syn_list( |
| 383 | "[!-/\\:-@\\[-\\^`\\{-~" |
| 384 | "\\u00A1-\\u00A7\\u00A9\\u00AB\\u00AC\\u00AE\\u00B0\\u00B1\\u00B6\\u00BB\\u00BF\\u00D7\\u00F7" |
| 385 | "\\u2010-\\u2027\\u2030-\\u203E\\u2041-\\u2053\\u2055-\\u205E\\u2190-\\u245F\\u2500-\\u2775" |
| 386 | "\\u2794-\\u2BFF\\u2E00-\\u2E7F\\u3001-\\u3003\\u3008-\\u3020\\u3030\\uFD3E\\uFD3F\\uFE45\\uFE46]", errorCode); |
| 387 | UnicodeSet ws_pp; |
| 388 | UnicodeSet ws_prop(UNICODE_STRING_SIMPLE("[:Pattern_White_Space:]"), errorCode); |
| 389 | UnicodeSet ws_list(UNICODE_STRING_SIMPLE("[\\u0009-\\u000D\\ \\u0085\\u200E\\u200F\\u2028\\u2029]"), errorCode); |
| 390 | UnicodeSet syn_ws_pp; |
| 391 | UnicodeSet syn_ws_prop(syn_prop); |
| 392 | syn_ws_prop.addAll(ws_prop); |
| 393 | for(UChar32 c=0; c<=0xffff; ++c) { |
| 394 | if(PatternProps::isSyntax(c)) { |
| 395 | syn_pp.add(c); |
| 396 | } |
| 397 | if(PatternProps::isWhiteSpace(c)) { |
| 398 | ws_pp.add(c); |
| 399 | } |
| 400 | if(PatternProps::isSyntaxOrWhiteSpace(c)) { |
| 401 | syn_ws_pp.add(c); |
| 402 | } |
| 403 | } |
| 404 | compareUSets(syn_pp, syn_prop, |
| 405 | "PatternProps.isSyntax()", "[:Pattern_Syntax:]", TRUE); |
| 406 | compareUSets(syn_pp, syn_list, |
| 407 | "PatternProps.isSyntax()", "[Pattern_Syntax ranges]", TRUE); |
| 408 | compareUSets(ws_pp, ws_prop, |
| 409 | "PatternProps.isWhiteSpace()", "[:Pattern_White_Space:]", TRUE); |
| 410 | compareUSets(ws_pp, ws_list, |
| 411 | "PatternProps.isWhiteSpace()", "[Pattern_White_Space ranges]", TRUE); |
| 412 | compareUSets(syn_ws_pp, syn_ws_prop, |
| 413 | "PatternProps.isSyntaxOrWhiteSpace()", |
| 414 | "[[:Pattern_Syntax:][:Pattern_White_Space:]]", TRUE); |
| 415 | } |
| 416 | |
| 417 | // So far only minimal port of Java & cucdtst.c compareUSets(). |
| 418 | UBool |
| 419 | UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b, |
| 420 | const char *a_name, const char *b_name, |
| 421 | UBool diffIsError) { |
| 422 | UBool same= a==b; |
| 423 | if(!same && diffIsError) { |
| 424 | errln("Sets are different: %s vs. %s\n", a_name, b_name); |
| 425 | } |
| 426 | return same; |
| 427 | } |
| 428 | |
| 429 | namespace { |
| 430 | |
| 431 | /** |
| 432 | * Maps a special script code to the most common script of its encoded characters. |
| 433 | */ |
| 434 | UScriptCode getCharScript(UScriptCode script) { |
| 435 | switch(script) { |
| 436 | case USCRIPT_HAN_WITH_BOPOMOFO: |
| 437 | case USCRIPT_SIMPLIFIED_HAN: |
| 438 | case USCRIPT_TRADITIONAL_HAN: |
| 439 | return USCRIPT_HAN; |
| 440 | case USCRIPT_JAPANESE: |
| 441 | return USCRIPT_HIRAGANA; |
| 442 | case USCRIPT_JAMO: |
| 443 | case USCRIPT_KOREAN: |
| 444 | return USCRIPT_HANGUL; |
| 445 | case USCRIPT_SYMBOLS_EMOJI: |
| 446 | return USCRIPT_SYMBOLS; |
| 447 | default: |
| 448 | return script; |
| 449 | } |
| 450 | } |
| 451 | |
| 452 | } // namespace |
| 453 | |
| 454 | void UnicodeTest::TestScriptMetadata() { |
| 455 | IcuTestErrorCode errorCode(*this, "TestScriptMetadata()"); |
| 456 | UnicodeSet rtl("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]", errorCode); |
| 457 | // So far, sample characters are uppercase. |
| 458 | // Georgian is special. |
| 459 | UnicodeSet cased("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]", errorCode); |
| 460 | for(int32_t sci = 0; sci < USCRIPT_CODE_LIMIT; ++sci) { |
| 461 | UScriptCode sc = (UScriptCode)sci; |
| 462 | // Run the test with -v to see which script has failures: |
| 463 | // .../intltest$ make && ./intltest utility/UnicodeTest/TestScriptMetadata -v | grep -C 6 FAIL |
| 464 | logln(uscript_getShortName(sc)); |
| 465 | UScriptUsage usage = uscript_getUsage(sc); |
| 466 | UnicodeString sample = uscript_getSampleUnicodeString(sc); |
| 467 | UnicodeSet scriptSet; |
| 468 | scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, sc, errorCode); |
| 469 | if(usage == USCRIPT_USAGE_NOT_ENCODED) { |
| 470 | assertTrue("not encoded, no sample", sample.isEmpty()); |
| 471 | assertFalse("not encoded, not RTL", uscript_isRightToLeft(sc)); |
| 472 | assertFalse("not encoded, not LB letters", uscript_breaksBetweenLetters(sc)); |
| 473 | assertFalse("not encoded, not cased", uscript_isCased(sc)); |
| 474 | assertTrue("not encoded, no characters", scriptSet.isEmpty()); |
| 475 | } else { |
| 476 | assertFalse("encoded, has a sample character", sample.isEmpty()); |
| 477 | UChar32 firstChar = sample.char32At(0); |
| 478 | UScriptCode charScript = getCharScript(sc); |
| 479 | assertEquals("script(sample(script))", |
| 480 | (int32_t)charScript, (int32_t)uscript_getScript(firstChar, errorCode)); |
| 481 | assertEquals("RTL vs. set", (UBool)rtl.contains(firstChar), (UBool)uscript_isRightToLeft(sc)); |
| 482 | assertEquals("cased vs. set", (UBool)cased.contains(firstChar), (UBool)uscript_isCased(sc)); |
| 483 | assertEquals("encoded, has characters", (UBool)(sc == charScript), (UBool)(!scriptSet.isEmpty())); |
| 484 | if(uscript_isRightToLeft(sc)) { |
| 485 | rtl.removeAll(scriptSet); |
| 486 | } |
| 487 | if(uscript_isCased(sc)) { |
| 488 | cased.removeAll(scriptSet); |
| 489 | } |
| 490 | } |
| 491 | } |
| 492 | UnicodeString pattern; |
| 493 | assertEquals("no remaining RTL characters", |
| 494 | UnicodeString("[]"), rtl.toPattern(pattern)); |
| 495 | assertEquals("no remaining cased characters", |
| 496 | UnicodeString("[]"), cased.toPattern(pattern)); |
| 497 | |
| 498 | assertTrue("Hani breaks between letters", uscript_breaksBetweenLetters(USCRIPT_HAN)); |
| 499 | assertTrue("Thai breaks between letters", uscript_breaksBetweenLetters(USCRIPT_THAI)); |
| 500 | assertFalse("Latn does not break between letters", uscript_breaksBetweenLetters(USCRIPT_LATIN)); |
| 501 | } |
| 502 | |
| 503 | void UnicodeTest::TestBidiPairedBracketType() { |
| 504 | // BidiBrackets-6.3.0.txt says: |
| 505 | // |
| 506 | // The set of code points listed in this file was originally derived |
| 507 | // using the character properties General_Category (gc), Bidi_Class (bc), |
| 508 | // Bidi_Mirrored (Bidi_M), and Bidi_Mirroring_Glyph (bmg), as follows: |
| 509 | // two characters, A and B, form a pair if A has gc=Ps and B has gc=Pe, |
| 510 | // both have bc=ON and Bidi_M=Y, and bmg of A is B. Bidi_Paired_Bracket |
| 511 | // maps A to B and vice versa, and their Bidi_Paired_Bracket_Type |
| 512 | // property values are Open and Close, respectively. |
| 513 | IcuTestErrorCode errorCode(*this, "TestBidiPairedBracketType()"); |
| 514 | UnicodeSet bpt("[:^bpt=n:]", errorCode); |
| 515 | assertTrue("bpt!=None is not empty", !bpt.isEmpty()); |
| 516 | // The following should always be true. |
| 517 | UnicodeSet mirrored("[:Bidi_M:]", errorCode); |
| 518 | UnicodeSet other_neutral("[:bc=ON:]", errorCode); |
| 519 | assertTrue("bpt!=None is a subset of Bidi_M", mirrored.containsAll(bpt)); |
| 520 | assertTrue("bpt!=None is a subset of bc=ON", other_neutral.containsAll(bpt)); |
| 521 | // The following are true at least initially in Unicode 6.3. |
| 522 | UnicodeSet bpt_open("[:bpt=o:]", errorCode); |
| 523 | UnicodeSet bpt_close("[:bpt=c:]", errorCode); |
| 524 | UnicodeSet ps("[:Ps:]", errorCode); |
| 525 | UnicodeSet pe("[:Pe:]", errorCode); |
| 526 | assertTrue("bpt=Open is a subset of Ps", ps.containsAll(bpt_open)); |
| 527 | assertTrue("bpt=Close is a subset of Pe", pe.containsAll(bpt_close)); |
| 528 | } |
| 529 | |
| 530 | void UnicodeTest::TestEmojiProperties() { |
| 531 | assertFalse("space is not Emoji", u_hasBinaryProperty(0x20, UCHAR_EMOJI)); |
| 532 | assertTrue("shooting star is Emoji", u_hasBinaryProperty(0x1F320, UCHAR_EMOJI)); |
| 533 | IcuTestErrorCode errorCode(*this, "TestEmojiProperties()"); |
| 534 | UnicodeSet emoji("[:Emoji:]", errorCode); |
| 535 | assertTrue("lots of Emoji", emoji.size() > 700); |
| 536 | |
| 537 | assertTrue("shooting star is Emoji_Presentation", |
| 538 | u_hasBinaryProperty(0x1F320, UCHAR_EMOJI_PRESENTATION)); |
| 539 | assertTrue("Fitzpatrick 6 is Emoji_Modifier", |
| 540 | u_hasBinaryProperty(0x1F3FF, UCHAR_EMOJI_MODIFIER)); |
| 541 | assertTrue("happy person is Emoji_Modifier_Base", |
| 542 | u_hasBinaryProperty(0x1F64B, UCHAR_EMOJI_MODIFIER_BASE)); |
| 543 | assertTrue("asterisk is Emoji_Component", |
| 544 | u_hasBinaryProperty(0x2A, UCHAR_EMOJI_COMPONENT)); |
| 545 | assertTrue("copyright is Extended_Pictographic", |
| 546 | u_hasBinaryProperty(0xA9, UCHAR_EXTENDED_PICTOGRAPHIC)); |
| 547 | } |
| 548 | |
| 549 | namespace { |
| 550 | |
| 551 | UBool hbp(const UChar *s, int32_t length, UProperty which) { |
| 552 | return u_stringHasBinaryProperty(s, length, which); |
| 553 | } |
| 554 | |
| 555 | UBool hbp(const UChar *s, UProperty which) { |
| 556 | return u_stringHasBinaryProperty(s, -1, which); |
| 557 | } |
| 558 | |
| 559 | } // namespace |
| 560 | |
| 561 | void UnicodeTest::TestEmojiPropertiesOfStrings() { |
| 562 | // Property of code points, for coverage |
| 563 | assertFalse("null is not Ideographic", hbp(nullptr, 1, UCHAR_IDEOGRAPHIC)); |
| 564 | assertFalse("null/0 is not Ideographic", hbp(nullptr, -1, UCHAR_IDEOGRAPHIC)); |
| 565 | assertFalse("empty string is not Ideographic", hbp(u"", 0, UCHAR_IDEOGRAPHIC)); |
| 566 | assertFalse("empty string/0 is not Ideographic", hbp(u"", -1, UCHAR_IDEOGRAPHIC)); |
| 567 | assertFalse("L is not Ideographic", hbp(u"L", 1, UCHAR_IDEOGRAPHIC)); |
| 568 | assertFalse("L/0 is not Ideographic", hbp(u"L", -1, UCHAR_IDEOGRAPHIC)); |
| 569 | assertTrue("U+4E02 is Ideographic", hbp(u"丂", 1, UCHAR_IDEOGRAPHIC)); |
| 570 | assertTrue("U+4E02/0 is Ideographic", hbp(u"丂", -1, UCHAR_IDEOGRAPHIC)); |
| 571 | assertFalse("2*U+4E02 is not Ideographic", hbp(u"丂丂", 2, UCHAR_IDEOGRAPHIC)); |
| 572 | assertFalse("2*U+4E02/0 is not Ideographic", hbp(u"丂丂", -1, UCHAR_IDEOGRAPHIC)); |
| 573 | assertFalse("bicycle is not Ideographic", hbp(u"🚲", 2, UCHAR_IDEOGRAPHIC)); |
| 574 | assertFalse("bicycle/0 is not Ideographic", hbp(u"🚲", -1, UCHAR_IDEOGRAPHIC)); |
| 575 | assertTrue("U+23456 is Ideographic", hbp(u"\U00023456", 2, UCHAR_IDEOGRAPHIC)); |
| 576 | assertTrue("U+23456/0 is Ideographic", hbp(u"\U00023456", -1, UCHAR_IDEOGRAPHIC)); |
| 577 | |
| 578 | // Property of (code points and) strings |
| 579 | assertFalse("null is not Basic_Emoji", hbp(nullptr, 1, UCHAR_BASIC_EMOJI)); |
| 580 | assertFalse("null/0 is not Basic_Emoji", hbp(nullptr, -1, UCHAR_BASIC_EMOJI)); |
| 581 | assertFalse("empty string is not Basic_Emoji", hbp(u"", 0, UCHAR_BASIC_EMOJI)); |
| 582 | assertFalse("empty string/0 is not Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI)); |
| 583 | assertFalse("L is not Basic_Emoji", hbp(u"L", 1, UCHAR_BASIC_EMOJI)); |
| 584 | assertFalse("L/0 is not Basic_Emoji", hbp(u"L", -1, UCHAR_BASIC_EMOJI)); |
| 585 | assertFalse("U+4E02 is not Basic_Emoji", hbp(u"丂", 1, UCHAR_BASIC_EMOJI)); |
| 586 | assertFalse("U+4E02/0 is not Basic_Emoji", hbp(u"丂", -1, UCHAR_BASIC_EMOJI)); |
| 587 | assertTrue("bicycle is Basic_Emoji", hbp(u"🚲", 2, UCHAR_BASIC_EMOJI)); |
| 588 | assertTrue("bicycle/0 is Basic_Emoji", hbp(u"🚲", -1, UCHAR_BASIC_EMOJI)); |
| 589 | assertFalse("2*bicycle is Basic_Emoji", hbp(u"🚲🚲", 4, UCHAR_BASIC_EMOJI)); |
| 590 | assertFalse("2*bicycle/0 is Basic_Emoji", hbp(u"🚲🚲", -1, UCHAR_BASIC_EMOJI)); |
| 591 | assertFalse("U+23456 is not Basic_Emoji", hbp(u"\U00023456", 2, UCHAR_BASIC_EMOJI)); |
| 592 | assertFalse("U+23456/0 is not Basic_Emoji", hbp(u"\U00023456", -1, UCHAR_BASIC_EMOJI)); |
| 593 | |
| 594 | assertFalse("stopwatch is not Basic_Emoji", hbp(u"⏱", 1, UCHAR_BASIC_EMOJI)); |
| 595 | assertFalse("stopwatch/0 is not Basic_Emoji", hbp(u"⏱", -1, UCHAR_BASIC_EMOJI)); |
| 596 | assertTrue("stopwatch+emoji is Basic_Emoji", hbp(u"⏱\uFE0F", 2, UCHAR_BASIC_EMOJI)); |
| 597 | assertTrue("stopwatch+emoji/0 is Basic_Emoji", hbp(u"⏱\uFE0F", -1, UCHAR_BASIC_EMOJI)); |
| 598 | |
| 599 | assertFalse("chipmunk is not Basic_Emoji", hbp(u"🐿", UCHAR_BASIC_EMOJI)); |
| 600 | assertTrue("chipmunk+emoji is Basic_Emoji", hbp(u"🐿\uFE0F", UCHAR_BASIC_EMOJI)); |
| 601 | assertFalse("chipmunk+2*emoji is not Basic_Emoji", hbp(u"🐿\uFE0F\uFE0F", UCHAR_BASIC_EMOJI)); |
| 602 | |
| 603 | // Properties of strings (only) |
| 604 | assertFalse("4+emoji is not Emoji_Keycap_Sequence", |
| 605 | hbp(u"4\uFE0F", UCHAR_EMOJI_KEYCAP_SEQUENCE)); |
| 606 | assertTrue("4+emoji+keycap is Emoji_Keycap_Sequence", |
| 607 | hbp(u"4\uFE0F\u20E3", UCHAR_EMOJI_KEYCAP_SEQUENCE)); |
| 608 | |
| 609 | assertFalse("[B] is not RGI_Emoji_Flag_Sequence", |
| 610 | hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI_FLAG_SEQUENCE)); |
| 611 | assertTrue("[BE] is RGI_Emoji_Flag_Sequence", |
| 612 | hbp(u"🇧🇪", UCHAR_RGI_EMOJI_FLAG_SEQUENCE)); |
| 613 | |
| 614 | assertFalse("[flag] is not RGI_Emoji_Tag_Sequence", |
| 615 | hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI_TAG_SEQUENCE)); |
| 616 | assertTrue("[Scotland] is RGI_Emoji_Tag_Sequence", |
| 617 | hbp(u"🏴", UCHAR_RGI_EMOJI_TAG_SEQUENCE)); |
| 618 | |
| 619 | assertFalse("bicyclist is not RGI_Emoji_Modifier_Sequence", |
| 620 | hbp(u"🚴", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE)); |
| 621 | assertTrue("bicyclist+medium is RGI_Emoji_Modifier_Sequence", |
| 622 | hbp(u"🚴\U0001F3FD", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE)); |
| 623 | |
| 624 | assertFalse("woman+dark+ZWJ is not RGI_Emoji_ZWJ_Sequence", |
| 625 | hbp(u"👩\U0001F3FF\u200D", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE)); |
| 626 | assertTrue("woman pilot: dark skin tone is RGI_Emoji_ZWJ_Sequence", |
| 627 | hbp(u"👩\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE)); |
| 628 | |
| 629 | // RGI_Emoji = all of the above |
| 630 | assertFalse("stopwatch is not RGI_Emoji", hbp(u"⏱", UCHAR_RGI_EMOJI)); |
| 631 | assertTrue("stopwatch+emoji is RGI_Emoji", hbp(u"⏱\uFE0F", UCHAR_RGI_EMOJI)); |
| 632 | |
| 633 | assertFalse("chipmunk is not RGI_Emoji", hbp(u"🐿", UCHAR_RGI_EMOJI)); |
| 634 | assertTrue("chipmunk+emoji is RGI_Emoji", hbp(u"🐿\uFE0F", UCHAR_RGI_EMOJI)); |
| 635 | |
| 636 | assertFalse("4+emoji is not RGI_Emoji", hbp(u"4\uFE0F", UCHAR_RGI_EMOJI)); |
| 637 | assertTrue("4+emoji+keycap is RGI_Emoji", hbp(u"4\uFE0F\u20E3", UCHAR_RGI_EMOJI)); |
| 638 | |
| 639 | assertFalse("[B] is not RGI_Emoji", hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI)); |
| 640 | assertTrue("[BE] is RGI_Emoji", hbp(u"🇧🇪", UCHAR_RGI_EMOJI)); |
| 641 | |
| 642 | assertTrue("[flag] is RGI_Emoji", hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI)); |
| 643 | assertTrue("[Scotland] is RGI_Emoji", hbp(u"🏴", UCHAR_RGI_EMOJI)); |
| 644 | |
| 645 | assertTrue("bicyclist is RGI_Emoji", hbp(u"🚴", UCHAR_RGI_EMOJI)); |
| 646 | assertTrue("bicyclist+medium is RGI_Emoji", hbp(u"🚴\U0001F3FD", UCHAR_RGI_EMOJI)); |
| 647 | |
| 648 | assertFalse("woman+dark+ZWJ is not RGI_Emoji", hbp(u"👩\U0001F3FF\u200D", UCHAR_RGI_EMOJI)); |
| 649 | assertTrue("woman pilot: dark skin tone is RGI_Emoji", |
| 650 | hbp(u"👩\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI)); |
| 651 | |
| 652 | // UnicodeSet with properties of strings |
| 653 | IcuTestErrorCode errorCode(*this, "TestEmojiPropertiesOfStrings()"); |
| 654 | UnicodeSet basic("[:Basic_Emoji:]", errorCode); |
| 655 | UnicodeSet keycaps("[:Emoji_Keycap_Sequence:]", errorCode); |
| 656 | UnicodeSet modified("[:RGI_Emoji_Modifier_Sequence:]", errorCode); |
| 657 | UnicodeSet flags("[:RGI_Emoji_Flag_Sequence:]", errorCode); |
| 658 | UnicodeSet tags("[:RGI_Emoji_Tag_Sequence:]", errorCode); |
| 659 | UnicodeSet combos("[:RGI_Emoji_ZWJ_Sequence:]", errorCode); |
| 660 | UnicodeSet rgi("[:RGI_Emoji:]", errorCode); |
| 661 | if (errorCode.errDataIfFailureAndReset("UnicodeSets")) { |
| 662 | return; |
| 663 | } |
| 664 | |
| 665 | // union of all sets except for "rgi" -- should be the same as "rgi" |
| 666 | UnicodeSet all(basic); |
| 667 | all.addAll(keycaps).addAll(modified).addAll(flags).addAll(tags).addAll(combos); |
| 668 | |
| 669 | UnicodeSet basicOnlyCp(basic); |
| 670 | basicOnlyCp.removeAllStrings(); |
| 671 | |
| 672 | UnicodeSet rgiOnlyCp(rgi); |
| 673 | rgiOnlyCp.removeAllStrings(); |
| 674 | |
| 675 | assertTrue("lots of Basic_Emoji", basic.size() > 1000); |
| 676 | assertEquals("12 Emoji_Keycap_Sequence", 12, keycaps.size()); |
| 677 | assertTrue("lots of RGI_Emoji_Modifier_Sequence", modified.size() > 600); |
| 678 | assertTrue("lots of RGI_Emoji_Flag_Sequence", flags.size() > 250); |
| 679 | assertTrue("some RGI_Emoji_Tag_Sequence", tags.size() >= 3); |
| 680 | assertTrue("lots of RGI_Emoji_ZWJ_Sequence", combos.size() > 1300); |
| 681 | assertTrue("lots of RGI_Emoji", rgi.size() > 3000); |
| 682 | |
| 683 | assertTrue("lots of Basic_Emoji code points", basicOnlyCp.size() > 1000); |
| 684 | assertTrue("Basic_Emoji.hasStrings()", basic.hasStrings()); |
| 685 | assertEquals("no Emoji_Keycap_Sequence code points", 0, keycaps.getRangeCount()); |
| 686 | assertEquals("lots of RGI_Emoji_Modifier_Sequence", 0, modified.getRangeCount()); |
| 687 | assertEquals("lots of RGI_Emoji_Flag_Sequence", 0, flags.getRangeCount()); |
| 688 | assertEquals("some RGI_Emoji_Tag_Sequence", 0, tags.getRangeCount()); |
| 689 | assertEquals("lots of RGI_Emoji_ZWJ_Sequence", 0, combos.getRangeCount()); |
| 690 | |
| 691 | assertTrue("lots of RGI_Emoji code points", rgiOnlyCp.size() > 1000); |
| 692 | assertTrue("RGI_Emoji.hasStrings()", rgi.hasStrings()); |
| 693 | assertEquals("RGI_Emoji/only-cp.size() == Basic_Emoji/only-cp.size()", |
| 694 | rgiOnlyCp.size(), basicOnlyCp.size()); |
| 695 | assertTrue("RGI_Emoji/only-cp == Basic_Emoji/only-cp", rgiOnlyCp == basicOnlyCp); |
| 696 | assertEquals("RGI_Emoji.size() == union.size()", rgi.size(), all.size()); |
| 697 | assertTrue("RGI_Emoji == union", rgi == all); |
| 698 | |
| 699 | assertTrue("Basic_Emoji.contains(stopwatch+emoji)", basic.contains(u"⏱\uFE0F")); |
| 700 | assertTrue("Basic_Emoji.contains(chipmunk+emoji)", basic.contains(u"🐿\uFE0F")); |
| 701 | assertTrue("Emoji_Keycap_Sequence.contains(4+emoji+keycap)", |
| 702 | keycaps.contains(u"4\uFE0F\u20E3")); |
| 703 | assertTrue("RGI_Emoji_Flag_Sequence.contains([BE])", flags.contains(u"🇧🇪")); |
| 704 | assertTrue("RGI_Emoji_Tag_Sequence.contains([Scotland])", tags.contains(u"🏴")); |
| 705 | assertTrue("RGI_Emoji_Modifier_Sequence.contains(bicyclist+medium)", |
| 706 | modified.contains(u"🚴\U0001F3FD")); |
| 707 | assertTrue("RGI_Emoji_ZWJ_Sequence.contains(woman pilot: dark skin tone)", |
| 708 | combos.contains(u"👩\U0001F3FF\u200D✈\uFE0F")); |
| 709 | assertTrue("RGI_Emoji.contains(stopwatch+emoji)", rgi.contains(u"⏱\uFE0F")); |
| 710 | assertTrue("RGI_Emoji.contains(chipmunk+emoji)", rgi.contains(u"🐿\uFE0F")); |
| 711 | assertTrue("RGI_Emoji.contains(4+emoji+keycap)", rgi.contains(u"4\uFE0F\u20E3")); |
| 712 | assertTrue("RGI_Emoji.contains([BE] is RGI_Emoji)", rgi.contains(u"🇧🇪")); |
| 713 | assertTrue("RGI_Emoji.contains([flag])", rgi.contains(u"\U0001F3F4")); |
| 714 | assertTrue("RGI_Emoji.contains([Scotland])", rgi.contains(u"🏴")); |
| 715 | assertTrue("RGI_Emoji.contains(bicyclist)", rgi.contains(u"🚴")); |
| 716 | assertTrue("RGI_Emoji.contains(bicyclist+medium)", rgi.contains(u"🚴\U0001F3FD")); |
| 717 | assertTrue("RGI_Emoji.contains(woman pilot: dark skin tone)", rgi.contains(u"👩\U0001F3FF\u200D✈\uFE0F")); |
| 718 | } |
| 719 | |
| 720 | void UnicodeTest::TestIndicPositionalCategory() { |
| 721 | IcuTestErrorCode errorCode(*this, "TestIndicPositionalCategory()"); |
| 722 | UnicodeSet na(u"[:InPC=NA:]", errorCode); |
| 723 | assertTrue("mostly NA", 1000000 <= na.size() && na.size() <= UCHAR_MAX_VALUE - 500); |
| 724 | UnicodeSet vol(u"[:InPC=Visual_Order_Left:]", errorCode); |
| 725 | assertTrue("some Visual_Order_Left", 19 <= vol.size() && vol.size() <= 100); |
| 726 | assertEquals("U+08FF: NA", U_INPC_NA, |
| 727 | u_getIntPropertyValue(0x08FF, UCHAR_INDIC_POSITIONAL_CATEGORY)); |
| 728 | assertEquals("U+0900: Top", U_INPC_TOP, |
| 729 | u_getIntPropertyValue(0x0900, UCHAR_INDIC_POSITIONAL_CATEGORY)); |
| 730 | assertEquals("U+10A06: Overstruck", U_INPC_OVERSTRUCK, |
| 731 | u_getIntPropertyValue(0x10A06, UCHAR_INDIC_POSITIONAL_CATEGORY)); |
| 732 | } |
| 733 | |
| 734 | void UnicodeTest::TestIndicSyllabicCategory() { |
| 735 | IcuTestErrorCode errorCode(*this, "TestIndicSyllabicCategory()"); |
| 736 | UnicodeSet other(u"[:InSC=Other:]", errorCode); |
| 737 | assertTrue("mostly Other", 1000000 <= other.size() && other.size() <= UCHAR_MAX_VALUE - 500); |
| 738 | UnicodeSet ava(u"[:InSC=Avagraha:]", errorCode); |
| 739 | assertTrue("some Avagraha", 16 <= ava.size() && ava.size() <= 100); |
| 740 | assertEquals("U+08FF: Other", U_INSC_OTHER, |
| 741 | u_getIntPropertyValue(0x08FF, UCHAR_INDIC_SYLLABIC_CATEGORY)); |
| 742 | assertEquals("U+0900: Bindu", U_INSC_BINDU, |
| 743 | u_getIntPropertyValue(0x0900, UCHAR_INDIC_SYLLABIC_CATEGORY)); |
| 744 | assertEquals("U+11065: Brahmi_Joining_Number", U_INSC_BRAHMI_JOINING_NUMBER, |
| 745 | u_getIntPropertyValue(0x11065, UCHAR_INDIC_SYLLABIC_CATEGORY)); |
| 746 | } |
| 747 | |
| 748 | void UnicodeTest::TestVerticalOrientation() { |
| 749 | IcuTestErrorCode errorCode(*this, "TestVerticalOrientation()"); |
| 750 | UnicodeSet r(u"[:vo=R:]", errorCode); |
| 751 | assertTrue("mostly R", 0xc0000 <= r.size() && r.size() <= 0xd0000); |
| 752 | UnicodeSet u(u"[:vo=U:]", errorCode); |
| 753 | assertTrue("much U", 0x40000 <= u.size() && u.size() <= 0x50000); |
| 754 | UnicodeSet tu(u"[:vo=Tu:]", errorCode); |
| 755 | assertTrue("some Tu", 147 <= tu.size() && tu.size() <= 300); |
| 756 | assertEquals("U+0E01: Rotated", U_VO_ROTATED, |
| 757 | u_getIntPropertyValue(0x0E01, UCHAR_VERTICAL_ORIENTATION)); |
| 758 | assertEquals("U+3008: Transformed_Rotated", U_VO_TRANSFORMED_ROTATED, |
| 759 | u_getIntPropertyValue(0x3008, UCHAR_VERTICAL_ORIENTATION)); |
| 760 | assertEquals("U+33333: Upright", U_VO_UPRIGHT, |
| 761 | u_getIntPropertyValue(0x33333, UCHAR_VERTICAL_ORIENTATION)); |
| 762 | } |
| 763 | |
| 764 | void UnicodeTest::TestDefaultScriptExtensions() { |
| 765 | // Block 3000..303F CJK Symbols and Punctuation defaults to scx=Bopo Hang Hani Hira Kana Yiii |
| 766 | // but some of its characters revert to scx=<script> which is usually Common. |
| 767 | IcuTestErrorCode errorCode(*this, "TestDefaultScriptExtensions()"); |
| 768 | UScriptCode scx[20]; |
| 769 | scx[0] = USCRIPT_INVALID_CODE; |
| 770 | assertEquals("U+3000 num scx", 1, // IDEOGRAPHIC SPACE |
| 771 | uscript_getScriptExtensions(0x3000, scx, UPRV_LENGTHOF(scx), errorCode)); |
| 772 | assertEquals("U+3000 num scx[0]", USCRIPT_COMMON, scx[0]); |
| 773 | scx[0] = USCRIPT_INVALID_CODE; |
| 774 | assertEquals("U+3012 num scx", 1, // POSTAL MARK |
| 775 | uscript_getScriptExtensions(0x3012, scx, UPRV_LENGTHOF(scx), errorCode)); |
| 776 | assertEquals("U+3012 num scx[0]", USCRIPT_COMMON, scx[0]); |
| 777 | } |
| 778 | |
| 779 | void UnicodeTest::TestInvalidCodePointFolding(void) { |
| 780 | // Test behavior when an invalid code point is passed to u_foldCase |
| 781 | static const UChar32 invalidCodePoints[] = { |
| 782 | 0xD800, // lead surrogate |
| 783 | 0xDFFF, // trail surrogate |
| 784 | 0xFDD0, // noncharacter |
| 785 | 0xFFFF, // noncharacter |
| 786 | 0x110000, // out of range |
| 787 | -1 // negative |
| 788 | }; |
| 789 | for (int32_t i=0; i<UPRV_LENGTHOF(invalidCodePoints); ++i) { |
| 790 | UChar32 cp = invalidCodePoints[i]; |
| 791 | assertEquals("Invalid code points should be echoed back", |
| 792 | cp, u_foldCase(cp, U_FOLD_CASE_DEFAULT)); |
| 793 | assertEquals("Invalid code points should be echoed back", |
| 794 | cp, u_foldCase(cp, U_FOLD_CASE_EXCLUDE_SPECIAL_I)); |
| 795 | } |
| 796 | } |
| 797 | |
| 798 | void UnicodeTest::TestBinaryCharacterProperties() { |
| 799 | #if !UCONFIG_NO_NORMALIZATION |
| 800 | IcuTestErrorCode errorCode(*this, "TestBinaryCharacterProperties()"); |
| 801 | // Spot-check getBinaryPropertySet() vs. hasBinaryProperty(). |
| 802 | for (int32_t prop = 0; prop < UCHAR_BINARY_LIMIT; ++prop) { |
| 803 | const USet *uset = u_getBinaryPropertySet((UProperty)prop, errorCode); |
| 804 | if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(%d)", (int)prop)) { |
| 805 | continue; |
| 806 | } |
| 807 | const UnicodeSet &set = *UnicodeSet::fromUSet(uset); |
| 808 | int32_t count = set.getRangeCount(); |
| 809 | if (count == 0) { |
| 810 | assertFalse(UnicodeString("!hasBinaryProperty(U+0020, ") + prop + u")", |
| 811 | u_hasBinaryProperty(0x20, (UProperty)prop)); |
| 812 | assertFalse(UnicodeString("!hasBinaryProperty(U+0061, ") + prop + u")", |
| 813 | u_hasBinaryProperty(0x61, (UProperty)prop)); |
| 814 | assertFalse(UnicodeString("!hasBinaryProperty(U+4E00, ") + prop + u")", |
| 815 | u_hasBinaryProperty(0x4e00, (UProperty)prop)); |
| 816 | } else { |
| 817 | UChar32 c = set.getRangeStart(0); |
| 818 | if (c > 0) { |
| 819 | assertFalse( |
| 820 | UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c - 1) + |
| 821 | u", " + prop + u")", |
| 822 | u_hasBinaryProperty(c - 1, (UProperty)prop)); |
| 823 | } |
| 824 | assertTrue( |
| 825 | UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) + |
| 826 | u", " + prop + u")", |
| 827 | u_hasBinaryProperty(c, (UProperty)prop)); |
| 828 | c = set.getRangeEnd(count - 1); |
| 829 | assertTrue( |
| 830 | UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) + |
| 831 | u", " + prop + u")", |
| 832 | u_hasBinaryProperty(c, (UProperty)prop)); |
| 833 | if (c < 0x10ffff) { |
| 834 | assertFalse( |
| 835 | UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c + 1) + |
| 836 | u", " + prop + u")", |
| 837 | u_hasBinaryProperty(c + 1, (UProperty)prop)); |
| 838 | } |
| 839 | } |
| 840 | } |
| 841 | #endif |
| 842 | } |
| 843 | |
| 844 | void UnicodeTest::TestIntCharacterProperties() { |
| 845 | #if !UCONFIG_NO_NORMALIZATION |
| 846 | IcuTestErrorCode errorCode(*this, "TestIntCharacterProperties()"); |
| 847 | // Spot-check getIntPropertyMap() vs. getIntPropertyValue(). |
| 848 | for (int32_t prop = UCHAR_INT_START; prop < UCHAR_INT_LIMIT; ++prop) { |
| 849 | const UCPMap *map = u_getIntPropertyMap((UProperty)prop, errorCode); |
| 850 | if (errorCode.errIfFailureAndReset("u_getIntPropertyMap(%d)", (int)prop)) { |
| 851 | continue; |
| 852 | } |
| 853 | uint32_t value; |
| 854 | UChar32 end = ucpmap_getRange(map, 0, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value); |
| 855 | assertTrue("int property first range", end >= 0); |
| 856 | UChar32 c = end / 2; |
| 857 | assertEquals(UnicodeString("int property first range value at ") + TestUtility::hex(c), |
| 858 | u_getIntPropertyValue(c, (UProperty)prop), value); |
| 859 | end = ucpmap_getRange(map, 0x5000, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value); |
| 860 | assertTrue("int property later range", end >= 0); |
| 861 | assertEquals(UnicodeString("int property later range value at ") + TestUtility::hex(end), |
| 862 | u_getIntPropertyValue(end, (UProperty)prop), value); |
| 863 | // ucpmap_get() API coverage |
| 864 | // TODO: move to cucdtst.c |
| 865 | assertEquals( |
| 866 | "int property upcmap_get(U+0061)", |
| 867 | u_getIntPropertyValue(0x61, (UProperty)prop), ucpmap_get(map, 0x61)); |
| 868 | } |
| 869 | #endif |
| 870 | } |