Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | /******************************************************************** |
| 4 | * Copyright (c) 2016, International Business Machines Corporation and |
| 5 | * others. All Rights Reserved. |
| 6 | ********************************************************************/ |
| 7 | |
| 8 | |
| 9 | #include "unicode/utypes.h" |
| 10 | |
| 11 | #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING |
| 12 | |
| 13 | #include "rbbimonkeytest.h" |
| 14 | #include "unicode/utypes.h" |
| 15 | #include "unicode/brkiter.h" |
| 16 | #include "unicode/utf16.h" |
| 17 | #include "unicode/uniset.h" |
| 18 | #include "unicode/unistr.h" |
| 19 | |
| 20 | #include "charstr.h" |
| 21 | #include "cmemory.h" |
| 22 | #include "cstr.h" |
| 23 | #include "uelement.h" |
| 24 | #include "uhash.h" |
| 25 | |
| 26 | #include <iostream> |
| 27 | #include <stdio.h> |
| 28 | #include <stdlib.h> |
| 29 | #include <string> |
| 30 | |
| 31 | using namespace icu; |
| 32 | |
| 33 | |
| 34 | void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) { |
| 35 | fParams = params; // Work around TESTCASE_AUTO not being able to pass params to test function. |
| 36 | |
| 37 | TESTCASE_AUTO_BEGIN; |
| 38 | TESTCASE_AUTO(testMonkey); |
| 39 | TESTCASE_AUTO_END; |
| 40 | } |
| 41 | |
| 42 | //--------------------------------------------------------------------------------------- |
| 43 | // |
| 44 | // class BreakRule implementation. |
| 45 | // |
| 46 | //--------------------------------------------------------------------------------------- |
| 47 | |
| 48 | BreakRule::BreakRule() // : all field default initialized. |
| 49 | { |
| 50 | } |
| 51 | |
| 52 | BreakRule::~BreakRule() {} |
| 53 | |
| 54 | |
| 55 | //--------------------------------------------------------------------------------------- |
| 56 | // |
| 57 | // class BreakRules implementation. |
| 58 | // |
| 59 | //--------------------------------------------------------------------------------------- |
| 60 | BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status) : |
| 61 | fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) { |
| 62 | fCharClasses.adoptInstead(uhash_open(uhash_hashUnicodeString, |
| 63 | uhash_compareUnicodeString, |
| 64 | NULL, // value comparator. |
| 65 | &status)); |
| 66 | if (U_FAILURE(status)) { |
| 67 | return; |
| 68 | } |
| 69 | uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject); |
| 70 | uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject); |
| 71 | fBreakRules.setDeleter(uprv_deleteUObject); |
| 72 | |
| 73 | fCharClassList.adoptInstead(new UVector(status)); |
| 74 | |
| 75 | fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString( |
| 76 | "(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative look behind for '{' or '=' or '[:' |
| 77 | // (the identifier is a unicode property name or value) |
| 78 | "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name |
| 79 | 0, status)); |
| 80 | |
| 81 | // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules. |
| 82 | fCommentsMatcher.adoptInstead(new RegexMatcher(UnicodeString( |
| 83 | "(^|(?<=;))" // Start either at start of line, or just after a ';' (look-behind for ';') |
| 84 | "[ \\t]*+" // Match white space. |
| 85 | "(#.*)?+" // Optional # plus whatever follows |
| 86 | "\\R$" // new-line at end of line. |
| 87 | ), 0, status)); |
| 88 | |
| 89 | // Match (initial parse) of a character class definition line. |
| 90 | fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString( |
| 91 | "[ \\t]*" // leading white space |
| 92 | "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name |
| 93 | "[ \\t]*=[ \\t]*" // = |
| 94 | "(?<ClassDef>.*?)" // The char class UnicodeSet expression |
| 95 | "[ \\t]*;$"), // ; <end of line> |
| 96 | 0, status)); |
| 97 | |
| 98 | // Match (initial parse) of a break rule line. |
| 99 | fRuleDefMatcher.adoptInstead(new RegexMatcher(UnicodeString( |
| 100 | "[ \\t]*" // leading white space |
| 101 | "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)" // The rule name |
| 102 | "[ \\t]*:[ \\t]*" // : |
| 103 | "(?<RuleDef>.*?)" // The rule definition |
| 104 | "[ \\t]*;$"), // ; <end of line> |
| 105 | 0, status)); |
| 106 | |
| 107 | } |
| 108 | |
| 109 | |
| 110 | BreakRules::~BreakRules() {} |
| 111 | |
| 112 | |
| 113 | CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) { |
| 114 | |
| 115 | // Create the expanded definition for this char class, |
| 116 | // replacing any set references with the corresponding definition. |
| 117 | |
| 118 | UnicodeString expandedDef; |
| 119 | UnicodeString emptyString; |
| 120 | fSetRefsMatcher->reset(definition); |
| 121 | while (fSetRefsMatcher->find() && U_SUCCESS(status)) { |
| 122 | const UnicodeString name = |
| 123 | fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status); |
| 124 | CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name)); |
| 125 | const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name; |
| 126 | |
| 127 | fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status); |
| 128 | expandedDef.append(expansionForName); |
| 129 | } |
| 130 | fSetRefsMatcher->appendTail(expandedDef); |
| 131 | |
| 132 | // Verify that the expanded set definition is valid. |
| 133 | |
| 134 | if (fMonkeyImpl->fDumpExpansions) { |
| 135 | printf("epandedDef: %s\n", CStr(expandedDef)()); |
| 136 | } |
| 137 | |
| 138 | LocalPointer<UnicodeSet> s(new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status), status); |
| 139 | if (U_FAILURE(status)) { |
| 140 | IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s\n Expanded set definition: %s", |
| 141 | __FILE__, __LINE__, u_errorName(status), CStr(name)(), CStr(expandedDef)()); |
| 142 | return nullptr; |
| 143 | } |
| 144 | CharClass *cclass = new CharClass(name, definition, expandedDef, s.orphan()); |
| 145 | CharClass *previousClass = static_cast<CharClass *>(uhash_put(fCharClasses.getAlias(), |
| 146 | new UnicodeString(name), // Key, owned by hash table. |
| 147 | cclass, // Value, owned by hash table. |
| 148 | &status)); |
| 149 | |
| 150 | if (previousClass != NULL) { |
| 151 | // Duplicate class def. |
| 152 | // These are legitimate, they are adjustments of an existing class. |
| 153 | // TODO: will need to keep the old around when we handle tailorings. |
| 154 | IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)()); |
| 155 | delete previousClass; |
| 156 | } |
| 157 | return cclass; |
| 158 | } |
| 159 | |
| 160 | |
| 161 | void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) { |
| 162 | LocalPointer<BreakRule> thisRule(new BreakRule); |
| 163 | thisRule->fName = name; |
| 164 | thisRule->fRule = definition; |
| 165 | |
| 166 | // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes, |
| 167 | // This gives a numeric sort order that matches Unicode UAX rule numbering conventions. |
| 168 | UnicodeString emptyString; |
| 169 | |
| 170 | // Expand the char class definitions within the rule. |
| 171 | fSetRefsMatcher->reset(definition); |
| 172 | while (fSetRefsMatcher->find() && U_SUCCESS(status)) { |
| 173 | const UnicodeString name = |
| 174 | fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status); |
| 175 | CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name)); |
| 176 | if (!nameClass) { |
| 177 | IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"", |
| 178 | __FILE__, __LINE__, CStr(name)(), CStr(definition)()); |
| 179 | } |
| 180 | const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name; |
| 181 | |
| 182 | fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status); |
| 183 | thisRule->fExpandedRule.append(expansionForName); |
| 184 | } |
| 185 | fSetRefsMatcher->appendTail(thisRule->fExpandedRule); |
| 186 | |
| 187 | // If rule begins with a '^' rule chaining is disallowed. |
| 188 | // Strip off the '^' from the rule expression, and set the flag. |
| 189 | if (thisRule->fExpandedRule.charAt(0) == u'^') { |
| 190 | thisRule->fInitialMatchOnly = true; |
| 191 | thisRule->fExpandedRule.remove(0, 1); |
| 192 | thisRule->fExpandedRule.trim(); |
| 193 | } |
| 194 | |
| 195 | // Replace the divide sign (\u00f7) with a regular expression named capture. |
| 196 | // When running the rules, a match that includes this group means we found a break position. |
| 197 | |
| 198 | int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7); |
| 199 | if (dividePos >= 0) { |
| 200 | thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)")); |
| 201 | } |
| 202 | if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) { |
| 203 | status = U_ILLEGAL_ARGUMENT_ERROR; // TODO: produce a good error message. |
| 204 | } |
| 205 | |
| 206 | // UAX break rule set definitions can be empty, just []. |
| 207 | // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which |
| 208 | // also matches nothing. |
| 209 | |
| 210 | static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0}; |
| 211 | int32_t where = 0; |
| 212 | while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) { |
| 213 | thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]")); |
| 214 | } |
| 215 | if (fMonkeyImpl->fDumpExpansions) { |
| 216 | printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)()); |
| 217 | } |
| 218 | |
| 219 | // Compile a regular expression for this rule. |
| 220 | thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status)); |
| 221 | if (U_FAILURE(status)) { |
| 222 | IntlTest::gTest->errln("%s:%d Error creating regular expression for %s", |
| 223 | __FILE__, __LINE__, CStr(thisRule->fExpandedRule)()); |
| 224 | return; |
| 225 | } |
| 226 | |
| 227 | // Put this new rule into the vector of all Rules. |
| 228 | fBreakRules.adoptElement(thisRule.orphan(), status); |
| 229 | } |
| 230 | |
| 231 | |
| 232 | bool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) { |
| 233 | if (keyword == UnicodeString("locale")) { |
| 234 | CharString localeName; |
| 235 | localeName.append(CStr(value)(), -1, status); |
| 236 | fLocale = Locale::createFromName(localeName.data()); |
| 237 | return true; |
| 238 | } |
| 239 | if (keyword == UnicodeString("type")) { |
| 240 | if (value == UnicodeString("grapheme")) { |
| 241 | fType = UBRK_CHARACTER; |
| 242 | } else if (value == UnicodeString("word")) { |
| 243 | fType = UBRK_WORD; |
| 244 | } else if (value == UnicodeString("line")) { |
| 245 | fType = UBRK_LINE; |
| 246 | } else if (value == UnicodeString("sentence")) { |
| 247 | fType = UBRK_SENTENCE; |
| 248 | } else { |
| 249 | IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__, CStr(value)()); |
| 250 | } |
| 251 | return true; |
| 252 | } |
| 253 | // TODO: add tailoring base setting here. |
| 254 | return false; |
| 255 | } |
| 256 | |
| 257 | RuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) { |
| 258 | if (U_FAILURE(status)) { |
| 259 | return NULL; |
| 260 | } |
| 261 | RuleBasedBreakIterator *bi = NULL; |
| 262 | switch(fType) { |
| 263 | case UBRK_CHARACTER: |
| 264 | bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createCharacterInstance(fLocale, status)); |
| 265 | break; |
| 266 | case UBRK_WORD: |
| 267 | bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createWordInstance(fLocale, status)); |
| 268 | break; |
| 269 | case UBRK_LINE: |
| 270 | bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(fLocale, status)); |
| 271 | break; |
| 272 | case UBRK_SENTENCE: |
| 273 | bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createSentenceInstance(fLocale, status)); |
| 274 | break; |
| 275 | default: |
| 276 | IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType); |
| 277 | status = U_ILLEGAL_ARGUMENT_ERROR; |
| 278 | } |
| 279 | return bi; |
| 280 | } |
| 281 | |
| 282 | |
| 283 | void BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) { |
| 284 | if (U_FAILURE(status)) { |
| 285 | return; |
| 286 | } |
| 287 | |
| 288 | UnicodeString emptyString; |
| 289 | for (int32_t lineNumber=0; ;lineNumber++) { // Loop once per input line. |
| 290 | if (U_FAILURE(status)) { |
| 291 | return; |
| 292 | } |
| 293 | int32_t lineLength = 0; |
| 294 | const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status); |
| 295 | if (lineBuf == NULL) { |
| 296 | break; |
| 297 | } |
| 298 | UnicodeString line(lineBuf, lineLength); |
| 299 | |
| 300 | // Strip comment lines. |
| 301 | fCommentsMatcher->reset(line); |
| 302 | line = fCommentsMatcher->replaceFirst(emptyString, status); |
| 303 | if (line.isEmpty()) { |
| 304 | continue; |
| 305 | } |
| 306 | |
| 307 | // Recognize character class definition and keyword lines |
| 308 | fClassDefMatcher->reset(line); |
| 309 | if (fClassDefMatcher->matches(status)) { |
| 310 | UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status); |
| 311 | UnicodeString classDef = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status); |
| 312 | if (fMonkeyImpl->fDumpExpansions) { |
| 313 | printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)()); |
| 314 | } |
| 315 | if (setKeywordParameter(className, classDef, status)) { |
| 316 | // The scanned item was "type = ..." or "locale = ...", etc. |
| 317 | // which are not actual character classes. |
| 318 | continue; |
| 319 | } |
| 320 | addCharClass(className, classDef, status); |
| 321 | continue; |
| 322 | } |
| 323 | |
| 324 | // Recognize rule lines. |
| 325 | fRuleDefMatcher->reset(line); |
| 326 | if (fRuleDefMatcher->matches(status)) { |
| 327 | UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status); |
| 328 | UnicodeString ruleDef = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status); |
| 329 | if (fMonkeyImpl->fDumpExpansions) { |
| 330 | printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)()); |
| 331 | } |
| 332 | addRule(ruleName, ruleDef, status); |
| 333 | continue; |
| 334 | } |
| 335 | |
| 336 | IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n", |
| 337 | __FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)()); |
| 338 | } |
| 339 | |
| 340 | // Build the vector of char classes, omitting the dictionary class if there is one. |
| 341 | // This will be used when constructing the random text to be tested. |
| 342 | |
| 343 | // Also compute the "other" set, consisting of any characters not included in |
| 344 | // one or more of the user defined sets. |
| 345 | |
| 346 | UnicodeSet otherSet((UChar32)0, 0x10ffff); |
| 347 | int32_t pos = UHASH_FIRST; |
| 348 | const UHashElement *el = NULL; |
| 349 | while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) { |
| 350 | const UnicodeString *ccName = static_cast<const UnicodeString *>(el->key.pointer); |
| 351 | CharClass *cclass = static_cast<CharClass *>(el->value.pointer); |
| 352 | // printf(" Adding %s\n", CStr(*ccName)()); |
| 353 | if (*ccName != cclass->fName) { |
| 354 | IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n", |
| 355 | __FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)()); |
| 356 | } |
| 357 | const UnicodeSet *set = cclass->fSet.getAlias(); |
| 358 | otherSet.removeAll(*set); |
| 359 | if (*ccName == UnicodeString("dictionary")) { |
| 360 | fDictionarySet = *set; |
| 361 | } else { |
| 362 | fCharClassList->addElement(cclass, status); |
| 363 | } |
| 364 | } |
| 365 | |
| 366 | if (!otherSet.isEmpty()) { |
| 367 | // fprintf(stderr, "have an other set.\n"); |
| 368 | UnicodeString pattern; |
| 369 | CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status); |
| 370 | fCharClassList->addElement(cclass, status); |
| 371 | } |
| 372 | } |
| 373 | |
| 374 | |
| 375 | const CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const { |
| 376 | int32_t localIter = 0; |
| 377 | int32_t &it = iter? *iter : localIter; |
| 378 | |
| 379 | while (it < fCharClassList->size()) { |
| 380 | const CharClass *cc = static_cast<const CharClass *>(fCharClassList->elementAt(it)); |
| 381 | ++it; |
| 382 | if (cc->fSet->contains(c)) { |
| 383 | return cc; |
| 384 | } |
| 385 | } |
| 386 | return NULL; |
| 387 | } |
| 388 | |
| 389 | //--------------------------------------------------------------------------------------- |
| 390 | // |
| 391 | // class MonkeyTestData implementation. |
| 392 | // |
| 393 | //--------------------------------------------------------------------------------------- |
| 394 | |
| 395 | void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) { |
| 396 | const int32_t dataLength = 1000; |
| 397 | |
| 398 | // Fill the test string with random characters. |
| 399 | // First randomly pick a char class, then randomly pick a character from that class. |
| 400 | // Exclude any characters from the dictionary set. |
| 401 | |
| 402 | // std::cout << "Populating Test Data" << std::endl; |
| 403 | fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages, |
| 404 | // allowing recreation of failing data. |
| 405 | fBkRules = rules; |
| 406 | fString.remove(); |
| 407 | for (int32_t n=0; n<dataLength;) { |
| 408 | int charClassIndex = rand() % rules->fCharClassList->size(); |
| 409 | const CharClass *cclass = static_cast<CharClass *>(rules->fCharClassList->elementAt(charClassIndex)); |
| 410 | if (cclass->fSet->size() == 0) { |
| 411 | // Some rules or tailorings do end up with empty char classes. |
| 412 | continue; |
| 413 | } |
| 414 | int32_t charIndex = rand() % cclass->fSet->size(); |
| 415 | UChar32 c = cclass->fSet->charAt(charIndex); |
| 416 | if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) { |
| 417 | // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control. |
| 418 | // Don't let random unpaired surrogates combine in the test data because they might |
| 419 | // produce an unwanted dictionary character. |
| 420 | continue; |
| 421 | } |
| 422 | |
| 423 | if (!rules->fDictionarySet.contains(c)) { |
| 424 | fString.append(c); |
| 425 | ++n; |
| 426 | } |
| 427 | } |
| 428 | |
| 429 | // Reset each rule matcher regex with this new string. |
| 430 | // (Although we are always using the same string object, ICU regular expressions |
| 431 | // don't like the underlying string data changing without doing a reset). |
| 432 | |
| 433 | for (int32_t ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) { |
| 434 | BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum)); |
| 435 | rule->fRuleMatcher->reset(fString); |
| 436 | } |
| 437 | |
| 438 | // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays). |
| 439 | // Expected and Actual breaks are one longer than the input string; a non-zero value |
| 440 | // will indicate a boundary preceding that position. |
| 441 | |
| 442 | clearActualBreaks(); |
| 443 | fExpectedBreaks = fActualBreaks; |
| 444 | fRuleForPosition = fActualBreaks; |
| 445 | f2ndRuleForPos = fActualBreaks; |
| 446 | |
| 447 | // Apply reference rules to find the expected breaks. |
| 448 | |
| 449 | fExpectedBreaks.setCharAt(0, (UChar)1); // Force an expected break before the start of the text. |
| 450 | // ICU always reports a break there. |
| 451 | // The reference rules do not have a means to do so. |
| 452 | int32_t strIdx = 0; |
| 453 | bool initialMatch = true; // True at start of text, and immediately after each boundary, |
| 454 | // for control over rule chaining. |
| 455 | while (strIdx < fString.length()) { |
| 456 | BreakRule *matchingRule = NULL; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 457 | UBool hasBreak = false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 458 | int32_t ruleNum = 0; |
| 459 | int32_t matchStart = 0; |
| 460 | int32_t matchEnd = 0; |
| 461 | int32_t breakGroup = 0; |
| 462 | for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) { |
| 463 | BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum)); |
| 464 | if (rule->fInitialMatchOnly && !initialMatch) { |
| 465 | // Skip checking this '^' rule. (No rule chaining) |
| 466 | continue; |
| 467 | } |
| 468 | rule->fRuleMatcher->reset(); |
| 469 | if (rule->fRuleMatcher->lookingAt(strIdx, status)) { |
| 470 | // A candidate rule match, check further to see if we take it or continue to check other rules. |
| 471 | // Matches of zero or one codepoint count only if they also specify a break. |
| 472 | matchStart = rule->fRuleMatcher->start(status); |
| 473 | matchEnd = rule->fRuleMatcher->end(status); |
| 474 | breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status); |
| 475 | hasBreak = U_SUCCESS(status); |
| 476 | if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) { |
| 477 | status = U_ZERO_ERROR; |
| 478 | } |
| 479 | if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) { |
| 480 | matchingRule = rule; |
| 481 | break; |
| 482 | } |
| 483 | } |
| 484 | } |
| 485 | if (matchingRule == NULL) { |
| 486 | // No reference rule matched. This is an error in the rules that should never happen. |
| 487 | IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ", |
| 488 | __FILE__, __LINE__, strIdx); |
| 489 | dump(strIdx); |
| 490 | status = U_INVALID_FORMAT_ERROR; |
| 491 | return; |
| 492 | } |
| 493 | if (matchingRule->fRuleMatcher->group(status).length() == 0) { |
| 494 | // Zero length rule match. This is also an error in the rule expressions. |
| 495 | IntlTest::gTest->errln("%s:%d Zero length rule match.", |
| 496 | __FILE__, __LINE__); |
| 497 | status = U_INVALID_FORMAT_ERROR; |
| 498 | return; |
| 499 | } |
| 500 | |
| 501 | // Record which rule matched over the length of the match. |
| 502 | for (int i = matchStart; i < matchEnd; i++) { |
| 503 | if (fRuleForPosition.charAt(i) == 0) { |
| 504 | fRuleForPosition.setCharAt(i, (UChar)ruleNum); |
| 505 | } else { |
| 506 | f2ndRuleForPos.setCharAt(i, (UChar)ruleNum); |
| 507 | } |
| 508 | } |
| 509 | |
| 510 | // Break positions appear in rules as a matching named capture of zero length at the break position, |
| 511 | // the adjusted pattern contains (?<BreakPosition>) |
| 512 | if (hasBreak) { |
| 513 | int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status); |
| 514 | if (U_FAILURE(status) || breakPos < 0) { |
| 515 | // Rule specified a break, but that break wasn't part of the match, even |
| 516 | // though the rule as a whole matched. |
| 517 | // Can't happen with regular expressions derived from (equivalent to) ICU break rules. |
| 518 | // Shouldn't get here. |
| 519 | IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__); |
| 520 | status = U_INVALID_FORMAT_ERROR; |
| 521 | break; |
| 522 | } |
| 523 | fExpectedBreaks.setCharAt(breakPos, (UChar)1); |
| 524 | // printf("recording break at %d\n", breakPos); |
| 525 | // For the next iteration, pick up applying rules immediately after the break, |
| 526 | // which may differ from end of the match. The matching rule may have included |
| 527 | // context following the boundary that needs to be looked at again. |
| 528 | strIdx = matchingRule->fRuleMatcher->end(breakGroup, status); |
| 529 | initialMatch = true; |
| 530 | } else { |
| 531 | // Original rule didn't specify a break. |
| 532 | // Continue applying rules starting on the last code point of this match. |
| 533 | strIdx = fString.moveIndex32(matchEnd, -1); |
| 534 | initialMatch = false; |
| 535 | if (strIdx == matchStart) { |
| 536 | // Match was only one code point, no progress if we continue. |
| 537 | // Shouldn't get here, case is filtered out at top of loop. |
| 538 | CharString ruleName; |
| 539 | ruleName.appendInvariantChars(matchingRule->fName, status); |
| 540 | IntlTest::gTest->errln("%s:%d Rule %s internal error", |
| 541 | __FILE__, __LINE__, ruleName.data()); |
| 542 | status = U_INVALID_FORMAT_ERROR; |
| 543 | break; |
| 544 | } |
| 545 | } |
| 546 | if (U_FAILURE(status)) { |
| 547 | IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.", |
| 548 | __FILE__, __LINE__, u_errorName(status)); |
| 549 | break; |
| 550 | } |
| 551 | } |
| 552 | } |
| 553 | |
| 554 | void MonkeyTestData::clearActualBreaks() { |
| 555 | fActualBreaks.remove(); |
| 556 | // Actual Breaks length is one longer than the data string length, allowing |
| 557 | // for breaks before the first and after the last character in the data. |
| 558 | for (int32_t i=0; i<=fString.length(); i++) { |
| 559 | fActualBreaks.append((UChar)0); |
| 560 | } |
| 561 | } |
| 562 | |
| 563 | void MonkeyTestData::dump(int32_t around) const { |
| 564 | printf("\n" |
| 565 | " char break Rule Character\n" |
| 566 | " pos code class R I name name\n" |
| 567 | "---------------------------------------------------------------------------------------------\n"); |
| 568 | |
| 569 | int32_t start; |
| 570 | int32_t end; |
| 571 | |
| 572 | if (around == -1) { |
| 573 | start = 0; |
| 574 | end = fString.length(); |
| 575 | } else { |
| 576 | // Display context around a failure. |
| 577 | start = fString.moveIndex32(around, -30); |
| 578 | end = fString.moveIndex32(around, +30); |
| 579 | } |
| 580 | |
| 581 | for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) { |
| 582 | UErrorCode status = U_ZERO_ERROR; |
| 583 | UChar32 c = fString.char32At(charIdx); |
| 584 | const CharClass *cc = fBkRules->getClassForChar(c); |
| 585 | CharString ccName; |
| 586 | ccName.appendInvariantChars(cc->fName, status); |
| 587 | CharString ruleName, secondRuleName; |
| 588 | const BreakRule *rule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx))); |
| 589 | ruleName.appendInvariantChars(rule->fName, status); |
| 590 | if (f2ndRuleForPos.charAt(charIdx) > 0) { |
| 591 | const BreakRule *secondRule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx))); |
| 592 | secondRuleName.appendInvariantChars(secondRule->fName, status); |
| 593 | } |
| 594 | char cName[200]; |
| 595 | u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status); |
| 596 | |
| 597 | printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n", |
| 598 | charIdx, c, ccName.data(), |
| 599 | fExpectedBreaks.charAt(charIdx) ? '*' : '.', |
| 600 | fActualBreaks.charAt(charIdx) ? '*' : '.', |
| 601 | ruleName.data(), secondRuleName.data(), cName |
| 602 | ); |
| 603 | } |
| 604 | } |
| 605 | |
| 606 | |
| 607 | //--------------------------------------------------------------------------------------- |
| 608 | // |
| 609 | // class RBBIMonkeyImpl |
| 610 | // |
| 611 | //--------------------------------------------------------------------------------------- |
| 612 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 613 | RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status) : fDumpExpansions(false), fThread(this) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 614 | (void)status; // suppress unused parameter compiler warning. |
| 615 | } |
| 616 | |
| 617 | |
| 618 | // RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the |
| 619 | // reference rules and creating the icu breakiterator to test, |
| 620 | // with its type and locale coming from the reference rules. |
| 621 | |
| 622 | void RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) { |
| 623 | fRuleFileName = ruleFile; |
| 624 | openBreakRules(ruleFile, status); |
| 625 | if (U_FAILURE(status)) { |
| 626 | IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile); |
| 627 | return; |
| 628 | } |
| 629 | fRuleSet.adoptInstead(new BreakRules(this, status)); |
| 630 | fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status); |
| 631 | if (U_FAILURE(status)) { |
| 632 | IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile); |
| 633 | return; |
| 634 | } |
| 635 | fBI.adoptInstead(fRuleSet->createICUBreakIterator(status)); |
| 636 | fTestData.adoptInstead(new MonkeyTestData()); |
| 637 | } |
| 638 | |
| 639 | |
| 640 | RBBIMonkeyImpl::~RBBIMonkeyImpl() { |
| 641 | } |
| 642 | |
| 643 | |
| 644 | void RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) { |
| 645 | CharString path; |
| 646 | path.append(IntlTest::getSourceTestData(status), status); |
| 647 | path.append("break_rules" U_FILE_SEP_STRING, status); |
| 648 | path.appendPathPart(fileName, status); |
| 649 | const char *codePage = "UTF-8"; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 650 | fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, true, false, &status)); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 651 | } |
| 652 | |
| 653 | |
| 654 | void RBBIMonkeyImpl::startTest() { |
| 655 | fThread.start(); // invokes runTest() in a separate thread. |
| 656 | } |
| 657 | |
| 658 | void RBBIMonkeyImpl::join() { |
| 659 | fThread.join(); |
| 660 | } |
| 661 | |
| 662 | |
// Report a monkey test failure found at string index `index`.
// The message includes the rule file name and random seed needed to
// reproduce the failure. In verbose mode the test data around the index is
// dumped as well. Expects a UErrorCode named `status` to be in scope at the
// point of use, and sets it to U_INVALID_STATE_ERROR.
#define MONKEY_ERROR(msg, index) UPRV_BLOCK_MACRO_BEGIN { \
    IntlTest::gTest->errln( \
        "%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
        __FILE__, __LINE__, (msg), (index), fRuleFileName, fTestData->fRandomSeed); \
    if (fVerbose) { \
        fTestData->dump((index)); \
    } \
    status = U_INVALID_STATE_ERROR; \
} UPRV_BLOCK_MACRO_END
| 669 | |
| 670 | void RBBIMonkeyImpl::runTest() { |
| 671 | UErrorCode status = U_ZERO_ERROR; |
| 672 | int32_t errorCount = 0; |
| 673 | for (int64_t loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) { |
| 674 | status = U_ZERO_ERROR; |
| 675 | fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status); |
| 676 | if (fBI.isNull()) { |
| 677 | IntlTest::gTest->dataerrln("Unable to run test because fBI is null."); |
| 678 | return; |
| 679 | } |
| 680 | // fTestData->dump(); |
| 681 | testForwards(status); |
| 682 | testPrevious(status); |
| 683 | testFollowing(status); |
| 684 | testPreceding(status); |
| 685 | testIsBoundary(status); |
| 686 | testIsBoundaryRandom(status); |
| 687 | |
| 688 | if (fLoopCount < 0 && loopCount % 100 == 0) { |
| 689 | fprintf(stderr, "."); |
| 690 | } |
| 691 | if (U_FAILURE(status)) { |
| 692 | if (++errorCount > 10) { |
| 693 | return; |
| 694 | } |
| 695 | } |
| 696 | } |
| 697 | } |
| 698 | |
| 699 | void RBBIMonkeyImpl::testForwards(UErrorCode &status) { |
| 700 | if (U_FAILURE(status)) { |
| 701 | return; |
| 702 | } |
| 703 | fTestData->clearActualBreaks(); |
| 704 | fBI->setText(fTestData->fString); |
| 705 | int32_t previousBreak = -2; |
| 706 | for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) { |
| 707 | if (bk <= previousBreak) { |
| 708 | MONKEY_ERROR("Break Iterator Stall", bk); |
| 709 | return; |
| 710 | } |
| 711 | if (bk < 0 || bk > fTestData->fString.length()) { |
| 712 | MONKEY_ERROR("Boundary out of bounds", bk); |
| 713 | return; |
| 714 | } |
| 715 | fTestData->fActualBreaks.setCharAt(bk, 1); |
| 716 | } |
| 717 | checkResults("testForwards", FORWARD, status); |
| 718 | } |
| 719 | |
| 720 | void RBBIMonkeyImpl::testFollowing(UErrorCode &status) { |
| 721 | if (U_FAILURE(status)) { |
| 722 | return; |
| 723 | } |
| 724 | fTestData->clearActualBreaks(); |
| 725 | fBI->setText(fTestData->fString); |
| 726 | int32_t nextBreak = -1; |
| 727 | for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) { |
| 728 | int32_t bk = fBI->following(i); |
| 729 | if (bk == BreakIterator::DONE && i == fTestData->fString.length()) { |
| 730 | continue; |
| 731 | } |
| 732 | if (bk == nextBreak && bk > i) { |
| 733 | // i is in the gap between two breaks. |
| 734 | continue; |
| 735 | } |
| 736 | if (i == nextBreak && bk > nextBreak) { |
| 737 | fTestData->fActualBreaks.setCharAt(bk, 1); |
| 738 | nextBreak = bk; |
| 739 | continue; |
| 740 | } |
| 741 | MONKEY_ERROR("following(i)", i); |
| 742 | return; |
| 743 | } |
| 744 | checkResults("testFollowing", FORWARD, status); |
| 745 | } |
| 746 | |
| 747 | |
| 748 | |
| 749 | void RBBIMonkeyImpl::testPrevious(UErrorCode &status) { |
| 750 | if (U_FAILURE(status)) {return;} |
| 751 | |
| 752 | fTestData->clearActualBreaks(); |
| 753 | fBI->setText(fTestData->fString); |
| 754 | int32_t previousBreak = INT32_MAX; |
| 755 | for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) { |
| 756 | if (bk >= previousBreak) { |
| 757 | MONKEY_ERROR("Break Iterator Stall", bk); |
| 758 | return; |
| 759 | } |
| 760 | if (bk < 0 || bk > fTestData->fString.length()) { |
| 761 | MONKEY_ERROR("Boundary out of bounds", bk); |
| 762 | return; |
| 763 | } |
| 764 | fTestData->fActualBreaks.setCharAt(bk, 1); |
| 765 | } |
| 766 | checkResults("testPrevius", REVERSE, status); |
| 767 | } |
| 768 | |
| 769 | |
| 770 | void RBBIMonkeyImpl::testPreceding(UErrorCode &status) { |
| 771 | if (U_FAILURE(status)) { |
| 772 | return; |
| 773 | } |
| 774 | fTestData->clearActualBreaks(); |
| 775 | fBI->setText(fTestData->fString); |
| 776 | int32_t nextBreak = fTestData->fString.length()+1; |
| 777 | for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) { |
| 778 | int32_t bk = fBI->preceding(i); |
| 779 | // printf("i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak); |
| 780 | if (bk == BreakIterator::DONE && i == 0) { |
| 781 | continue; |
| 782 | } |
| 783 | if (bk == nextBreak && bk < i) { |
| 784 | // i is in the gap between two breaks. |
| 785 | continue; |
| 786 | } |
| 787 | if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) { |
| 788 | // i indexes to a trailing surrogate. |
| 789 | // Break Iterators treat an index to either half as referring to the supplemental code point, |
| 790 | // with preceding going to some preceding code point. |
| 791 | if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) { |
| 792 | MONKEY_ERROR("preceding of trailing surrogate error", i); |
| 793 | } |
| 794 | continue; |
| 795 | } |
| 796 | if (i == nextBreak && bk < nextBreak) { |
| 797 | fTestData->fActualBreaks.setCharAt(bk, 1); |
| 798 | nextBreak = bk; |
| 799 | continue; |
| 800 | } |
| 801 | MONKEY_ERROR("preceding(i)", i); |
| 802 | return; |
| 803 | } |
| 804 | checkResults("testPreceding", REVERSE, status); |
| 805 | } |
| 806 | |
| 807 | |
| 808 | void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) { |
| 809 | if (U_FAILURE(status)) { |
| 810 | return; |
| 811 | } |
| 812 | fTestData->clearActualBreaks(); |
| 813 | fBI->setText(fTestData->fString); |
| 814 | for (int i=fTestData->fString.length(); i>=0; --i) { |
| 815 | if (fBI->isBoundary(i)) { |
| 816 | fTestData->fActualBreaks.setCharAt(i, 1); |
| 817 | } |
| 818 | } |
| 819 | checkResults("testForwards", FORWARD, status); |
| 820 | } |
| 821 | |
| 822 | void RBBIMonkeyImpl::testIsBoundaryRandom(UErrorCode &status) { |
| 823 | if (U_FAILURE(status)) { |
| 824 | return; |
| 825 | } |
| 826 | fBI->setText(fTestData->fString); |
| 827 | |
| 828 | int stringLen = fTestData->fString.length(); |
| 829 | for (int i=stringLen; i>=0; --i) { |
| 830 | int strIdx = fRandomGenerator() % stringLen; |
| 831 | if (fTestData->fExpectedBreaks.charAt(strIdx) != fBI->isBoundary(strIdx)) { |
| 832 | IntlTest::gTest->errln("%s:%d testIsBoundaryRandom failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", |
| 833 | __FILE__, __LINE__, strIdx, fRuleFileName, fTestData->fRandomSeed); |
| 834 | if (fVerbose) { |
| 835 | fTestData->dump(i); |
| 836 | } |
| 837 | status = U_INVALID_STATE_ERROR; |
| 838 | break; |
| 839 | } |
| 840 | } |
| 841 | } |
| 842 | |
| 843 | |
| 844 | |
| 845 | void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) { |
| 846 | if (U_FAILURE(status)) { |
| 847 | return; |
| 848 | } |
| 849 | if (direction == FORWARD) { |
| 850 | for (int i=0; i<=fTestData->fString.length(); ++i) { |
| 851 | if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) { |
| 852 | IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", |
| 853 | __FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed); |
| 854 | if (fVerbose) { |
| 855 | fTestData->dump(i); |
| 856 | } |
| 857 | status = U_INVALID_STATE_ERROR; // Prevent the test from continuing, which would likely |
| 858 | break; // produce many redundant errors. |
| 859 | } |
| 860 | } |
| 861 | } else { |
| 862 | for (int i=fTestData->fString.length(); i>=0; i--) { |
| 863 | if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) { |
| 864 | IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", |
| 865 | __FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed); |
| 866 | if (fVerbose) { |
| 867 | fTestData->dump(i); |
| 868 | } |
| 869 | status = U_INVALID_STATE_ERROR; |
| 870 | break; |
| 871 | } |
| 872 | } |
| 873 | } |
| 874 | } |
| 875 | |
| 876 | |
| 877 | |
| 878 | //--------------------------------------------------------------------------------------- |
| 879 | // |
| 880 | // class RBBIMonkeyTest implementation. |
| 881 | // |
| 882 | //--------------------------------------------------------------------------------------- |
| 883 | RBBIMonkeyTest::RBBIMonkeyTest() { |
| 884 | } |
| 885 | |
| 886 | RBBIMonkeyTest::~RBBIMonkeyTest() { |
| 887 | } |
| 888 | |
| 889 | |
| 890 | // params, taken from this->fParams. |
| 891 | // rules=file_name Name of file containing the reference rules. |
| 892 | // seed=nnnnn Random number starting seed. |
| 893 | // Setting the seed allows errors to be reproduced. |
| 894 | // loop=nnn Looping count. Controls running time. |
| 895 | // -1: run forever. |
| 896 | // 0 or greater: run length. |
| 897 | // expansions debug option, show expansions of rules and sets. |
| 898 | // verbose Display details of the failure. |
| 899 | // |
| 900 | // Parameters on the intltest command line follow the test name, and are preceded by '@'. |
| 901 | // For example, |
| 902 | // intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1 |
| 903 | // |
| 904 | void RBBIMonkeyTest::testMonkey() { |
| 905 | // printf("Test parameters: %s\n", fParams); |
| 906 | UnicodeString params(fParams); |
| 907 | UErrorCode status = U_ZERO_ERROR; |
| 908 | |
| 909 | const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "line_cj.txt", "sentence.txt", "line_normal.txt", |
| 910 | "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt", |
| 911 | NULL }; |
| 912 | CharString testNameFromParams; |
| 913 | if (getStringParam("rules", params, testNameFromParams, status)) { |
| 914 | tests[0] = testNameFromParams.data(); |
| 915 | tests[1] = NULL; |
| 916 | } |
| 917 | |
| 918 | int64_t loopCount = quick? 100 : 5000; |
| 919 | getIntParam("loop", params, loopCount, status); |
| 920 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 921 | UBool dumpExpansions = false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 922 | getBoolParam("expansions", params, dumpExpansions, status); |
| 923 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 924 | UBool verbose = false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 925 | getBoolParam("verbose", params, verbose, status); |
| 926 | |
| 927 | int64_t seed = 0; |
| 928 | getIntParam("seed", params, seed, status); |
| 929 | |
| 930 | if (params.length() != 0) { |
| 931 | // Options processing did not consume all of the parameters. Something unrecognized was present. |
| 932 | CharString unrecognizedParameters; |
| 933 | unrecognizedParameters.append(CStr(params)(), -1, status); |
| 934 | errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data()); |
| 935 | return; |
| 936 | } |
| 937 | |
| 938 | UVector startedTests(status); |
| 939 | if (U_FAILURE(status)) { |
| 940 | errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status)); |
| 941 | return; |
| 942 | } |
| 943 | |
| 944 | // Monkey testing is multi-threaded. |
| 945 | // Each set of break rules to be tested is run in a separate thread. |
| 946 | // Each thread/set of rules gets a separate RBBIMonkeyImpl object. |
| 947 | int32_t i; |
| 948 | for (i=0; tests[i] != NULL; ++i) { |
| 949 | logln("beginning testing of %s", tests[i]); |
| 950 | LocalPointer<RBBIMonkeyImpl> test(new RBBIMonkeyImpl(status)); |
| 951 | if (U_FAILURE(status)) { |
| 952 | dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]); |
| 953 | break; |
| 954 | } |
| 955 | test->fDumpExpansions = dumpExpansions; |
| 956 | test->fVerbose = verbose; |
| 957 | test->fRandomGenerator.seed(static_cast<uint32_t>(seed)); |
| 958 | test->fLoopCount = static_cast<int32_t>(loopCount); |
| 959 | test->setup(tests[i], status); |
| 960 | if (U_FAILURE(status)) { |
| 961 | dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]); |
| 962 | break; |
| 963 | } |
| 964 | test->startTest(); |
| 965 | startedTests.addElement(test.orphan(), status); |
| 966 | if (U_FAILURE(status)) { |
| 967 | errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]); |
| 968 | break; |
| 969 | } |
| 970 | } |
| 971 | |
| 972 | for (i=0; i<startedTests.size(); ++i) { |
| 973 | RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i)); |
| 974 | test->join(); |
| 975 | delete test; |
| 976 | } |
| 977 | } |
| 978 | |
| 979 | |
| 980 | UBool RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString ¶ms, int64_t &val, UErrorCode &status) { |
| 981 | name.append(" *= *(-?\\d+) *,? *"); |
| 982 | RegexMatcher m(name, params, 0, status); |
| 983 | if (m.find()) { |
| 984 | // The param exists. Convert the string to an int. |
| 985 | CharString str; |
| 986 | str.append(CStr(m.group(1, status))(), -1, status); |
| 987 | val = strtol(str.data(), NULL, 10); |
| 988 | |
| 989 | // Delete this parameter from the params string. |
| 990 | m.reset(); |
| 991 | params = m.replaceFirst(UnicodeString(), status); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 992 | return true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 993 | } |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 994 | return false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 995 | } |
| 996 | |
| 997 | UBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString ¶ms, CharString &dest, UErrorCode &status) { |
| 998 | name.append(" *= *([^ ,]*) *,? *"); |
| 999 | RegexMatcher m(name, params, 0, status); |
| 1000 | if (m.find()) { |
| 1001 | // The param exists. |
| 1002 | dest.append(CStr(m.group(1, status))(), -1, status); |
| 1003 | |
| 1004 | // Delete this parameter from the params string. |
| 1005 | m.reset(); |
| 1006 | params = m.replaceFirst(UnicodeString(), status); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1007 | return true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1008 | } |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1009 | return false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1010 | } |
| 1011 | |
| 1012 | UBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString ¶ms, UBool &dest, UErrorCode &status) { |
| 1013 | name.append("(?: *= *(true|false))? *,? *"); |
| 1014 | RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status); |
| 1015 | if (m.find()) { |
| 1016 | if (m.start(1, status) > 0) { |
| 1017 | // user option included a value. |
| 1018 | dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0; |
| 1019 | } else { |
| 1020 | // No explicit user value, implies true. |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1021 | dest = true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1022 | } |
| 1023 | |
| 1024 | // Delete this parameter from the params string. |
| 1025 | m.reset(); |
| 1026 | params = m.replaceFirst(UnicodeString(), status); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1027 | return true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1028 | } |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1029 | return false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1030 | } |
| 1031 | |
| 1032 | #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */ |