Blame - source/test/intltest/rbbimonkeytest.cpp - chromium.googlesource.com/chromium/deps/icu

blob: c5648442977ea5614dc438d0738b27e075d220bd [file] [log] [blame]

Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
				2	// License & terms of use: http://www.unicode.org/copyright.html
				3	/********************************************************************
				4	* Copyright (c) 2016, International Business Machines Corporation and
				5	* others. All Rights Reserved.
				6	********************************************************************/
				7
				8
				9	#include "unicode/utypes.h"
				10
				11	#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
				12
				13	#include "rbbimonkeytest.h"
				14	#include "unicode/utypes.h"
				15	#include "unicode/brkiter.h"
				16	#include "unicode/utf16.h"
				17	#include "unicode/uniset.h"
				18	#include "unicode/unistr.h"
				19
				20	#include "charstr.h"
				21	#include "cmemory.h"
				22	#include "cstr.h"
				23	#include "uelement.h"
				24	#include "uhash.h"
				25
				26	#include <iostream>
				27	#include <stdio.h>
				28	#include <stdlib.h>
				29	#include <string>
				30
				31	using namespace icu;
				32
				33
				34	void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) {
				35	fParams = params; // Work around TESTCASE_AUTO not being able to pass params to test function.
				36
				37	TESTCASE_AUTO_BEGIN;
				38	TESTCASE_AUTO(testMonkey);
				39	TESTCASE_AUTO_END;
				40	}
				41
				42	//---------------------------------------------------------------------------------------
				43	//
				44	// class BreakRule implementation.
				45	//
				46	//---------------------------------------------------------------------------------------
				47
				48	BreakRule::BreakRule() // : all field default initialized.
				49	{
				50	}
				51
				52	BreakRule::~BreakRule() {}
				53
				54
				55	//---------------------------------------------------------------------------------------
				56	//
				57	// class BreakRules implementation.
				58	//
				59	//---------------------------------------------------------------------------------------
				60	BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status) :
				61	fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) {
				62	fCharClasses.adoptInstead(uhash_open(uhash_hashUnicodeString,
				63	uhash_compareUnicodeString,
				64	NULL, // value comparator.
				65	&status));
				66	if (U_FAILURE(status)) {
				67	return;
				68	}
				69	uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
				70	uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
				71	fBreakRules.setDeleter(uprv_deleteUObject);
				72
				73	fCharClassList.adoptInstead(new UVector(status));
				74
				75	fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
				76	"(?!(?:\\{\|=\|\\[:)[ \\t]{0,4})" // Negative look behind for '{' or '=' or '[:'
				77	// (the identifier is a unicode property name or value)
				78	"(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name
				79	0, status));
				80
				81	// Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
				82	fCommentsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
				83	"(^\|(?<=;))" // Start either at start of line, or just after a ';' (look-behind for ';')
				84	"[ \\t]*+" // Match white space.
				85	"(#.*)?+" // Optional # plus whatever follows
				86	"\\R$" // new-line at end of line.
				87	), 0, status));
				88
				89	// Match (initial parse) of a character class definition line.
				90	fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
				91	"[ \\t]*" // leading white space
				92	"(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name
				93	"[ \\t]=[ \\t]" // =
				94	"(?<ClassDef>.*?)" // The char class UnicodeSet expression
				95	"[ \\t]*;$"), // ; <end of line>
				96	0, status));
				97
				98	// Match (initial parse) of a break rule line.
				99	fRuleDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
				100	"[ \\t]*" // leading white space
				101	"(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)" // The rule name
				102	"[ \\t]:[ \\t]" // :
				103	"(?<RuleDef>.*?)" // The rule definition
				104	"[ \\t]*;$"), // ; <end of line>
				105	0, status));
				106
				107	}
				108
				109
				110	BreakRules::~BreakRules() {}
				111
				112
				113	CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
				114
				115	// Create the expanded definition for this char class,
				116	// replacing any set references with the corresponding definition.
				117
				118	UnicodeString expandedDef;
				119	UnicodeString emptyString;
				120	fSetRefsMatcher->reset(definition);
				121	while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
				122	const UnicodeString name =
				123	fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
				124	CharClass nameClass = static_cast<CharClass >(uhash_get(fCharClasses.getAlias(), &name));
				125	const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
				126
				127	fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status);
				128	expandedDef.append(expansionForName);
				129	}
				130	fSetRefsMatcher->appendTail(expandedDef);
				131
				132	// Verify that the expanded set definition is valid.
				133
				134	if (fMonkeyImpl->fDumpExpansions) {
				135	printf("epandedDef: %s\n", CStr(expandedDef)());
				136	}
				137
				138	LocalPointer<UnicodeSet> s(new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status), status);
				139	if (U_FAILURE(status)) {
				140	IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s\n Expanded set definition: %s",
				141	__FILE__, __LINE__, u_errorName(status), CStr(name)(), CStr(expandedDef)());
				142	return nullptr;
				143	}
				144	CharClass *cclass = new CharClass(name, definition, expandedDef, s.orphan());
				145	CharClass previousClass = static_cast<CharClass >(uhash_put(fCharClasses.getAlias(),
				146	new UnicodeString(name), // Key, owned by hash table.
				147	cclass, // Value, owned by hash table.
				148	&status));
				149
				150	if (previousClass != NULL) {
				151	// Duplicate class def.
				152	// These are legitimate, they are adjustments of an existing class.
				153	// TODO: will need to keep the old around when we handle tailorings.
				154	IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)());
				155	delete previousClass;
				156	}
				157	return cclass;
				158	}
				159
				160
				161	void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
				162	LocalPointer<BreakRule> thisRule(new BreakRule);
				163	thisRule->fName = name;
				164	thisRule->fRule = definition;
				165
				166	// If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes,
				167	// This gives a numeric sort order that matches Unicode UAX rule numbering conventions.
				168	UnicodeString emptyString;
				169
				170	// Expand the char class definitions within the rule.
				171	fSetRefsMatcher->reset(definition);
				172	while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
				173	const UnicodeString name =
				174	fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
				175	CharClass nameClass = static_cast<CharClass >(uhash_get(fCharClasses.getAlias(), &name));
				176	if (!nameClass) {
				177	IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
				178	__FILE__, __LINE__, CStr(name)(), CStr(definition)());
				179	}
				180	const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
				181
				182	fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status);
				183	thisRule->fExpandedRule.append(expansionForName);
				184	}
				185	fSetRefsMatcher->appendTail(thisRule->fExpandedRule);
				186
				187	// If rule begins with a '^' rule chaining is disallowed.
				188	// Strip off the '^' from the rule expression, and set the flag.
				189	if (thisRule->fExpandedRule.charAt(0) == u'^') {
				190	thisRule->fInitialMatchOnly = true;
				191	thisRule->fExpandedRule.remove(0, 1);
				192	thisRule->fExpandedRule.trim();
				193	}
				194
				195	// Replace the divide sign (\u00f7) with a regular expression named capture.
				196	// When running the rules, a match that includes this group means we found a break position.
				197
				198	int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7);
				199	if (dividePos >= 0) {
				200	thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)"));
				201	}
				202	if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) {
				203	status = U_ILLEGAL_ARGUMENT_ERROR; // TODO: produce a good error message.
				204	}
				205
				206	// UAX break rule set definitions can be empty, just [].
				207	// Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
				208	// also matches nothing.
				209
				210	static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0};
				211	int32_t where = 0;
				212	while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) {
				213	thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
				214	}
				215	if (fMonkeyImpl->fDumpExpansions) {
				216	printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)());
				217	}
				218
				219	// Compile a regular expression for this rule.
				220	thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS \| UREGEX_DOTALL, status));
				221	if (U_FAILURE(status)) {
				222	IntlTest::gTest->errln("%s:%d Error creating regular expression for %s",
				223	__FILE__, __LINE__, CStr(thisRule->fExpandedRule)());
				224	return;
				225	}
				226
				227	// Put this new rule into the vector of all Rules.
				228	fBreakRules.adoptElement(thisRule.orphan(), status);
				229	}
				230
				231
				232	bool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) {
				233	if (keyword == UnicodeString("locale")) {
				234	CharString localeName;
				235	localeName.append(CStr(value)(), -1, status);
				236	fLocale = Locale::createFromName(localeName.data());
				237	return true;
				238	}
				239	if (keyword == UnicodeString("type")) {
				240	if (value == UnicodeString("grapheme")) {
				241	fType = UBRK_CHARACTER;
				242	} else if (value == UnicodeString("word")) {
				243	fType = UBRK_WORD;
				244	} else if (value == UnicodeString("line")) {
				245	fType = UBRK_LINE;
				246	} else if (value == UnicodeString("sentence")) {
				247	fType = UBRK_SENTENCE;
				248	} else {
				249	IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__, CStr(value)());
				250	}
				251	return true;
				252	}
				253	// TODO: add tailoring base setting here.
				254	return false;
				255	}
				256
				257	RuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) {
				258	if (U_FAILURE(status)) {
				259	return NULL;
				260	}
				261	RuleBasedBreakIterator *bi = NULL;
				262	switch(fType) {
				263	case UBRK_CHARACTER:
				264	bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createCharacterInstance(fLocale, status));
				265	break;
				266	case UBRK_WORD:
				267	bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createWordInstance(fLocale, status));
				268	break;
				269	case UBRK_LINE:
				270	bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(fLocale, status));
				271	break;
				272	case UBRK_SENTENCE:
				273	bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createSentenceInstance(fLocale, status));
				274	break;
				275	default:
				276	IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType);
				277	status = U_ILLEGAL_ARGUMENT_ERROR;
				278	}
				279	return bi;
				280	}
				281
				282
				283	void BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) {
				284	if (U_FAILURE(status)) {
				285	return;
				286	}
				287
				288	UnicodeString emptyString;
				289	for (int32_t lineNumber=0; ;lineNumber++) { // Loop once per input line.
				290	if (U_FAILURE(status)) {
				291	return;
				292	}
				293	int32_t lineLength = 0;
				294	const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status);
				295	if (lineBuf == NULL) {
				296	break;
				297	}
				298	UnicodeString line(lineBuf, lineLength);
				299
				300	// Strip comment lines.
				301	fCommentsMatcher->reset(line);
				302	line = fCommentsMatcher->replaceFirst(emptyString, status);
				303	if (line.isEmpty()) {
				304	continue;
				305	}
				306
				307	// Recognize character class definition and keyword lines
				308	fClassDefMatcher->reset(line);
				309	if (fClassDefMatcher->matches(status)) {
				310	UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status);
				311	UnicodeString classDef = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status);
				312	if (fMonkeyImpl->fDumpExpansions) {
				313	printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)());
				314	}
				315	if (setKeywordParameter(className, classDef, status)) {
				316	// The scanned item was "type = ..." or "locale = ...", etc.
				317	// which are not actual character classes.
				318	continue;
				319	}
				320	addCharClass(className, classDef, status);
				321	continue;
				322	}
				323
				324	// Recognize rule lines.
				325	fRuleDefMatcher->reset(line);
				326	if (fRuleDefMatcher->matches(status)) {
				327	UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status);
				328	UnicodeString ruleDef = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status);
				329	if (fMonkeyImpl->fDumpExpansions) {
				330	printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)());
				331	}
				332	addRule(ruleName, ruleDef, status);
				333	continue;
				334	}
				335
				336	IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
				337	__FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)());
				338	}
				339
				340	// Build the vector of char classes, omitting the dictionary class if there is one.
				341	// This will be used when constructing the random text to be tested.
				342
				343	// Also compute the "other" set, consisting of any characters not included in
				344	// one or more of the user defined sets.
				345
				346	UnicodeSet otherSet((UChar32)0, 0x10ffff);
				347	int32_t pos = UHASH_FIRST;
				348	const UHashElement *el = NULL;
				349	while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) {
				350	const UnicodeString ccName = static_cast<const UnicodeString >(el->key.pointer);
				351	CharClass cclass = static_cast<CharClass >(el->value.pointer);
				352	// printf(" Adding %s\n", CStr(*ccName)());
				353	if (*ccName != cclass->fName) {
				354	IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
				355	__FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)());
				356	}
				357	const UnicodeSet *set = cclass->fSet.getAlias();
				358	otherSet.removeAll(*set);
				359	if (*ccName == UnicodeString("dictionary")) {
				360	fDictionarySet = *set;
				361	} else {
				362	fCharClassList->addElement(cclass, status);
				363	}
				364	}
				365
				366	if (!otherSet.isEmpty()) {
				367	// fprintf(stderr, "have an other set.\n");
				368	UnicodeString pattern;
				369	CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status);
				370	fCharClassList->addElement(cclass, status);
				371	}
				372	}
				373
				374
				375	const CharClass BreakRules::getClassForChar(UChar32 c, int32_t iter) const {
				376	int32_t localIter = 0;
				377	int32_t &it = iter? *iter : localIter;
				378
				379	while (it < fCharClassList->size()) {
				380	const CharClass cc = static_cast<const CharClass >(fCharClassList->elementAt(it));
				381	++it;
				382	if (cc->fSet->contains(c)) {
				383	return cc;
				384	}
				385	}
				386	return NULL;
				387	}
				388
				389	//---------------------------------------------------------------------------------------
				390	//
				391	// class MonkeyTestData implementation.
				392	//
				393	//---------------------------------------------------------------------------------------
				394
				395	void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) {
				396	const int32_t dataLength = 1000;
				397
				398	// Fill the test string with random characters.
				399	// First randomly pick a char class, then randomly pick a character from that class.
				400	// Exclude any characters from the dictionary set.
				401
				402	// std::cout << "Populating Test Data" << std::endl;
				403	fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages,
				404	// allowing recreation of failing data.
				405	fBkRules = rules;
				406	fString.remove();
				407	for (int32_t n=0; n<dataLength;) {
				408	int charClassIndex = rand() % rules->fCharClassList->size();
				409	const CharClass cclass = static_cast<CharClass >(rules->fCharClassList->elementAt(charClassIndex));
				410	if (cclass->fSet->size() == 0) {
				411	// Some rules or tailorings do end up with empty char classes.
				412	continue;
				413	}
				414	int32_t charIndex = rand() % cclass->fSet->size();
				415	UChar32 c = cclass->fSet->charAt(charIndex);
				416	if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) {
				417	// Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
				418	// Don't let random unpaired surrogates combine in the test data because they might
				419	// produce an unwanted dictionary character.
				420	continue;
				421	}
				422
				423	if (!rules->fDictionarySet.contains(c)) {
				424	fString.append(c);
				425	++n;
				426	}
				427	}
				428
				429	// Reset each rule matcher regex with this new string.
				430	// (Although we are always using the same string object, ICU regular expressions
				431	// don't like the underlying string data changing without doing a reset).
				432
				433	for (int32_t ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
				434	BreakRule rule = static_cast<BreakRule >(rules->fBreakRules.elementAt(ruleNum));
				435	rule->fRuleMatcher->reset(fString);
				436	}
				437
				438	// Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
				439	// Expected and Actual breaks are one longer than the input string; a non-zero value
				440	// will indicate a boundary preceding that position.
				441
				442	clearActualBreaks();
				443	fExpectedBreaks = fActualBreaks;
				444	fRuleForPosition = fActualBreaks;
				445	f2ndRuleForPos = fActualBreaks;
				446
				447	// Apply reference rules to find the expected breaks.
				448
				449	fExpectedBreaks.setCharAt(0, (UChar)1); // Force an expected break before the start of the text.
				450	// ICU always reports a break there.
				451	// The reference rules do not have a means to do so.
				452	int32_t strIdx = 0;
				453	bool initialMatch = true; // True at start of text, and immediately after each boundary,
				454	// for control over rule chaining.
				455	while (strIdx < fString.length()) {
				456	BreakRule *matchingRule = NULL;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	457	UBool hasBreak = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	458	int32_t ruleNum = 0;
				459	int32_t matchStart = 0;
				460	int32_t matchEnd = 0;
				461	int32_t breakGroup = 0;
				462	for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
				463	BreakRule rule = static_cast<BreakRule >(rules->fBreakRules.elementAt(ruleNum));
				464	if (rule->fInitialMatchOnly && !initialMatch) {
				465	// Skip checking this '^' rule. (No rule chaining)
				466	continue;
				467	}
				468	rule->fRuleMatcher->reset();
				469	if (rule->fRuleMatcher->lookingAt(strIdx, status)) {
				470	// A candidate rule match, check further to see if we take it or continue to check other rules.
				471	// Matches of zero or one codepoint count only if they also specify a break.
				472	matchStart = rule->fRuleMatcher->start(status);
				473	matchEnd = rule->fRuleMatcher->end(status);
				474	breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status);
				475	hasBreak = U_SUCCESS(status);
				476	if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) {
				477	status = U_ZERO_ERROR;
				478	}
				479	if (hasBreak \|\| fString.moveIndex32(matchStart, 1) < matchEnd) {
				480	matchingRule = rule;
				481	break;
				482	}
				483	}
				484	}
				485	if (matchingRule == NULL) {
				486	// No reference rule matched. This is an error in the rules that should never happen.
				487	IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
				488	__FILE__, __LINE__, strIdx);
				489	dump(strIdx);
				490	status = U_INVALID_FORMAT_ERROR;
				491	return;
				492	}
				493	if (matchingRule->fRuleMatcher->group(status).length() == 0) {
				494	// Zero length rule match. This is also an error in the rule expressions.
				495	IntlTest::gTest->errln("%s:%d Zero length rule match.",
				496	__FILE__, __LINE__);
				497	status = U_INVALID_FORMAT_ERROR;
				498	return;
				499	}
				500
				501	// Record which rule matched over the length of the match.
				502	for (int i = matchStart; i < matchEnd; i++) {
				503	if (fRuleForPosition.charAt(i) == 0) {
				504	fRuleForPosition.setCharAt(i, (UChar)ruleNum);
				505	} else {
				506	f2ndRuleForPos.setCharAt(i, (UChar)ruleNum);
				507	}
				508	}
				509
				510	// Break positions appear in rules as a matching named capture of zero length at the break position,
				511	// the adjusted pattern contains (?<BreakPosition>)
				512	if (hasBreak) {
				513	int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status);
				514	if (U_FAILURE(status) \|\| breakPos < 0) {
				515	// Rule specified a break, but that break wasn't part of the match, even
				516	// though the rule as a whole matched.
				517	// Can't happen with regular expressions derived from (equivalent to) ICU break rules.
				518	// Shouldn't get here.
				519	IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__);
				520	status = U_INVALID_FORMAT_ERROR;
				521	break;
				522	}
				523	fExpectedBreaks.setCharAt(breakPos, (UChar)1);
				524	// printf("recording break at %d\n", breakPos);
				525	// For the next iteration, pick up applying rules immediately after the break,
				526	// which may differ from end of the match. The matching rule may have included
				527	// context following the boundary that needs to be looked at again.
				528	strIdx = matchingRule->fRuleMatcher->end(breakGroup, status);
				529	initialMatch = true;
				530	} else {
				531	// Original rule didn't specify a break.
				532	// Continue applying rules starting on the last code point of this match.
				533	strIdx = fString.moveIndex32(matchEnd, -1);
				534	initialMatch = false;
				535	if (strIdx == matchStart) {
				536	// Match was only one code point, no progress if we continue.
				537	// Shouldn't get here, case is filtered out at top of loop.
				538	CharString ruleName;
				539	ruleName.appendInvariantChars(matchingRule->fName, status);
				540	IntlTest::gTest->errln("%s:%d Rule %s internal error",
				541	__FILE__, __LINE__, ruleName.data());
				542	status = U_INVALID_FORMAT_ERROR;
				543	break;
				544	}
				545	}
				546	if (U_FAILURE(status)) {
				547	IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
				548	__FILE__, __LINE__, u_errorName(status));
				549	break;
				550	}
				551	}
				552	}
				553
				554	void MonkeyTestData::clearActualBreaks() {
				555	fActualBreaks.remove();
				556	// Actual Breaks length is one longer than the data string length, allowing
				557	// for breaks before the first and after the last character in the data.
				558	for (int32_t i=0; i<=fString.length(); i++) {
				559	fActualBreaks.append((UChar)0);
				560	}
				561	}
				562
				563	void MonkeyTestData::dump(int32_t around) const {
				564	printf("\n"
				565	" char break Rule Character\n"
				566	" pos code class R I name name\n"
				567	"---------------------------------------------------------------------------------------------\n");
				568
				569	int32_t start;
				570	int32_t end;
				571
				572	if (around == -1) {
				573	start = 0;
				574	end = fString.length();
				575	} else {
				576	// Display context around a failure.
				577	start = fString.moveIndex32(around, -30);
				578	end = fString.moveIndex32(around, +30);
				579	}
				580
				581	for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) {
				582	UErrorCode status = U_ZERO_ERROR;
				583	UChar32 c = fString.char32At(charIdx);
				584	const CharClass *cc = fBkRules->getClassForChar(c);
				585	CharString ccName;
				586	ccName.appendInvariantChars(cc->fName, status);
				587	CharString ruleName, secondRuleName;
				588	const BreakRule rule = static_cast<BreakRule >(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx)));
				589	ruleName.appendInvariantChars(rule->fName, status);
				590	if (f2ndRuleForPos.charAt(charIdx) > 0) {
				591	const BreakRule secondRule = static_cast<BreakRule >(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx)));
				592	secondRuleName.appendInvariantChars(secondRule->fName, status);
				593	}
				594	char cName[200];
				595	u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
				596
				597	printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n",
				598	charIdx, c, ccName.data(),
				599	fExpectedBreaks.charAt(charIdx) ? '*' : '.',
				600	fActualBreaks.charAt(charIdx) ? '*' : '.',
				601	ruleName.data(), secondRuleName.data(), cName
				602	);
				603	}
				604	}
				605
				606
				607	//---------------------------------------------------------------------------------------
				608	//
				609	// class RBBIMonkeyImpl
				610	//
				611	//---------------------------------------------------------------------------------------
				612
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	613	RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status) : fDumpExpansions(false), fThread(this) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	614	(void)status; // suppress unused parameter compiler warning.
				615	}
				616
				617
				618	// RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the
				619	// reference rules and creating the icu breakiterator to test,
				620	// with its type and locale coming from the reference rules.
				621
				622	void RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) {
				623	fRuleFileName = ruleFile;
				624	openBreakRules(ruleFile, status);
				625	if (U_FAILURE(status)) {
				626	IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
				627	return;
				628	}
				629	fRuleSet.adoptInstead(new BreakRules(this, status));
				630	fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status);
				631	if (U_FAILURE(status)) {
				632	IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
				633	return;
				634	}
				635	fBI.adoptInstead(fRuleSet->createICUBreakIterator(status));
				636	fTestData.adoptInstead(new MonkeyTestData());
				637	}
				638
				639
				640	RBBIMonkeyImpl::~RBBIMonkeyImpl() {
				641	}
				642
				643
				644	void RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) {
				645	CharString path;
				646	path.append(IntlTest::getSourceTestData(status), status);
				647	path.append("break_rules" U_FILE_SEP_STRING, status);
				648	path.appendPathPart(fileName, status);
				649	const char *codePage = "UTF-8";
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	650	fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, true, false, &status));
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	651	}
				652
				653
				654	void RBBIMonkeyImpl::startTest() {
				655	fThread.start(); // invokes runTest() in a separate thread.
				656	}
				657
				658	void RBBIMonkeyImpl::join() {
				659	fThread.join();
				660	}
				661
				662
				663	#define MONKEY_ERROR(msg, index) UPRV_BLOCK_MACRO_BEGIN { \
				664	IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
				665	__FILE__, __LINE__, msg, index, fRuleFileName, fTestData->fRandomSeed); \
				666	if (fVerbose) { fTestData->dump(index); } \
				667	status = U_INVALID_STATE_ERROR; \
				668	} UPRV_BLOCK_MACRO_END
				669
				670	void RBBIMonkeyImpl::runTest() {
				671	UErrorCode status = U_ZERO_ERROR;
				672	int32_t errorCount = 0;
				673	for (int64_t loopCount = 0; fLoopCount < 0 \|\| loopCount < fLoopCount; loopCount++) {
				674	status = U_ZERO_ERROR;
				675	fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status);
				676	if (fBI.isNull()) {
				677	IntlTest::gTest->dataerrln("Unable to run test because fBI is null.");
				678	return;
				679	}
				680	// fTestData->dump();
				681	testForwards(status);
				682	testPrevious(status);
				683	testFollowing(status);
				684	testPreceding(status);
				685	testIsBoundary(status);
				686	testIsBoundaryRandom(status);
				687
				688	if (fLoopCount < 0 && loopCount % 100 == 0) {
				689	fprintf(stderr, ".");
				690	}
				691	if (U_FAILURE(status)) {
				692	if (++errorCount > 10) {
				693	return;
				694	}
				695	}
				696	}
				697	}
				698
				699	void RBBIMonkeyImpl::testForwards(UErrorCode &status) {
				700	if (U_FAILURE(status)) {
				701	return;
				702	}
				703	fTestData->clearActualBreaks();
				704	fBI->setText(fTestData->fString);
				705	int32_t previousBreak = -2;
				706	for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) {
				707	if (bk <= previousBreak) {
				708	MONKEY_ERROR("Break Iterator Stall", bk);
				709	return;
				710	}
				711	if (bk < 0 \|\| bk > fTestData->fString.length()) {
				712	MONKEY_ERROR("Boundary out of bounds", bk);
				713	return;
				714	}
				715	fTestData->fActualBreaks.setCharAt(bk, 1);
				716	}
				717	checkResults("testForwards", FORWARD, status);
				718	}
				719
				720	void RBBIMonkeyImpl::testFollowing(UErrorCode &status) {
				721	if (U_FAILURE(status)) {
				722	return;
				723	}
				724	fTestData->clearActualBreaks();
				725	fBI->setText(fTestData->fString);
				726	int32_t nextBreak = -1;
				727	for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) {
				728	int32_t bk = fBI->following(i);
				729	if (bk == BreakIterator::DONE && i == fTestData->fString.length()) {
				730	continue;
				731	}
				732	if (bk == nextBreak && bk > i) {
				733	// i is in the gap between two breaks.
				734	continue;
				735	}
				736	if (i == nextBreak && bk > nextBreak) {
				737	fTestData->fActualBreaks.setCharAt(bk, 1);
				738	nextBreak = bk;
				739	continue;
				740	}
				741	MONKEY_ERROR("following(i)", i);
				742	return;
				743	}
				744	checkResults("testFollowing", FORWARD, status);
				745	}
				746
				747
				748
				749	void RBBIMonkeyImpl::testPrevious(UErrorCode &status) {
				750	if (U_FAILURE(status)) {return;}
				751
				752	fTestData->clearActualBreaks();
				753	fBI->setText(fTestData->fString);
				754	int32_t previousBreak = INT32_MAX;
				755	for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) {
				756	if (bk >= previousBreak) {
				757	MONKEY_ERROR("Break Iterator Stall", bk);
				758	return;
				759	}
				760	if (bk < 0 \|\| bk > fTestData->fString.length()) {
				761	MONKEY_ERROR("Boundary out of bounds", bk);
				762	return;
				763	}
				764	fTestData->fActualBreaks.setCharAt(bk, 1);
				765	}
				766	checkResults("testPrevius", REVERSE, status);
				767	}
				768
				769
				770	void RBBIMonkeyImpl::testPreceding(UErrorCode &status) {
				771	if (U_FAILURE(status)) {
				772	return;
				773	}
				774	fTestData->clearActualBreaks();
				775	fBI->setText(fTestData->fString);
				776	int32_t nextBreak = fTestData->fString.length()+1;
				777	for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) {
				778	int32_t bk = fBI->preceding(i);
				779	// printf("i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak);
				780	if (bk == BreakIterator::DONE && i == 0) {
				781	continue;
				782	}
				783	if (bk == nextBreak && bk < i) {
				784	// i is in the gap between two breaks.
				785	continue;
				786	}
				787	if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) {
				788	// i indexes to a trailing surrogate.
				789	// Break Iterators treat an index to either half as referring to the supplemental code point,
				790	// with preceding going to some preceding code point.
				791	if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) {
				792	MONKEY_ERROR("preceding of trailing surrogate error", i);
				793	}
				794	continue;
				795	}
				796	if (i == nextBreak && bk < nextBreak) {
				797	fTestData->fActualBreaks.setCharAt(bk, 1);
				798	nextBreak = bk;
				799	continue;
				800	}
				801	MONKEY_ERROR("preceding(i)", i);
				802	return;
				803	}
				804	checkResults("testPreceding", REVERSE, status);
				805	}
				806
				807
				808	void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) {
				809	if (U_FAILURE(status)) {
				810	return;
				811	}
				812	fTestData->clearActualBreaks();
				813	fBI->setText(fTestData->fString);
				814	for (int i=fTestData->fString.length(); i>=0; --i) {
				815	if (fBI->isBoundary(i)) {
				816	fTestData->fActualBreaks.setCharAt(i, 1);
				817	}
				818	}
				819	checkResults("testForwards", FORWARD, status);
				820	}
				821
				822	void RBBIMonkeyImpl::testIsBoundaryRandom(UErrorCode &status) {
				823	if (U_FAILURE(status)) {
				824	return;
				825	}
				826	fBI->setText(fTestData->fString);
				827
				828	int stringLen = fTestData->fString.length();
				829	for (int i=stringLen; i>=0; --i) {
				830	int strIdx = fRandomGenerator() % stringLen;
				831	if (fTestData->fExpectedBreaks.charAt(strIdx) != fBI->isBoundary(strIdx)) {
				832	IntlTest::gTest->errln("%s:%d testIsBoundaryRandom failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
				833	__FILE__, __LINE__, strIdx, fRuleFileName, fTestData->fRandomSeed);
				834	if (fVerbose) {
				835	fTestData->dump(i);
				836	}
				837	status = U_INVALID_STATE_ERROR;
				838	break;
				839	}
				840	}
				841	}
				842
				843
				844
				845	void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) {
				846	if (U_FAILURE(status)) {
				847	return;
				848	}
				849	if (direction == FORWARD) {
				850	for (int i=0; i<=fTestData->fString.length(); ++i) {
				851	if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
				852	IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
				853	__FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
				854	if (fVerbose) {
				855	fTestData->dump(i);
				856	}
				857	status = U_INVALID_STATE_ERROR; // Prevent the test from continuing, which would likely
				858	break; // produce many redundant errors.
				859	}
				860	}
				861	} else {
				862	for (int i=fTestData->fString.length(); i>=0; i--) {
				863	if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
				864	IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
				865	__FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
				866	if (fVerbose) {
				867	fTestData->dump(i);
				868	}
				869	status = U_INVALID_STATE_ERROR;
				870	break;
				871	}
				872	}
				873	}
				874	}
				875
				876
				877
				878	//---------------------------------------------------------------------------------------
				879	//
				880	// class RBBIMonkeyTest implementation.
				881	//
				882	//---------------------------------------------------------------------------------------
				883	RBBIMonkeyTest::RBBIMonkeyTest() {
				884	}
				885
				886	RBBIMonkeyTest::~RBBIMonkeyTest() {
				887	}
				888
				889
				890	// params, taken from this->fParams.
				891	// rules=file_name Name of file containing the reference rules.
				892	// seed=nnnnn Random number starting seed.
				893	// Setting the seed allows errors to be reproduced.
				894	// loop=nnn Looping count. Controls running time.
				895	// -1: run forever.
				896	// 0 or greater: run length.
				897	// expansions debug option, show expansions of rules and sets.
				898	// verbose Display details of the failure.
				899	//
				900	// Parameters on the intltest command line follow the test name, and are preceded by '@'.
				901	// For example,
				902	// intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
				903	//
				904	void RBBIMonkeyTest::testMonkey() {
				905	// printf("Test parameters: %s\n", fParams);
				906	UnicodeString params(fParams);
				907	UErrorCode status = U_ZERO_ERROR;
				908
				909	const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "line_cj.txt", "sentence.txt", "line_normal.txt",
				910	"line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
				911	NULL };
				912	CharString testNameFromParams;
				913	if (getStringParam("rules", params, testNameFromParams, status)) {
				914	tests[0] = testNameFromParams.data();
				915	tests[1] = NULL;
				916	}
				917
				918	int64_t loopCount = quick? 100 : 5000;
				919	getIntParam("loop", params, loopCount, status);
				920
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	921	UBool dumpExpansions = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	922	getBoolParam("expansions", params, dumpExpansions, status);
				923
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	924	UBool verbose = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	925	getBoolParam("verbose", params, verbose, status);
				926
				927	int64_t seed = 0;
				928	getIntParam("seed", params, seed, status);
				929
				930	if (params.length() != 0) {
				931	// Options processing did not consume all of the parameters. Something unrecognized was present.
				932	CharString unrecognizedParameters;
				933	unrecognizedParameters.append(CStr(params)(), -1, status);
				934	errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data());
				935	return;
				936	}
				937
				938	UVector startedTests(status);
				939	if (U_FAILURE(status)) {
				940	errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status));
				941	return;
				942	}
				943
				944	// Monkey testing is multi-threaded.
				945	// Each set of break rules to be tested is run in a separate thread.
				946	// Each thread/set of rules gets a separate RBBIMonkeyImpl object.
				947	int32_t i;
				948	for (i=0; tests[i] != NULL; ++i) {
				949	logln("beginning testing of %s", tests[i]);
				950	LocalPointer<RBBIMonkeyImpl> test(new RBBIMonkeyImpl(status));
				951	if (U_FAILURE(status)) {
				952	dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
				953	break;
				954	}
				955	test->fDumpExpansions = dumpExpansions;
				956	test->fVerbose = verbose;
				957	test->fRandomGenerator.seed(static_cast<uint32_t>(seed));
				958	test->fLoopCount = static_cast<int32_t>(loopCount);
				959	test->setup(tests[i], status);
				960	if (U_FAILURE(status)) {
				961	dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
				962	break;
				963	}
				964	test->startTest();
				965	startedTests.addElement(test.orphan(), status);
				966	if (U_FAILURE(status)) {
				967	errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
				968	break;
				969	}
				970	}
				971
				972	for (i=0; i<startedTests.size(); ++i) {
				973	RBBIMonkeyImpl test = static_cast<RBBIMonkeyImpl >(startedTests.elementAt(i));
				974	test->join();
				975	delete test;
				976	}
				977	}
				978
				979
				980	UBool RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status) {
				981	name.append(" = (-?\\d+) ,? ");
				982	RegexMatcher m(name, params, 0, status);
				983	if (m.find()) {
				984	// The param exists. Convert the string to an int.
				985	CharString str;
				986	str.append(CStr(m.group(1, status))(), -1, status);
				987	val = strtol(str.data(), NULL, 10);
				988
				989	// Delete this parameter from the params string.
				990	m.reset();
				991	params = m.replaceFirst(UnicodeString(), status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	992	return true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	993	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	994	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	995	}
				996
				997	UBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status) {
				998	name.append(" = ([^ ,]) ,? *");
				999	RegexMatcher m(name, params, 0, status);
				1000	if (m.find()) {
				1001	// The param exists.
				1002	dest.append(CStr(m.group(1, status))(), -1, status);
				1003
				1004	// Delete this parameter from the params string.
				1005	m.reset();
				1006	params = m.replaceFirst(UnicodeString(), status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1007	return true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1008	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1009	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1010	}
				1011
				1012	UBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status) {
				1013	name.append("(?: = (true\|false))? ,? ");
				1014	RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status);
				1015	if (m.find()) {
				1016	if (m.start(1, status) > 0) {
				1017	// user option included a value.
				1018	dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0;
				1019	} else {
				1020	// No explicit user value, implies true.
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1021	dest = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1022	}
				1023
				1024	// Delete this parameter from the params string.
				1025	m.reset();
				1026	params = m.replaceFirst(UnicodeString(), status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1027	return true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1028	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1029	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1030	}
				1031
				1032	#endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */