Blame - source/test/intltest/rbbitst.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 8272dfd422274932e0572061bab7fb81d29d177a [file] [log] [blame]

Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
				2	// License & terms of use: http://www.unicode.org/copyright.html
				3	/********************************************************************
				4	* COPYRIGHT:
				5	* Copyright (c) 1999-2016, International Business Machines Corporation and
				6	* others. All Rights Reserved.
				7	********************************************************************/
				8	/************************************************************************
				9	* Date Name Description
				10	* 12/15/99 Madhu Creation.
				11	* 01/12/2000 Madhu Updated for changed API and added new tests
				12	************************************************************************/
				13
				14	#include "unicode/utypes.h"
				15	#if !UCONFIG_NO_BREAK_ITERATION
				16
				17	#include <algorithm>
				18	#include <sstream>
				19	#include <stdio.h>
				20	#include <stdlib.h>
				21	#include <string.h>
				22	#include <utility>
				23	#include <vector>
				24
				25	#include "unicode/brkiter.h"
				26	#include "unicode/localpointer.h"
				27	#include "unicode/numfmt.h"
				28	#include "unicode/rbbi.h"
				29	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
				30	#include "unicode/regex.h"
				31	#endif
				32	#include "unicode/schriter.h"
				33	#include "unicode/uchar.h"
				34	#include "unicode/utf16.h"
				35	#include "unicode/ucnv.h"
				36	#include "unicode/uniset.h"
				37	#include "unicode/uscript.h"
				38	#include "unicode/ustring.h"
				39	#include "unicode/utext.h"
				40	#include "unicode/utrace.h"
				41
				42	#include "charstr.h"
				43	#include "cmemory.h"
				44	#include "cstr.h"
				45	#include "intltest.h"
				46	#include "lstmbe.h"
				47	#include "rbbitst.h"
				48	#include "rbbidata.h"
				49	#include "utypeinfo.h" // for 'typeid' to work
				50	#include "uvector.h"
				51	#include "uvectr32.h"
				52
				53
				54	#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
				55	#include "unicode/filteredbrk.h"
				56	#endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
				57
				58	#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
				59	if (!(x)) { \
				60	errln("Failure in file %s, line %d", __FILE__, __LINE__); \
				61	} \
				62	} UPRV_BLOCK_MACRO_END
				63
				64	#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
				65	if (U_FAILURE(errcode)) { \
				66	errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
				67	} \
				68	} UPRV_BLOCK_MACRO_END
				69
				70	#define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \
				71	IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
				72	__FILE__, __LINE__, msg, index, fRuleFileName, seed); \
				73	}
				74
				75	//---------------------------------------------
				76	// runIndexedTest
				77	//---------------------------------------------
				78
				79
				80	// Note: Before adding new tests to this file, check whether the desired test data can
				81	// simply be added to the file testdata/rbbitst.txt. In most cases it can;
				82	// it's much less work than writing a new test, diagnostic output in the event of failures
				83	// is good, and the test data file is shared with ICU4J, so eventually the test
				84	// will run there as well, without additional effort.
				85
				86	void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
				87	{
				88	if (exec) logln("TestSuite RuleBasedBreakIterator: ");
				89	fTestParams = params;
				90
				91	TESTCASE_AUTO_BEGIN;
				92	#if !UCONFIG_NO_FILE_IO
				93	TESTCASE_AUTO(TestBug4153072);
				94	#endif
				95	#if !UCONFIG_NO_FILE_IO
				96	TESTCASE_AUTO(TestUnicodeFiles);
				97	#endif
				98	TESTCASE_AUTO(TestGetAvailableLocales);
				99	TESTCASE_AUTO(TestGetDisplayName);
				100	#if !UCONFIG_NO_FILE_IO
				101	TESTCASE_AUTO(TestEndBehaviour);
				102	TESTCASE_AUTO(TestWordBreaks);
				103	TESTCASE_AUTO(TestWordBoundary);
				104	TESTCASE_AUTO(TestLineBreaks);
				105	TESTCASE_AUTO(TestSentBreaks);
				106	TESTCASE_AUTO(TestExtended);
				107	#endif
				108	#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
				109	TESTCASE_AUTO(TestMonkey);
				110	#endif
				111	#if !UCONFIG_NO_FILE_IO
				112	TESTCASE_AUTO(TestBug3818);
				113	#endif
				114	TESTCASE_AUTO(TestDebug);
				115	#if !UCONFIG_NO_FILE_IO
				116	TESTCASE_AUTO(TestBug5775);
				117	#endif
				118	TESTCASE_AUTO(TestBug9983);
				119	TESTCASE_AUTO(TestDictRules);
				120	TESTCASE_AUTO(TestBug5532);
				121	TESTCASE_AUTO(TestBug7547);
				122	TESTCASE_AUTO(TestBug12797);
				123	TESTCASE_AUTO(TestBug12918);
				124	TESTCASE_AUTO(TestBug12932);
				125	TESTCASE_AUTO(TestEmoji);
				126	TESTCASE_AUTO(TestBug12519);
				127	TESTCASE_AUTO(TestBug12677);
				128	TESTCASE_AUTO(TestTableRedundancies);
				129	TESTCASE_AUTO(TestBug13447);
				130	TESTCASE_AUTO(TestReverse);
				131	TESTCASE_AUTO(TestBug13692);
				132	TESTCASE_AUTO(TestDebugRules);
				133	TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
				134	TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
				135	TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
				136	TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
				137	TESTCASE_AUTO(TestTable_8_16_Bits);
				138	TESTCASE_AUTO(TestBug13590);
				139	TESTCASE_AUTO(TestUnpairedSurrogate);
				140	TESTCASE_AUTO(TestLSTMThai);
				141	TESTCASE_AUTO(TestLSTMBurmese);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	142	TESTCASE_AUTO(TestRandomAccess);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	143
				144	#if U_ENABLE_TRACING
				145	TESTCASE_AUTO(TestTraceCreateCharacter);
				146	TESTCASE_AUTO(TestTraceCreateWord);
				147	TESTCASE_AUTO(TestTraceCreateSentence);
				148	TESTCASE_AUTO(TestTraceCreateTitle);
				149	TESTCASE_AUTO(TestTraceCreateLine);
				150	TESTCASE_AUTO(TestTraceCreateLineNormal);
				151	TESTCASE_AUTO(TestTraceCreateLineLoose);
				152	TESTCASE_AUTO(TestTraceCreateLineStrict);
Frank Tang	d2858cb	2022-04-08 20:34:12 -0700	[diff] [blame]	153	TESTCASE_AUTO(TestTraceCreateLineNormalPhrase);
				154	TESTCASE_AUTO(TestTraceCreateLineLoosePhrase);
				155	TESTCASE_AUTO(TestTraceCreateLineStrictPhrase);
				156	TESTCASE_AUTO(TestTraceCreateLinePhrase);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	157	TESTCASE_AUTO(TestTraceCreateBreakEngine);
				158	#endif
				159
				160	TESTCASE_AUTO_END;
				161	}
				162
				163
				164	//--------------------------------------------------------------------------------------
				165	//
				166	// RBBITest constructor and destructor
				167	//
				168	//--------------------------------------------------------------------------------------
				169
				170	RBBITest::RBBITest() {
				171	fTestParams = NULL;
				172	}
				173
				174
				175	RBBITest::~RBBITest() {
				176	}
				177
				178
				179	static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
				180	UErrorCode status = U_ZERO_ERROR;
				181	char name[100];
				182	printf("code alpha extend alphanum type word sent line name\n");
				183	int nextExpectedIndex = 0;
				184	utext_setNativeIndex(tstr, 0);
				185	for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
				186	if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
				187	printf("------------------------------------------------ %d\n", j);
				188	++nextExpectedIndex;
				189	}
				190
				191	UChar32 c = utext_next32(tstr);
				192	u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
				193	printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
				194	u_isUAlphabetic(c),
				195	u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
				196	u_isalnum(c),
				197	u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
				198	u_charType(c),
				199	U_SHORT_PROPERTY_NAME),
				200	u_getPropertyValueName(UCHAR_WORD_BREAK,
				201	u_getIntPropertyValue(c,
				202	UCHAR_WORD_BREAK),
				203	U_SHORT_PROPERTY_NAME),
				204	u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
				205	u_getIntPropertyValue(c,
				206	UCHAR_SENTENCE_BREAK),
				207	U_SHORT_PROPERTY_NAME),
				208	u_getPropertyValueName(UCHAR_LINE_BREAK,
				209	u_getIntPropertyValue(c,
				210	UCHAR_LINE_BREAK),
				211	U_SHORT_PROPERTY_NAME),
				212	name);
				213	}
				214	}
				215
				216
				217	static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
				218	UErrorCode status = U_ZERO_ERROR;
				219	UText *tstr = NULL;
				220	tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
				221	if (U_FAILURE(status)) {
				222	printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
				223	return;
				224	}
				225	printStringBreaks(tstr, expected, expectedCount);
				226	utext_close(tstr);
				227	}
				228
				229
				230	void RBBITest::TestBug3818() {
				231	UErrorCode status = U_ZERO_ERROR;
				232
				233	// Four Thai words...
				234	static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
				235	0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
				236	UnicodeString thaiStr(thaiWordData);
				237
				238	BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
				239	if (U_FAILURE(status) \|\| bi == NULL) {
				240	errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
				241	return;
				242	}
				243	bi->setText(thaiStr);
				244
				245	int32_t startOfSecondWord = bi->following(1);
				246	if (startOfSecondWord != 4) {
				247	errln("Fail at file %s, line %d expected start of word at 4, got %d",
				248	__FILE__, __LINE__, startOfSecondWord);
				249	}
				250	startOfSecondWord = bi->following(0);
				251	if (startOfSecondWord != 4) {
				252	errln("Fail at file %s, line %d expected start of word at 4, got %d",
				253	__FILE__, __LINE__, startOfSecondWord);
				254	}
				255	delete bi;
				256	}
				257
				258
				259	//---------------------------------------------
				260	//
				261	// other tests
				262	//
				263	//---------------------------------------------
				264
				265	void RBBITest::TestGetAvailableLocales()
				266	{
				267	int32_t locCount = 0;
				268	const Locale* locList = BreakIterator::getAvailableLocales(locCount);
				269
				270	if (locCount == 0)
				271	dataerrln("getAvailableLocales() returned an empty list!");
				272	// Just make sure that it's returning good memory.
				273	int32_t i;
				274	for (i = 0; i < locCount; ++i) {
				275	logln(locList[i].getName());
				276	}
				277	}
				278
				279	//Testing the BreakIterator::getDisplayName() function
				280	void RBBITest::TestGetDisplayName()
				281	{
				282	UnicodeString result;
				283
				284	BreakIterator::getDisplayName(Locale::getUS(), result);
				285	if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
				286	dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
				287	+ result);
				288
				289	BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
				290	if (result != "French (France)")
				291	dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
				292	+ result);
				293	}
				294	/**
				295	* Test End Behaviour
				296	* @bug 4068137
				297	*/
				298	void RBBITest::TestEndBehaviour()
				299	{
				300	UErrorCode status = U_ZERO_ERROR;
				301	UnicodeString testString("boo.");
				302	BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
				303	if (U_FAILURE(status))
				304	{
				305	errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
				306	return;
				307	}
				308	wb->setText(testString);
				309
				310	if (wb->first() != 0)
				311	errln("Didn't get break at beginning of string.");
				312	if (wb->next() != 3)
				313	errln("Didn't get break before period in \"boo.\"");
				314	if (wb->current() != 4 && wb->next() != 4)
				315	errln("Didn't get break at end of string.");
				316	delete wb;
				317	}
				318	/*
				319	* @bug 4153072
				320	*/
				321	void RBBITest::TestBug4153072() {
				322	UErrorCode status = U_ZERO_ERROR;
				323	BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
				324	if (U_FAILURE(status))
				325	{
				326	errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
				327	return;
				328	}
				329	UnicodeString str("...Hello, World!...");
				330	int32_t begin = 3;
				331	int32_t end = str.length() - 3;
				332	UBool onBoundary;
				333
				334	StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
				335	iter->adoptText(textIterator);
				336	int index;
				337	// Note: with the switch to UText, there is no way to restrict the
				338	// iteration range to begin at an index other than zero.
				339	// String character iterators created with a non-zero bound are
				340	// treated by RBBI as being empty.
				341	for (index = -1; index < begin + 1; ++index) {
				342	onBoundary = iter->isBoundary(index);
				343	if (index == 0? !onBoundary : onBoundary) {
				344	errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
				345	" and begin index = " + begin);
				346	}
				347	}
				348	delete iter;
				349	}
				350
				351
				352	//
				353	// Test for problem reported by Ashok Matoria on 9 July 2007
				354	// One.<kSoftHyphen><kSpace>Two.
				355	//
				356	// Sentence break at start (0) and then on calling next() it breaks at
				357	// 'T' of "Two". Now, at this point if I do next() and
				358	// then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
				359	//
				360	void RBBITest::TestBug5775() {
				361	UErrorCode status = U_ZERO_ERROR;
				362	BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
				363	TEST_ASSERT_SUCCESS(status);
				364	if (U_FAILURE(status)) {
				365	return;
				366	}
				367	// Check for status first for better handling of no data errors.
				368	TEST_ASSERT(bi != NULL);
				369	if (bi == NULL) {
				370	return;
				371	}
				372
				373	UnicodeString s("One.\\u00ad Two.", -1, US_INV);
				374	// 01234 56789
				375	s = s.unescape();
				376	bi->setText(s);
				377	int pos = bi->next();
				378	TEST_ASSERT(pos == 6);
				379	pos = bi->next();
				380	TEST_ASSERT(pos == 10);
				381	pos = bi->previous();
				382	TEST_ASSERT(pos == 6);
				383	delete bi;
				384	}
				385
				386
				387
				388	//------------------------------------------------------------------------------
				389	//
				390	// RBBITest::Extended Run RBBI Tests from an external test data file
				391	//
				392	//------------------------------------------------------------------------------
				393
				394	struct TestParams {
				395	BreakIterator *bi; // Break iterator is set while parsing test source.
				396	// Changed out whenever test data changes break type.
				397
				398	UnicodeString dataToBreak; // Data that is built up while parsing the test.
				399	UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
				400	UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
				401	UVector32 *srcCol;
				402
				403	UText *textToBreak; // UText, could be UTF8 or UTF16.
				404	UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
				405	CharString utf8String; // UTF-8 form of text to break.
				406
				407	TestParams(UErrorCode &status) : dataToBreak() {
				408	bi = NULL;
				409	expectedBreaks = new UVector32(status);
				410	srcLine = new UVector32(status);
				411	srcCol = new UVector32(status);
				412	textToBreak = NULL;
				413	textMap = new UVector32(status);
				414	}
				415
				416	~TestParams() {
				417	delete bi;
				418	delete expectedBreaks;
				419	delete srcLine;
				420	delete srcCol;
				421	utext_close(textToBreak);
				422	delete textMap;
				423	}
				424
				425	int32_t getSrcLine(int32_t bp);
				426	int32_t getExpectedBreak(int32_t bp);
				427	int32_t getSrcCol(int32_t bp);
				428
				429	void setUTF16(UErrorCode &status);
				430	void setUTF8(UErrorCode &status);
				431	};
				432
				433	// Append a UnicodeString to a CharString with UTF-8 encoding.
				434	// Substitute any invalid chars.
				435	// Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
				436	static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
				437	if (U_FAILURE(status)) {
				438	return;
				439	}
				440	int32_t utf8Length;
				441	u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
				442	src.getBuffer(), src.length(), // UTF-16 data
				443	0xfffd, NULL, // Substitution char, number of subs.
				444	&status);
				445	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
				446	return;
				447	}
				448	status = U_ZERO_ERROR;
				449	int32_t capacity;
				450	char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
				451	u_strToUTF8WithSub(buffer, utf8Length, NULL,
				452	src.getBuffer(), src.length(),
				453	0xfffd, NULL, &status);
				454	dest.append(buffer, utf8Length, status);
				455	}
				456
				457
				458	void TestParams::setUTF16(UErrorCode &status) {
				459	textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
				460	textMap->removeAllElements();
				461	for (int32_t i=0; i<dataToBreak.length(); i++) {
				462	if (i == dataToBreak.getChar32Start(i)) {
				463	textMap->addElement(i, status);
				464	} else {
				465	textMap->addElement(-1, status);
				466	}
				467	}
				468	textMap->addElement(dataToBreak.length(), status);
				469	U_ASSERT(dataToBreak.length() + 1 == textMap->size());
				470	}
				471
				472
				473	void TestParams::setUTF8(UErrorCode &status) {
				474	if (U_FAILURE(status)) {
				475	return;
				476	}
				477	utf8String.clear();
				478	CharStringAppend(utf8String, dataToBreak, status);
				479	textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
				480	if (U_FAILURE(status)) {
				481	return;
				482	}
				483
				484	textMap->removeAllElements();
				485	int32_t utf16Index = 0;
				486	for (;;) {
				487	textMap->addElement(utf16Index, status);
				488	UChar32 c32 = utext_current32(textToBreak);
				489	if (c32 < 0) {
				490	break;
				491	}
				492	utf16Index += U16_LENGTH(c32);
				493	utext_next32(textToBreak);
				494	while (textMap->size() < utext_getNativeIndex(textToBreak)) {
				495	textMap->addElement(-1, status);
				496	}
				497	}
				498	U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
				499	}
				500
				501
				502	int32_t TestParams::getSrcLine(int32_t bp) {
				503	if (bp >= textMap->size()) {
				504	bp = textMap->size() - 1;
				505	}
				506	int32_t i = 0;
				507	for(; bp >= 0 ; --bp) {
				508	// Move to a character boundary if we are not on one already.
				509	i = textMap->elementAti(bp);
				510	if (i >= 0) {
				511	break;
				512	}
				513	}
				514	return srcLine->elementAti(i);
				515	}
				516
				517
				518	int32_t TestParams::getExpectedBreak(int32_t bp) {
				519	if (bp >= textMap->size()) {
				520	return 0;
				521	}
				522	int32_t i = textMap->elementAti(bp);
				523	int32_t retVal = 0;
				524	if (i >= 0) {
				525	retVal = expectedBreaks->elementAti(i);
				526	}
				527	return retVal;
				528	}
				529
				530
				531	int32_t TestParams::getSrcCol(int32_t bp) {
				532	if (bp >= textMap->size()) {
				533	bp = textMap->size() - 1;
				534	}
				535	int32_t i = 0;
				536	for(; bp >= 0; --bp) {
				537	// Move bp to a character boundary if we are not on one already.
				538	i = textMap->elementAti(bp);
				539	if (i >= 0) {
				540	break;
				541	}
				542	}
				543	return srcCol->elementAti(i);
				544	}
				545
				546
				547	void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
				548	int32_t bp;
				549	int32_t prevBP;
				550	int32_t i;
				551
				552	TEST_ASSERT_SUCCESS(status);
				553	if (U_FAILURE(status)) {
				554	return;
				555	}
				556
				557	if (t->bi == NULL) {
				558	return;
				559	}
				560
				561	t->bi->setText(t->textToBreak, status);
				562	//
				563	// Run the iterator forward
				564	//
				565	prevBP = -1;
				566	for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
				567	if (prevBP == bp) {
				568	// Fail for lack of forward progress.
				569	errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
				570	bp, t->getSrcLine(bp), t->getSrcCol(bp));
				571	break;
				572	}
				573
				574	// Check that there we didn't miss an expected break between the last one
				575	// and this one.
				576	for (i=prevBP+1; i<bp; i++) {
				577	if (t->getExpectedBreak(i) != 0) {
				578	int expected[] = {0, i};
				579	printStringBreaks(t->dataToBreak, expected, 2);
				580	errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
				581	i, t->getSrcLine(i), t->getSrcCol(i));
				582	}
				583	}
				584
				585	// Check that the break we did find was expected
				586	if (t->getExpectedBreak(bp) == 0) {
				587	int expected[] = {0, bp};
				588	printStringBreaks(t->textToBreak, expected, 2);
				589	errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
				590	bp, t->getSrcLine(bp), t->getSrcCol(bp));
				591	} else {
				592	// The break was expected.
				593	// Check that the {nnn} tag value is correct.
				594	int32_t expectedTagVal = t->getExpectedBreak(bp);
				595	if (expectedTagVal == -1) {
				596	expectedTagVal = 0;
				597	}
				598	int32_t line = t->getSrcLine(bp);
				599	int32_t rs = t->bi->getRuleStatus();
				600	if (rs != expectedTagVal) {
				601	errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
				602	" Actual, Expected status = %4d, %4d",
				603	bp, line, t->getSrcCol(bp), rs, expectedTagVal);
				604	}
				605	}
				606
				607	prevBP = bp;
				608	}
				609
				610	// Verify that there were no missed expected breaks after the last one found
				611	for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
				612	if (t->getExpectedBreak(i) != 0) {
				613	errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
				614	i, t->getSrcLine(i), t->getSrcCol(i));
				615	}
				616	}
				617
				618	//
				619	// Run the iterator backwards, verify that the same breaks are found.
				620	//
				621	prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
				622	bp = t->bi->last();
				623	while (bp != BreakIterator::DONE) {
				624	if (prevBP == bp) {
				625	// Fail for lack of progress.
				626	errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
				627	bp, t->getSrcLine(bp), t->getSrcCol(bp));
				628	break;
				629	}
				630
				631	// Check that we didn't miss an expected break between the last one
				632	// and this one. (UVector returns zeros for index out of bounds.)
				633	for (i=prevBP-1; i>bp; i--) {
				634	if (t->getExpectedBreak(i) != 0) {
				635	errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
				636	i, t->getSrcLine(i), t->getSrcCol(i));
				637	}
				638	}
				639
				640	// Check that the break we did find was expected
				641	if (t->getExpectedBreak(bp) == 0) {
				642	errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
				643	bp, t->getSrcLine(bp), t->getSrcCol(bp));
				644	} else {
				645	// The break was expected.
				646	// Check that the {nnn} tag value is correct.
				647	int32_t expectedTagVal = t->getExpectedBreak(bp);
				648	if (expectedTagVal == -1) {
				649	expectedTagVal = 0;
				650	}
				651	int line = t->getSrcLine(bp);
				652	int32_t rs = t->bi->getRuleStatus();
				653	if (rs != expectedTagVal) {
				654	errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
				655	" Actual, Expected status = %4d, %4d",
				656	bp, line, t->getSrcCol(bp), rs, expectedTagVal);
				657	}
				658	}
				659
				660	prevBP = bp;
				661	bp = t->bi->previous();
				662	}
				663
				664	// Verify that there were no missed breaks prior to the last one found
				665	for (i=prevBP-1; i>=0; i--) {
				666	if (t->getExpectedBreak(i) != 0) {
				667	errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
				668	i, t->getSrcLine(i), t->getSrcCol(i));
				669	}
				670	}
				671
				672	// Check isBoundary()
				673	for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
				674	UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
				675	UBool boundaryFound = t->bi->isBoundary(i);
				676	if (boundaryExpected != boundaryFound) {
				677	errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
				678	" Expected, Actual= %s, %s",
				679	i, t->getSrcLine(i), t->getSrcCol(i),
				680	boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
				681	}
				682	}
				683
				684	// Check following()
				685	for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
				686	int32_t actualBreak = t->bi->following(i);
				687	int32_t expectedBreak = BreakIterator::DONE;
				688	for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
				689	if (t->getExpectedBreak(j) != 0) {
				690	expectedBreak = j;
				691	break;
				692	}
				693	}
				694	if (expectedBreak != actualBreak) {
				695	errln("following(%d) incorrect. File line,col= %4d,%4d\n"
				696	" Expected, Actual= %d, %d",
				697	i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
				698	}
				699	}
				700
				701	// Check preceding()
				702	for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
				703	int32_t actualBreak = t->bi->preceding(i);
				704	int32_t expectedBreak = BreakIterator::DONE;
				705
				706	// For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
				707	// preceding(trailing byte) will return the index of some preceding code point,
				708	// not the lead byte of the current code point, even though that has a smaller index.
				709	// Therefore, start looking at the expected break data not at i-1, but at
				710	// the start of code point index - 1.
				711	utext_setNativeIndex(t->textToBreak, i);
				712	int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
				713	for (; j >= 0; j--) {
				714	if (t->getExpectedBreak(j) != 0) {
				715	expectedBreak = j;
				716	break;
				717	}
				718	}
				719	if (expectedBreak != actualBreak) {
				720	errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
				721	" Expected, Actual= %d, %d",
				722	i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
				723	}
				724	}
				725	}
				726
				727	void RBBITest::TestExtended() {
				728	// The expectations in this test heavily depends on the Thai dictionary.
				729	// Therefore, we skip this test under the LSTM configuration.
				730	if (skipDictionaryTest()) {
				731	return;
				732	}
				733	// Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
				734	// data driven test closely entangles filtered and regular data.
				735	#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
				736	UErrorCode status = U_ZERO_ERROR;
				737	Locale locale("");
				738
				739	TestParams tp(status);
				740
				741	RegexMatcher localeMatcher(UnicodeString(u"<locale ([\\p{L}\\p{Nd}_@&=-]) *>"), 0, status);
				742	if (U_FAILURE(status)) {
				743	dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
				744	}
				745
				746	//
				747	// Open and read the test data file.
				748	//
				749	const char *testDataDirectory = IntlTest::getSourceTestData(status);
				750	CharString testFileName(testDataDirectory, -1, status);
				751	testFileName.append("rbbitst.txt", -1, status);
				752
				753	int len;
				754	UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
				755	if (U_FAILURE(status)) {
				756	errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
				757	return;
				758	}
				759
				760	bool skipTest = false; // Skip this test?
				761
				762	//
				763	// Put the test data into a UnicodeString
				764	//
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	765	UnicodeString testString(false, testFile, len);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	766
				767	enum EParseState{
				768	PARSE_COMMENT,
				769	PARSE_TAG,
				770	PARSE_DATA,
				771	PARSE_NUM,
				772	PARSE_RULES
				773	}
				774	parseState = PARSE_TAG;
				775
				776	EParseState savedState = PARSE_TAG;
				777
				778	int32_t lineNum = 1;
				779	int32_t colStart = 0;
				780	int32_t column = 0;
				781	int32_t charIdx = 0;
				782
				783	int32_t tagValue = 0; // The numeric value of a <nnn> tag.
				784
				785	UnicodeString rules; // Holds rules from a <rules> ... </rules> block
				786	int32_t rulesFirstLine = 0; // Line number of the start of current <rules> block
				787
				788	for (charIdx = 0; charIdx < len; ) {
				789	status = U_ZERO_ERROR;
				790	UChar c = testString.charAt(charIdx);
				791	charIdx++;
				792	if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
				793	// treat CRLF as a unit
				794	c = u'\n';
				795	charIdx++;
				796	}
				797	if (c == u'\n' \|\| c == u'\r') {
				798	lineNum++;
				799	colStart = charIdx;
				800	}
				801	column = charIdx - colStart + 1;
				802
				803	switch (parseState) {
				804	case PARSE_COMMENT:
				805	if (c == u'\n' \|\| c == u'\r') {
				806	parseState = savedState;
				807	}
				808	break;
				809
				810	case PARSE_TAG:
				811	{
				812	if (c == u'#') {
				813	parseState = PARSE_COMMENT;
				814	savedState = PARSE_TAG;
				815	break;
				816	}
				817	if (u_isUWhiteSpace(c)) {
				818	break;
				819	}
				820	if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
				821	delete tp.bi;
				822	tp.bi = BreakIterator::createWordInstance(locale, status);
				823	skipTest = false;
				824	charIdx += 5;
				825	break;
				826	}
				827	if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
				828	delete tp.bi;
				829	tp.bi = BreakIterator::createCharacterInstance(locale, status);
				830	skipTest = false;
				831	charIdx += 5;
				832	break;
				833	}
				834	if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
				835	delete tp.bi;
				836	tp.bi = BreakIterator::createLineInstance(locale, status);
				837	skipTest = false;
				838	charIdx += 5;
				839	break;
				840	}
				841	if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
				842	delete tp.bi;
				843	tp.bi = BreakIterator::createSentenceInstance(locale, status);
				844	skipTest = false;
				845	charIdx += 5;
				846	break;
				847	}
				848	if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
				849	delete tp.bi;
				850	tp.bi = BreakIterator::createTitleInstance(locale, status);
				851	charIdx += 6;
				852	break;
				853	}
				854
				855	if (testString.compare(charIdx-1, 7, u"<rules>") == 0 \|\|
				856	testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
				857	charIdx = testString.indexOf(u'>', charIdx) + 1;
				858	parseState = PARSE_RULES;
				859	rules.remove();
				860	rulesFirstLine = lineNum;
				861	break;
				862	}
				863
				864	// <locale loc_name>
				865	localeMatcher.reset(testString);
				866	if (localeMatcher.lookingAt(charIdx-1, status)) {
				867	UnicodeString localeName = localeMatcher.group(1, status);
				868	char localeName8[100];
				869	localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
				870	locale = Locale::createFromName(localeName8);
				871	charIdx += localeMatcher.group(0, status).length() - 1;
				872	TEST_ASSERT_SUCCESS(status);
				873	break;
				874	}
				875	if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
				876	parseState = PARSE_DATA;
				877	charIdx += 5;
				878	tp.dataToBreak = "";
				879	tp.expectedBreaks->removeAllElements();
				880	tp.srcCol ->removeAllElements();
				881	tp.srcLine->removeAllElements();
				882	break;
				883	}
				884
				885	errln("line %d: Tag expected in test file.", lineNum);
				886	parseState = PARSE_COMMENT;
				887	savedState = PARSE_DATA;
				888	goto end_test; // Stop the test.
				889	}
				890	break;
				891
				892	case PARSE_RULES:
				893	if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
				894	charIdx += 7;
				895	parseState = PARSE_TAG;
				896	delete tp.bi;
				897	UParseError pe;
				898	tp.bi = new RuleBasedBreakIterator(rules, pe, status);
				899	skipTest = U_FAILURE(status);
				900	if (U_FAILURE(status)) {
				901	errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
				902	rulesFirstLine + pe.line - 1, u_errorName(status));
				903	}
				904	} else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
				905	charIdx += 10;
				906	parseState = PARSE_TAG;
				907	UErrorCode ec = U_ZERO_ERROR;
				908	UParseError pe;
				909	RuleBasedBreakIterator bi(rules, pe, ec);
				910	if (U_SUCCESS(ec)) {
				911	errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
				912	rulesFirstLine + pe.line - 1);
				913	}
				914	} else {
				915	rules.append(c);
				916	}
				917	break;
				918
				919	case PARSE_DATA:
				920	if (c == u'•') {
				921	int32_t breakIdx = tp.dataToBreak.length();
				922	if (tp.expectedBreaks->size() > breakIdx) {
				923	errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
				924	lineNum, column);
				925	}
				926	tp.expectedBreaks->setSize(breakIdx+1);
				927	tp.expectedBreaks->setElementAt(-1, breakIdx);
				928	tp.srcLine->setSize(breakIdx+1);
				929	tp.srcLine->setElementAt(lineNum, breakIdx);
				930	tp.srcCol ->setSize(breakIdx+1);
				931	tp.srcCol ->setElementAt(column, breakIdx);
				932	break;
				933	}
				934
				935	if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
				936	// Add final entry to mappings from break location to source file position.
				937	// Need one extra because last break position returned is after the
				938	// last char in the data, not at the last char.
				939	tp.srcLine->addElement(lineNum, status);
				940	tp.srcCol ->addElement(column, status);
				941
				942	parseState = PARSE_TAG;
				943	charIdx += 6;
				944
				945	if (!skipTest) {
				946	// RUN THE TEST!
				947	status = U_ZERO_ERROR;
				948	tp.setUTF16(status);
				949	executeTest(&tp, status);
				950	TEST_ASSERT_SUCCESS(status);
				951
				952	// Run again, this time with UTF-8 text wrapped in a UText.
				953	status = U_ZERO_ERROR;
				954	tp.setUTF8(status);
				955	TEST_ASSERT_SUCCESS(status);
				956	executeTest(&tp, status);
				957	}
				958	break;
				959	}
				960
				961	if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
				962	// Named character, e.g. \N{COMBINING GRAVE ACCENT}
				963	// Get the code point from the name and insert it into the test data.
				964	// (Damn, no API takes names in Unicode !!!
				965	// we've got to take it back to char *)
				966	int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
				967	int32_t nameLength = nameEndIdx - (charIdx+2);
				968	char charNameBuf[200];
				969	UChar32 theChar = -1;
				970	if (nameEndIdx != -1) {
				971	UErrorCode status = U_ZERO_ERROR;
				972	testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
				973	charNameBuf[sizeof(charNameBuf)-1] = 0;
				974	theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
				975	if (U_FAILURE(status)) {
				976	theChar = -1;
				977	}
				978	}
				979	if (theChar == -1) {
				980	errln("Error in named character in test file at line %d, col %d",
				981	lineNum, column);
				982	} else {
				983	// Named code point was recognized. Insert it
				984	// into the test data.
				985	tp.dataToBreak.append(theChar);
				986	while (tp.dataToBreak.length() > tp.srcLine->size()) {
				987	tp.srcLine->addElement(lineNum, status);
				988	tp.srcCol ->addElement(column, status);
				989	}
				990	}
				991	if (nameEndIdx > charIdx) {
				992	charIdx = nameEndIdx+1;
				993
				994	}
				995	break;
				996	}
				997
				998
				999
				1000	if (testString.compare(charIdx-1, 2, u"<>") == 0) {
				1001	charIdx++;
				1002	int32_t breakIdx = tp.dataToBreak.length();
				1003	tp.expectedBreaks->setSize(breakIdx+1);
				1004	tp.expectedBreaks->setElementAt(-1, breakIdx);
				1005	tp.srcLine->setSize(breakIdx+1);
				1006	tp.srcLine->setElementAt(lineNum, breakIdx);
				1007	tp.srcCol ->setSize(breakIdx+1);
				1008	tp.srcCol ->setElementAt(column, breakIdx);
				1009	break;
				1010	}
				1011
				1012	if (c == u'<') {
				1013	tagValue = 0;
				1014	parseState = PARSE_NUM;
				1015	break;
				1016	}
				1017
				1018	if (c == u'#' && column==3) { // TODO: why is column off so far?
				1019	parseState = PARSE_COMMENT;
				1020	savedState = PARSE_DATA;
				1021	break;
				1022	}
				1023
				1024	if (c == u'\\') {
				1025	// Check for \ at end of line, a line continuation.
				1026	// Advance over (discard) the newline
				1027	UChar32 cp = testString.char32At(charIdx);
				1028	if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
				1029	// We have a CR LF
				1030	// Need an extra increment of the input ptr to move over both of them
				1031	charIdx++;
				1032	}
				1033	if (cp == u'\n' \|\| cp == u'\r') {
				1034	lineNum++;
				1035	colStart = charIdx;
				1036	charIdx++;
				1037	break;
				1038	}
				1039
				1040	// Let unescape handle the back slash.
				1041	cp = testString.unescapeAt(charIdx);
				1042	if (cp != -1) {
				1043	// Escape sequence was recognized. Insert the char
				1044	// into the test data.
				1045	tp.dataToBreak.append(cp);
				1046	while (tp.dataToBreak.length() > tp.srcLine->size()) {
				1047	tp.srcLine->addElement(lineNum, status);
				1048	tp.srcCol ->addElement(column, status);
				1049	}
				1050	break;
				1051	}
				1052
				1053
				1054	// Not a recognized backslash escape sequence.
				1055	// Take the next char as a literal.
				1056	// TODO: Should this be an error?
				1057	c = testString.charAt(charIdx);
				1058	charIdx = testString.moveIndex32(charIdx, 1);
				1059	}
				1060
				1061	// Normal, non-escaped data char.
				1062	tp.dataToBreak.append(c);
				1063
				1064	// Save the mapping from offset in the data to line/column numbers in
				1065	// the original input file. Will be used for better error messages only.
				1066	// If there's an expected break before this char, the slot in the mapping
				1067	// vector will already be set for this char; don't overwrite it.
				1068	if (tp.dataToBreak.length() > tp.srcLine->size()) {
				1069	tp.srcLine->addElement(lineNum, status);
				1070	tp.srcCol ->addElement(column, status);
				1071	}
				1072	break;
				1073
				1074
				1075	case PARSE_NUM:
				1076	// We are parsing an expected numeric tag value, like <1234>,
				1077	// within a chunk of data.
				1078	if (u_isUWhiteSpace(c)) {
				1079	break;
				1080	}
				1081
				1082	if (c == u'>') {
				1083	// Finished the number. Add the info to the expected break data,
				1084	// and switch parse state back to doing plain data.
				1085	parseState = PARSE_DATA;
				1086	if (tagValue == 0) {
				1087	tagValue = -1;
				1088	}
				1089	int32_t breakIdx = tp.dataToBreak.length();
				1090	if (tp.expectedBreaks->size() > breakIdx) {
				1091	errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
				1092	lineNum, column);
				1093	}
				1094	tp.expectedBreaks->setSize(breakIdx+1);
				1095	tp.expectedBreaks->setElementAt(tagValue, breakIdx);
				1096	tp.srcLine->setSize(breakIdx+1);
				1097	tp.srcLine->setElementAt(lineNum, breakIdx);
				1098	tp.srcCol ->setSize(breakIdx+1);
				1099	tp.srcCol ->setElementAt(column, breakIdx);
				1100	break;
				1101	}
				1102
				1103	if (u_isdigit(c)) {
				1104	tagValue = tagValue*10 + u_charDigitValue(c);
				1105	break;
				1106	}
				1107
				1108	errln("Syntax Error in test file at line %d, col %d",
				1109	lineNum, column);
				1110	parseState = PARSE_COMMENT;
				1111	goto end_test; // Stop the test
				1112	break;
				1113	}
				1114
				1115
				1116	if (U_FAILURE(status)) {
				1117	dataerrln("ICU Error %s while parsing test file at line %d.",
				1118	u_errorName(status), lineNum);
				1119	status = U_ZERO_ERROR;
				1120	goto end_test; // Stop the test
				1121	}
				1122
				1123	}
				1124
				1125	// Reached end of test file. Raise an error if parseState indicates that we are
				1126	// within a block that should have been terminated.
				1127
				1128	if (parseState == PARSE_RULES) {
				1129	errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
				1130	lineNum, rulesFirstLine);
				1131	}
				1132	if (parseState == PARSE_DATA) {
				1133	errln("rbbitst.txt:%d <data> block not closed.", lineNum);
				1134	}
				1135
				1136
				1137	end_test:
				1138	delete [] testFile;
				1139	#endif
				1140	}
				1141
				1142	//-------------------------------------------------------------------------------
				1143	//
				1144	// TestDictRules create a break iterator from source rules that includes a
				1145	// dictionary range. Regression for bug #7130. Source rules
				1146	// do not declare a break iterator type (word, line, sentence, etc.),
				1147	// but the dictionary code, without a type, would loop.
				1148	//
				1149	//-------------------------------------------------------------------------------
				1150	void RBBITest::TestDictRules() {
				1151	const char *rules = "$dictionary = [a-z]; \n"
				1152	"!!forward; \n"
				1153	"$dictionary $dictionary; \n"
				1154	"!!reverse; \n"
				1155	"$dictionary $dictionary; \n";
				1156	const char *text = "aa";
				1157	UErrorCode status = U_ZERO_ERROR;
				1158	UParseError parseError;
				1159
				1160	RuleBasedBreakIterator bi(rules, parseError, status);
				1161	if (U_SUCCESS(status)) {
				1162	UnicodeString utext = text;
				1163	bi.setText(utext);
				1164	int32_t position;
				1165	int32_t loops;
				1166	for (loops = 0; loops<10; loops++) {
				1167	position = bi.next();
				1168	if (position == RuleBasedBreakIterator::DONE) {
				1169	break;
				1170	}
				1171	}
				1172	TEST_ASSERT(loops == 1);
				1173	} else {
				1174	dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
				1175	}
				1176	}
				1177
				1178
				1179
				1180	//--------------------------------------------------------------------------------------------
				1181	//
				1182	// Run tests from each of the boundary test data files distributed by the Unicode Consortium
				1183	//
				1184	//-------------------------------------------------------------------------------------------
				1185	void RBBITest::TestUnicodeFiles() {
				1186	RuleBasedBreakIterator *bi;
				1187	UErrorCode status = U_ZERO_ERROR;
				1188
				1189	bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
				1190	TEST_ASSERT_SUCCESS(status);
				1191	if (U_SUCCESS(status)) {
				1192	runUnicodeTestData("GraphemeBreakTest.txt", bi);
				1193	}
				1194	delete bi;
				1195
				1196	bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
				1197	TEST_ASSERT_SUCCESS(status);
				1198	if (U_SUCCESS(status)) {
				1199	runUnicodeTestData("WordBreakTest.txt", bi);
				1200	}
				1201	delete bi;
				1202
				1203	bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
				1204	TEST_ASSERT_SUCCESS(status);
				1205	if (U_SUCCESS(status)) {
				1206	runUnicodeTestData("SentenceBreakTest.txt", bi);
				1207	}
				1208	delete bi;
				1209
				1210	bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
				1211	TEST_ASSERT_SUCCESS(status);
				1212	if (U_SUCCESS(status)) {
				1213	runUnicodeTestData("LineBreakTest.txt", bi);
				1214	}
				1215	delete bi;
				1216	}
				1217
				1218
				1219	// Check for test cases from the Unicode test data files that are known to fail
				1220	// and should be skipped as known issues because ICU does not fully implement
				1221	// the Unicode specifications, or because ICU includes tailorings that differ from
				1222	// the Unicode standard.
				1223	//
				1224	// Test cases are identified by the test data sequence, which tends to be more stable
				1225	// across Unicode versions than the test file line numbers.
				1226	//
				1227	// The test case with ticket "10666" is a dummy, included as an example.
				1228
				1229	UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
				1230	static struct TestCase {
				1231	const char *fTicketNum;
				1232	const char *fFileName;
				1233	const UChar *fString;
				1234	} badTestCases[] = {
				1235	{"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
				1236	// The following tests were originally for
				1237	// Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
				1238	// However, that ticket has been closed as fixed but these tests still fail, so
				1239	// ICU-21097 has been created to investigate and address these remaining issues.
				1240	{"21097", "LineBreakTest.txt", u"-#"},
				1241	{"21097", "LineBreakTest.txt", u"\u002d\u0308\u0023"},
				1242	{"21097", "LineBreakTest.txt", u"\u002d\u00a7"},
				1243	{"21097", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
				1244	{"21097", "LineBreakTest.txt", u"\u002d\U00050005"},
				1245	{"21097", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
				1246	{"21097", "LineBreakTest.txt", u"\u002d\u0e01"},
				1247	{"21097", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
				1248
				1249	// The following tests were originally for
				1250	// Issue ICU-12017 Improve line break around numbers.
				1251	// However, that ticket has been closed as fixed but these tests still fail, so
				1252	// ICU-21097 has been created to investigate and address these remaining issues.
				1253	{"21097", "LineBreakTest.txt", u"\u002C\u0030"}, // ",0"
				1254	{"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
				1255	{"21097", "LineBreakTest.txt", u"equals .35 cents"},
				1256	{"21097", "LineBreakTest.txt", u"a.2 "},
				1257	{"21097", "LineBreakTest.txt", u"a.2 \u0915"},
				1258	{"21097", "LineBreakTest.txt", u"a.2 \u672C"},
				1259	{"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
				1260	{"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
				1261	{"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
				1262	{"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
				1263	{"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
				1264	{"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
				1265	{"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
				1266	{"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1267
				1268	// ICU-22127 until UAX #29 wordbreak is update for the colon changes in ICU-22112,
				1269	// need to skip some tests in WordBreakTest.txt
				1270	{"22127", "WordBreakTest.txt", u"a:"},
				1271	{"22127", "WordBreakTest.txt", u"A:"},
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1272	};
				1273
				1274	for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
				1275	const TestCase &badCase = badTestCases[n];
				1276	if (!strcmp(fileName, badCase.fFileName) &&
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1277	testCase.startsWith(UnicodeString(badCase.fString))) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1278	return logKnownIssue(badCase.fTicketNum);
				1279	}
				1280	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1281	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1282	}
				1283
				1284
				1285	//--------------------------------------------------------------------------------------------
				1286	//
				1287	// Run tests from one of the boundary test data files distributed by the Unicode Consortium
				1288	//
				1289	//-------------------------------------------------------------------------------------------
				1290	void RBBITest::runUnicodeTestData(const char fileName, RuleBasedBreakIterator bi) {
				1291	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
				1292	UErrorCode status = U_ZERO_ERROR;
				1293
				1294	//
				1295	// Open and read the test data file, put it into a UnicodeString.
				1296	//
				1297	const char *testDataDirectory = IntlTest::getSourceTestData(status);
				1298	char testFileName[1000];
				1299	if (testDataDirectory == NULL \|\| strlen(testDataDirectory) >= sizeof(testFileName)) {
				1300	dataerrln("Can't open test data. Path too long.");
				1301	return;
				1302	}
				1303	strcpy(testFileName, testDataDirectory);
				1304	strcat(testFileName, fileName);
				1305
				1306	logln("Opening data file %s\n", fileName);
				1307
				1308	int len;
				1309	UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
				1310	if (status != U_FILE_ACCESS_ERROR) {
				1311	TEST_ASSERT_SUCCESS(status);
				1312	TEST_ASSERT(testFile != NULL);
				1313	}
				1314	if (U_FAILURE(status) \|\| testFile == NULL) {
				1315	return; /* something went wrong, error already output */
				1316	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1317	UnicodeString testFileAsString(true, testFile, len);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1318
				1319	//
				1320	// Parse the test data file using a regular expression.
				1321	// Each kind of token is recognized in its own capture group; what type of item was scanned
				1322	// is identified by which group had a match.
				1323	//
				1324	// Capture Group # 1 2 3 4 5
				1325	// Parses this item: divide x hex digits comment \n unrecognized \n
				1326	//
				1327	UnicodeString tokenExpr("[ \t](?:(\\u00F7)\|(\\u00D7)\|([0-9a-fA-F]+)\|((?:#.?)?$.)\|(.*?$.))", -1, US_INV);
				1328	RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE \| UREGEX_DOTALL, status);
				1329	UnicodeString testString;
				1330	UVector32 breakPositions(status);
				1331	int lineNumber = 1;
				1332	TEST_ASSERT_SUCCESS(status);
				1333	if (U_FAILURE(status)) {
				1334	return;
				1335	}
				1336
				1337	//
				1338	// Scan through each test case, building up the string to be broken in testString,
				1339	// and the positions that should be boundaries in the breakPositions vector.
				1340	//
				1341	int spin = 0;
				1342	while (tokenMatcher.find()) {
				1343	if(tokenMatcher.hitEnd()) {
				1344	/* Shouldn't Happen(TM). This means we didn't find the symbols we were looking for.
				1345	This occurred when the text file was corrupt (wasn't marked as UTF-8)
				1346	and caused an infinite loop here on EBCDIC systems!
				1347	*/
				1348	fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
				1349	// return;
				1350	}
				1351	if (tokenMatcher.start(1, status) >= 0) {
				1352	// Scanned a divide sign, indicating a break position in the test data.
				1353	if (testString.length()>0) {
				1354	breakPositions.addElement(testString.length(), status);
				1355	}
				1356	}
				1357	else if (tokenMatcher.start(2, status) >= 0) {
				1358	// Scanned an 'x', meaning no break at this position in the test data
				1359	// Nothing to be done here.
				1360	}
				1361	else if (tokenMatcher.start(3, status) >= 0) {
				1362	// Scanned Hex digits. Convert them to binary, append to the character data string.
				1363	const UnicodeString &hexNumber = tokenMatcher.group(3, status);
				1364	int length = hexNumber.length();
				1365	if (length<=8) {
				1366	char buf[10];
				1367	hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
				1368	UChar32 c = (UChar32)strtol(buf, NULL, 16);
				1369	if (c<=0x10ffff) {
				1370	testString.append(c);
				1371	} else {
				1372	errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
				1373	fileName, lineNumber);
				1374	}
				1375	} else {
				1376	errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
				1377	fileName, lineNumber);
				1378	}
				1379	}
				1380	else if (tokenMatcher.start(4, status) >= 0) {
				1381	// Scanned to end of a line, possibly skipping over a comment in the process.
				1382	// If the line from the file contained test data, run the test now.
				1383	if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
				1384	checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
				1385	}
				1386
				1387	// Clear out this test case.
				1388	// The string and breakPositions vector will be refilled as the next
				1389	// test case is parsed.
				1390	testString.remove();
				1391	breakPositions.removeAllElements();
				1392	lineNumber++;
				1393	} else {
				1394	// Scanner catchall. Something unrecognized appeared on the line.
				1395	char token[16];
				1396	UnicodeString uToken = tokenMatcher.group(0, status);
				1397	uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
				1398	token[sizeof(token)-1] = 0;
				1399	errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
				1400
				1401	// Clean up, in preparation for continuing with the next line.
				1402	testString.remove();
				1403	breakPositions.removeAllElements();
				1404	lineNumber++;
				1405	}
				1406	TEST_ASSERT_SUCCESS(status);
				1407	if (U_FAILURE(status)) {
				1408	break;
				1409	}
				1410	}
				1411
				1412	delete [] testFile;
				1413	#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
				1414	}
				1415
				1416	//--------------------------------------------------------------------------------------------
				1417	//
				1418	// checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
				1419	// test data files. Do only a simple, forward-only check -
				1420	// this test is mostly to check that ICU and the Unicode
				1421	// data agree with each other.
				1422	//
				1423	//--------------------------------------------------------------------------------------------
				1424	void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
				1425	const UnicodeString &testString, // Text data to be broken
				1426	UVector32 *breakPositions, // Positions where breaks should be found.
				1427	RuleBasedBreakIterator *bi) {
				1428	int32_t pos; // Break Position in the test string
				1429	int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
				1430	int32_t expectedPos; // Expected break position (index into test string)
				1431
				1432	bi->setText(testString);
				1433	pos = bi->first();
				1434	pos = bi->next();
				1435
				1436	while (pos != BreakIterator::DONE) {
				1437	if (expectedI >= breakPositions->size()) {
				1438	errln("Test file \"%s\", line %d, unexpected break found at position %d",
				1439	testFileName, lineNumber, pos);
				1440	break;
				1441	}
				1442	expectedPos = breakPositions->elementAti(expectedI);
				1443	if (pos < expectedPos) {
				1444	errln("Test file \"%s\", line %d, unexpected break found at position %d",
				1445	testFileName, lineNumber, pos);
				1446	break;
				1447	}
				1448	if (pos > expectedPos) {
				1449	errln("Test file \"%s\", line %d, failed to find expected break at position %d",
				1450	testFileName, lineNumber, expectedPos);
				1451	break;
				1452	}
				1453	pos = bi->next();
				1454	expectedI++;
				1455	}
				1456
				1457	if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
				1458	errln("Test file \"%s\", line %d, failed to find expected break at position %d",
				1459	testFileName, lineNumber, breakPositions->elementAti(expectedI));
				1460	}
				1461	}
				1462
				1463
				1464
				1465	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
				1466	//---------------------------------------------------------------------------------------
				1467	//
				1468	// class RBBIMonkeyKind
				1469	//
				1470	// Monkey Test for Break Iteration
				1471	// Abstract interface class. Concrete derived classes independently
				1472	// implement the break rules for different iterator types.
				1473	//
				1474	// The Monkey Test itself doesn't know which type of break iterator it is
				1475	// testing, but works purely in terms of the interface defined here.
				1476	//
				1477	//---------------------------------------------------------------------------------------
				1478	class RBBIMonkeyKind {
				1479	public:
				1480	// Return a UVector of UnicodeSets, representing the character classes used
				1481	// for this type of iterator.
				1482	virtual UVector *charClasses() = 0;
				1483
				1484	// Set the test text on which subsequent calls to next() will operate
				1485	virtual void setText(const UnicodeString &s) = 0;
				1486
				1487	// Find the next break position, starting from the prev break position, or from zero.
				1488	// Return -1 after reaching end of string.
				1489	virtual int32_t next(int32_t i) = 0;
				1490
				1491	// Name of each character class, parallel with charClasses. Used for debugging output
				1492	// of characters.
				1493	virtual std::vector<std::string>& characterClassNames();
				1494
				1495	void setAppliedRule(int32_t position, const char* value);
				1496
				1497	std::string getAppliedRule(int32_t position);
				1498
				1499	virtual ~RBBIMonkeyKind();
				1500	UErrorCode deferredStatus;
				1501
				1502	std::string classNameFromCodepoint(const UChar32 c);
				1503	unsigned int maxClassNameSize();
				1504
				1505	protected:
				1506	RBBIMonkeyKind();
				1507	std::vector<std::string> classNames;
				1508	std::vector<std::string> appliedRules;
				1509
				1510	// Clear `appliedRules` and fill it with empty strings in the size of test text.
				1511	void prepareAppliedRules(int32_t size );
				1512
				1513	private:
				1514
				1515	};
				1516
				1517	RBBIMonkeyKind::RBBIMonkeyKind() {
				1518	deferredStatus = U_ZERO_ERROR;
				1519	}
				1520
				1521	RBBIMonkeyKind::~RBBIMonkeyKind() {
				1522	}
				1523
				1524	std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
				1525	return classNames;
				1526	}
				1527
				1528	void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
				1529	// Remove all the information in the `appliedRules`.
				1530	appliedRules.clear();
				1531	appliedRules.resize(size + 1);
				1532	}
				1533
				1534	void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
				1535	appliedRules[position] = value;
				1536	}
				1537
				1538	std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
				1539	return appliedRules[position];
				1540	}
				1541
				1542	std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
				1543	// Simply iterate through charClasses to find character's class
				1544	for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
				1545	UnicodeSet classSet = (UnicodeSet )charClasses()->elementAt(aClassNum);
				1546	if (classSet->contains(c)) {
				1547	return classNames[aClassNum];
				1548	}
				1549	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1550	U_ASSERT(false); // This should not happen.
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1551	return "bad class name";
				1552	}
				1553
				1554	unsigned int RBBIMonkeyKind::maxClassNameSize() {
				1555	unsigned int maxSize = 0;
				1556	for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
				1557	auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
				1558	if (aClassNumSize > maxSize) {
				1559	maxSize = aClassNumSize;
				1560	}
				1561	}
				1562	return maxSize;
				1563	}
				1564
				1565	//----------------------------------------------------------------------------------------
				1566	//
				1567	// Random Numbers. Similar to standard lib rand() and srand()
				1568	// Not using library to
				1569	// 1. Get same results on all platforms.
				1570	// 2. Get access to current seed, to more easily reproduce failures.
				1571	//
				1572	//---------------------------------------------------------------------------------------
				1573	static uint32_t m_seed = 1;
				1574
				1575	static uint32_t m_rand()
				1576	{
				1577	m_seed = m_seed * 1103515245 + 12345;
				1578	return (uint32_t)(m_seed/65536) % 32768;
				1579	}
				1580
				1581
				1582	//------------------------------------------------------------------------------------------
				1583	//
				1584	// class RBBICharMonkey Character (Grapheme Cluster) specific implementation
				1585	// of RBBIMonkeyKind.
				1586	//
				1587	//------------------------------------------------------------------------------------------
				1588	class RBBICharMonkey: public RBBIMonkeyKind {
				1589	public:
				1590	RBBICharMonkey();
				1591	virtual ~RBBICharMonkey();
				1592	virtual UVector *charClasses() override;
				1593	virtual void setText(const UnicodeString &s) override;
				1594	virtual int32_t next(int32_t i) override;
				1595	private:
				1596	UVector *fSets;
				1597
				1598	UnicodeSet *fCRLFSet;
				1599	UnicodeSet *fControlSet;
				1600	UnicodeSet *fExtendSet;
				1601	UnicodeSet *fZWJSet;
				1602	UnicodeSet *fRegionalIndicatorSet;
				1603	UnicodeSet *fPrependSet;
				1604	UnicodeSet *fSpacingSet;
				1605	UnicodeSet *fLSet;
				1606	UnicodeSet *fVSet;
				1607	UnicodeSet *fTSet;
				1608	UnicodeSet *fLVSet;
				1609	UnicodeSet *fLVTSet;
				1610	UnicodeSet *fHangulSet;
				1611	UnicodeSet *fExtendedPictSet;
				1612	UnicodeSet *fViramaSet;
				1613	UnicodeSet *fLinkingConsonantSet;
				1614	UnicodeSet *fExtCccZwjSet;
				1615	UnicodeSet *fAnySet;
				1616
				1617	const UnicodeString *fText;
				1618	};
				1619
				1620
				1621	RBBICharMonkey::RBBICharMonkey() {
				1622	UErrorCode status = U_ZERO_ERROR;
				1623
				1624	fText = NULL;
				1625
				1626	fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
				1627	fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
				1628	fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
				1629	fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
				1630	fRegionalIndicatorSet =
				1631	new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
				1632	fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
				1633	fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
				1634	fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
				1635	fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
				1636	fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
				1637	fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
				1638	fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
				1639	fHangulSet = new UnicodeSet();
				1640	fHangulSet->addAll(*fLSet);
				1641	fHangulSet->addAll(*fVSet);
				1642	fHangulSet->addAll(*fTSet);
				1643	fHangulSet->addAll(*fLVSet);
				1644	fHangulSet->addAll(*fLVTSet);
				1645
				1646	fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
				1647	fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
				1648	"\\p{Indic_Syllabic_Category=Virama}]", status);
				1649	fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
				1650	"\\p{Indic_Syllabic_Category=Consonant}]", status);
				1651	fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
				1652	fAnySet = new UnicodeSet(0, 0x10ffff);
				1653
				1654	// Create sets of characters, and add the names of the above character sets.
				1655	// In each new ICU release, add new names corresponding to the sets above.
				1656	fSets = new UVector(status);
				1657
				1658	// Important: Keep class names the same as the class contents.
				1659	fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
				1660	fSets->addElement(fControlSet, status); classNames.push_back("Control");
				1661	fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
				1662	fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
				1663	if (!fPrependSet->isEmpty()) {
				1664	fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
				1665	}
				1666	fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
				1667	fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
				1668	fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
				1669	fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
				1670	fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
				1671	fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
				1672	fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
				1673	fSets->addElement(fAnySet, status); classNames.push_back("Any");
				1674
				1675	if (U_FAILURE(status)) {
				1676	deferredStatus = status;
				1677	}
				1678	}
				1679
				1680
				1681	void RBBICharMonkey::setText(const UnicodeString &s) {
				1682	fText = &s;
				1683	prepareAppliedRules(s.length());
				1684	}
				1685
				1686
				1687
				// Reference ("monkey") implementation of grapheme cluster boundary detection.
				// Starting from the previous boundary prevPos, walks forward through fText one
				// code point at a time, testing the rules below in order, and returns the
				// index of the first position at which a rule forces a break. Each tested
				// position is tagged via setAppliedRule() with the rule that decided it.
				// Returns -1 if deferredStatus holds a failure or if prevPos is at or past the
				// end of the text.
				1688	int32_t RBBICharMonkey::next(int32_t prevPos) {
				1689	int p0, p1, p2, p3; // Indices of the significant code points around the
				1690	// break position being tested. The candidate break
				1691	// location is before p2.
				1692
				1693	int breakPos = -1;
				1694
				1695	UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
				1696	UChar32 cBase; // for (X Extend*) patterns, the X character.
				1697
				1698	if (U_FAILURE(deferredStatus)) {
				1699	return -1;
				1700	}
				1701
				1702	// Previous break at end of string. return DONE.
				1703	if (prevPos >= fText->length()) {
				1704	return -1;
				1705	}
				1706
				// All four positions start at prevPos; the loop below shifts them forward
				// until p1 and p2 are distinct.
				1707	p0 = p1 = p2 = p3 = prevPos;
				1708	c3 = fText->char32At(prevPos);
				1709	c0 = c1 = c2 = cBase = 0;
				1710	(void)p0; // suppress set but not used warning.
				1711	(void)c0;
				1712
				1713	// Loop runs once per "significant" character position in the input text.
				// Inside the loop, `continue` means "no break before p2, keep scanning" and
				// `break` means "p2 is the boundary to return".
				1714	for (;;) {
				1715	// Move all of the positions forward in the input string.
				1716	p0 = p1; c0 = c1;
				1717	p1 = p2; c1 = c2;
				1718	p2 = p3; c2 = c3;
				1719
				1720	// Advance p3 by one codepoint
				1721	p3 = fText->moveIndex32(p3, 1);
				1722	c3 = fText->char32At(p3);
				1723
				1724	if (p1 == p2) {
				1725	// Still warming up the loop. (won't work with zero length strings, but we don't care)
				1726	continue;
				1727	}
				1728
				1729	if (p2 == fText->length()) {
				1730	setAppliedRule(p2, "End of String");
				1731	break;
				1732	}
				1733
				1734	// No Extend or Format characters may appear between the CR and LF,
				1735	// which requires the additional check for p2 immediately following p1.
				1736	//
				1737	if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
				1738	setAppliedRule(p2, "GB3 CR x LF");
				1739	continue;
				1740	}
				1741
				1742	if (fControlSet->contains(c1) \|\|
				1743	c1 == 0x0D \|\|
				1744	c1 == 0x0A) {
				1745	setAppliedRule(p2, "GB4 ( Control \| CR \| LF ) <break>");
				1746	break;
				1747	}
				1748
				1749	if (fControlSet->contains(c2) \|\|
				1750	c2 == 0x0D \|\|
				1751	c2 == 0x0A) {
				1752	setAppliedRule(p2, "GB5 <break> ( Control \| CR \| LF )");
				1753	break;
				1754	}
				1755
				// GB6 - GB8: do not break inside Hangul syllable sequences.
				1756	if (fLSet->contains(c1) &&
				1757	(fLSet->contains(c2) \|\|
				1758	fVSet->contains(c2) \|\|
				1759	fLVSet->contains(c2) \|\|
				1760	fLVTSet->contains(c2))) {
				1761	setAppliedRule(p2, "GB6 L x ( L \| V \| LV \| LVT )");
				1762	continue;
				1763	}
				1764
				1765	if ((fLVSet->contains(c1) \|\| fVSet->contains(c1)) &&
				1766	(fVSet->contains(c2) \|\| fTSet->contains(c2))) {
				1767	setAppliedRule(p2, "GB7 ( LV \| V ) x ( V \| T )");
				1768	continue;
				1769	}
				1770
				1771	if ((fLVTSet->contains(c1) \|\| fTSet->contains(c1)) &&
				1772	fTSet->contains(c2)) {
				1773	setAppliedRule(p2, "GB8 ( LVT \| T) x T");
				1774	continue;
				1775	}
				1776
				// GB9: while here, remember in cBase the character that precedes a run of
				// Extend/ZWJ (any c1 that is not itself Extend); the GB11 test below reads it.
				1777	if (fExtendSet->contains(c2) \|\| fZWJSet->contains(c2)) {
				1778	if (!fExtendSet->contains(c1)) {
				1779	cBase = c1;
				1780	}
				1781	setAppliedRule(p2, "GB9 x (Extend \| ZWJ)");
				1782	continue;
				1783	}
				1784
				1785	if (fSpacingSet->contains(c2)) {
				1786	setAppliedRule(p2, "GB9a x SpacingMark");
				1787	continue;
				1788	}
				1789
				1790	if (fPrependSet->contains(c1)) {
				1791	setAppliedRule(p2, "GB9b Prepend x");
				1792	continue;
				1793	}
				1794
				1795	// Note: Viramas are also included in the ExtCccZwj class.
				// Scan backwards from p1 over ExtCccZwj characters, noting whether any of them
				// is a Virama. The scan stops at index 0 or at the first character outside
				// ExtCccZwj; the rule applies if a Virama was seen and the character the scan
				// stopped on is a LinkingConsonant.
				1796	if (fLinkingConsonantSet->contains(c2)) {
				1797	int pi = p1;
				1798	bool sawVirama = false;
				1799	while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
				1800	if (fViramaSet->contains(fText->char32At(pi))) {
				1801	sawVirama = true;
				1802	}
				1803	pi = fText->moveIndex32(pi, -1);
				1804	}
				1805	if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
				1806	setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
				1807	continue;
				1808	}
				1809	}
				1810
				1811	if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
				1812	setAppliedRule(p2, "GB11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
				1813	continue;
				1814	}
				1815
				1816	// Note: The first if condition is a little tricky. We only need to force
				1817	// a break if there are three or more contiguous RIs. If there are
				1818	// only two, a break following will occur via other rules, and will include
				1819	// any trailing extend characters, which is needed behavior.
				1820	if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
				1821	&& fRegionalIndicatorSet->contains(c2)) {
				1822	setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
				1823	break;
				1824	}
				1825	if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
				1826	setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
				1827	continue;
				1828	}
				1829
				// No rule above applied: break between c1 and c2.
				1830	setAppliedRule(p2, "GB999 Any <break> Any");
				1831	break;
				1832	}
				1833
				1834	breakPos = p2;
				1835	return breakPos;
				1836	}
				1837
				1838
				1839
				1840	UVector *RBBICharMonkey::charClasses() {
				1841	return fSets;
				1842	}
				1843
				1844	RBBICharMonkey::~RBBICharMonkey() {
				1845	delete fSets;
				1846	delete fCRLFSet;
				1847	delete fControlSet;
				1848	delete fExtendSet;
				1849	delete fRegionalIndicatorSet;
				1850	delete fPrependSet;
				1851	delete fSpacingSet;
				1852	delete fLSet;
				1853	delete fVSet;
				1854	delete fTSet;
				1855	delete fLVSet;
				1856	delete fLVTSet;
				1857	delete fHangulSet;
				1858	delete fAnySet;
				1859	delete fZWJSet;
				1860	delete fExtendedPictSet;
				1861	delete fViramaSet;
				1862	delete fLinkingConsonantSet;
				1863	delete fExtCccZwjSet;
				1864	}
				1865
				1866	//------------------------------------------------------------------------------------------
				1867	//
				1868	// class RBBIWordMonkey Word Break specific implementation
				1869	// of RBBIMonkeyKind.
				1870	//
				1871	//------------------------------------------------------------------------------------------
				1872	class RBBIWordMonkey: public RBBIMonkeyKind {
				1873	public:
				1874	RBBIWordMonkey();
				1875	virtual ~RBBIWordMonkey();
				1876	virtual UVector *charClasses() override;
				1877	virtual void setText(const UnicodeString &s) override;
				1878	virtual int32_t next(int32_t i) override;
				1879	private:
				1880	UVector *fSets;
				1881
				1882	UnicodeSet *fCRSet;
				1883	UnicodeSet *fLFSet;
				1884	UnicodeSet *fNewlineSet;
				1885	UnicodeSet *fRegionalIndicatorSet;
				1886	UnicodeSet *fKatakanaSet;
				1887	UnicodeSet *fHebrew_LetterSet;
				1888	UnicodeSet *fALetterSet;
				1889	UnicodeSet *fSingle_QuoteSet;
				1890	UnicodeSet *fDouble_QuoteSet;
				1891	UnicodeSet *fMidNumLetSet;
				1892	UnicodeSet *fMidLetterSet;
				1893	UnicodeSet *fMidNumSet;
				1894	UnicodeSet *fNumericSet;
				1895	UnicodeSet *fFormatSet;
				1896	UnicodeSet *fOtherSet = nullptr;
				1897	UnicodeSet *fExtendSet;
				1898	UnicodeSet *fExtendNumLetSet;
				1899	UnicodeSet *fWSegSpaceSet;
				1900	UnicodeSet *fDictionarySet = nullptr;
				1901	UnicodeSet *fZWJSet;
				1902	UnicodeSet *fExtendedPictSet;
				1903
				1904	const UnicodeString *fText;
				1905	};
				1906
				1907
				1908	RBBIWordMonkey::RBBIWordMonkey()
				1909	{
				1910	UErrorCode status = U_ZERO_ERROR;
				1911
				1912	fSets = new UVector(status);
				1913
				1914	fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
				1915	fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
				1916	fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
				1917	fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
				1918	fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
				1919	fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1920	fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter} @]", status);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1921	fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
				1922	fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
				1923	fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1924	fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\: \\uFE55 \\uFF1A]]", status);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1925	fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
				1926	fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
				1927	fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
				1928	fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
				1929	// There are some sc=Hani characters with WB=Extend.
				1930	// The break rules need to pick one or the other because
				1931	// Extend overlapping with something else is messy.
				1932	// For Unicode 13, we chose to keep U+16FF0 & U+16FF1
				1933	// in $Han (for $dictionary) and out of $Extend.
				1934	fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
				1935	fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
				1936
				1937	fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
				1938	fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
				1939	if(U_FAILURE(status)) {
				1940	IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
				1941	deferredStatus = status;
				1942	return;
				1943	}
				1944
				1945	fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
				1946	fDictionarySet->addAll(*fKatakanaSet);
				1947	fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
				1948
				1949	fALetterSet->removeAll(*fDictionarySet);
				1950
				1951	fOtherSet = new UnicodeSet();
				1952	if(U_FAILURE(status)) {
				1953	IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
				1954	deferredStatus = status;
				1955	return;
				1956	}
				1957
				1958	fOtherSet->complement();
				1959	fOtherSet->removeAll(*fCRSet);
				1960	fOtherSet->removeAll(*fLFSet);
				1961	fOtherSet->removeAll(*fNewlineSet);
				1962	fOtherSet->removeAll(*fKatakanaSet);
				1963	fOtherSet->removeAll(*fHebrew_LetterSet);
				1964	fOtherSet->removeAll(*fALetterSet);
				1965	fOtherSet->removeAll(*fSingle_QuoteSet);
				1966	fOtherSet->removeAll(*fDouble_QuoteSet);
				1967	fOtherSet->removeAll(*fMidLetterSet);
				1968	fOtherSet->removeAll(*fMidNumSet);
				1969	fOtherSet->removeAll(*fNumericSet);
				1970	fOtherSet->removeAll(*fExtendNumLetSet);
				1971	fOtherSet->removeAll(*fWSegSpaceSet);
				1972	fOtherSet->removeAll(*fFormatSet);
				1973	fOtherSet->removeAll(*fExtendSet);
				1974	fOtherSet->removeAll(*fRegionalIndicatorSet);
				1975	fOtherSet->removeAll(*fZWJSet);
				1976	fOtherSet->removeAll(*fExtendedPictSet);
				1977
				1978	// Inhibit dictionary characters from being tested at all.
				1979	fOtherSet->removeAll(*fDictionarySet);
				1980
				1981	// Add classes and their names
				1982	fSets->addElement(fCRSet, status); classNames.push_back("CR");
				1983	fSets->addElement(fLFSet, status); classNames.push_back("LF");
				1984	fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
				1985	fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
				1986	fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
				1987	fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
				1988	fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
				1989	fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
				1990	// Omit Katakana from fSets, which omits Katakana characters
				1991	// from the test data. They are all in the dictionary set,
				1992	// which this (old, to be retired) monkey test cannot handle.
				1993	//fSets->addElement(fKatakanaSet, status);
				1994
				1995	fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
				1996	fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
				1997	fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
				1998	fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
				1999	fSets->addElement(fFormatSet, status); classNames.push_back("Format");
				2000	fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
				2001	fSets->addElement(fOtherSet, status); classNames.push_back("Other");
				2002	fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
				2003	fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
				2004
				2005	fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
				2006	fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
				2007
				2008	if (U_FAILURE(status)) {
				2009	deferredStatus = status;
				2010	}
				2011	}
				2012
				2013	void RBBIWordMonkey::setText(const UnicodeString &s) {
				2014	fText = &s;
				2015	prepareAppliedRules(s.length());
				2016	}
				2017
				2018
				2019	int32_t RBBIWordMonkey::next(int32_t prevPos) {
				2020	int p0, p1, p2, p3; // Indices of the significant code points around the
				2021	// break position being tested. The candidate break
				2022	// location is before p2.
				2023
				2024	int breakPos = -1;
				2025
				2026	UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
				2027
				2028	if (U_FAILURE(deferredStatus)) {
				2029	return -1;
				2030	}
				2031
				2032	// Prev break at end of string. return DONE.
				2033	if (prevPos >= fText->length()) {
				2034	return -1;
				2035	}
				2036	p0 = p1 = p2 = p3 = prevPos;
				2037	c3 = fText->char32At(prevPos);
				2038	c0 = c1 = c2 = 0;
				2039	(void)p0; // Suppress set but not used warning.
				2040
				2041	// Loop runs once per "significant" character position in the input text.
				2042	for (;;) {
				2043	// Move all of the positions forward in the input string.
				2044	p0 = p1; c0 = c1;
				2045	p1 = p2; c1 = c2;
				2046	p2 = p3; c2 = c3;
				2047
				2048	// Advance p3 by X(Extend \| Format)* Rule 4
				2049	// But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
				2050	do {
				2051	p3 = fText->moveIndex32(p3, 1);
				2052	c3 = fText->char32At(p3);
				2053	if (fCRSet->contains(c2) \|\| fLFSet->contains(c2) \|\| fNewlineSet->contains(c2)) {
				2054	break;
				2055	}
				2056	}
				2057	while (fFormatSet->contains(c3) \|\| fExtendSet->contains(c3) \|\| fZWJSet->contains(c3));
				2058
				2059
				2060	if (p1 == p2) {
				2061	// Still warming up the loop. (won't work with zero length strings, but we don't care)
				2062	continue;
				2063	}
				2064
				2065	if (p2 == fText->length()) {
				2066	// Reached end of string. Always a break position.
				2067	break;
				2068	}
				2069
				2070	// No Extend or Format characters may appear between the CR and LF,
				2071	// which requires the additional check for p2 immediately following p1.
				2072	//
				2073	if (c1==0x0D && c2==0x0A) {
				2074	setAppliedRule(p2, "WB3 CR x LF");
				2075	continue;
				2076	}
				2077
				2078	if (fCRSet->contains(c1) \|\| fLFSet->contains(c1) \|\| fNewlineSet->contains(c1)) {
				2079	setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
				2080	break;
				2081	}
				2082	if (fCRSet->contains(c2) \|\| fLFSet->contains(c2) \|\| fNewlineSet->contains(c2)) {
				2083	setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
				2084	break;
				2085	}
				2086
				2087	// Not ignoring extend chars, so peek into input text to
				2088	// get the potential ZWJ, the character immediately preceding c2.
				2089	// Sloppy UChar32 indexing: p2-1 may reference trail half
				2090	// but char32At will get the full code point.
				2091	if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
				2092	setAppliedRule(p2, "WB3c ZWJ x Extended_Pictographic");
				2093	continue;
				2094	}
				2095
				2096	if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
				2097	setAppliedRule(p2, "WB3d Keep horizontal whitespace together.");
				2098	continue;
				2099	}
				2100
				2101	if ((fALetterSet->contains(c1) \|\| fHebrew_LetterSet->contains(c1)) &&
				2102	(fALetterSet->contains(c2) \|\| fHebrew_LetterSet->contains(c2))) {
				2103	setAppliedRule(p2, "WB4 (ALetter \| Hebrew_Letter) x (ALetter \| Hebrew_Letter)");
				2104	continue;
				2105	}
				2106
				2107	if ( (fALetterSet->contains(c1) \|\| fHebrew_LetterSet->contains(c1)) &&
				2108	(fMidLetterSet->contains(c2) \|\| fMidNumLetSet->contains(c2) \|\| fSingle_QuoteSet->contains(c2)) &&
				2109	(fALetterSet->contains(c3) \|\| fHebrew_LetterSet->contains(c3))) {
				2110	setAppliedRule(p2,
				2111	"WB6 (ALetter \| Hebrew_Letter) x (MidLetter \| MidNumLet \| Single_Quote) (ALetter _Letter)");
				2112	continue;
				2113	}
				2114
				2115	if ((fALetterSet->contains(c0) \|\| fHebrew_LetterSet->contains(c0)) &&
				2116	(fMidLetterSet->contains(c1) \|\| fMidNumLetSet->contains(c1) \|\| fSingle_QuoteSet->contains(c1)) &&
				2117	(fALetterSet->contains(c2) \|\| fHebrew_LetterSet->contains(c2))) {
				2118	setAppliedRule(p2,
				2119	"WB7 (ALetter \| Hebrew_Letter) (MidLetter \| MidNumLet \| Single_Quote) x (ALetter \| Hebrew_Letter)");
				2120	continue;
				2121	}
				2122
				2123	if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
				2124	setAppliedRule(p2, "WB7a Hebrew_Letter x Single_Quote");
				2125	continue;
				2126	}
				2127
				2128	if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
				2129	setAppliedRule(p2, "WB7b Hebrew_Letter x Double_Quote Hebrew_Letter");
				2130	continue;
				2131	}
				2132
				2133	if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
				2134	setAppliedRule(p2, "WB7c Hebrew_Letter Double_Quote x Hebrew_Letter");
				2135	continue;
				2136	}
				2137
				2138	if (fNumericSet->contains(c1) &&
				2139	fNumericSet->contains(c2)) {
				2140	setAppliedRule(p2, "WB8 Numeric x Numeric");
				2141	continue;
				2142	}
				2143
				2144	if ((fALetterSet->contains(c1) \|\| fHebrew_LetterSet->contains(c1)) &&
				2145	fNumericSet->contains(c2)) {
				2146	setAppliedRule(p2, "WB9 (ALetter \| Hebrew_Letter) x Numeric");
				2147	continue;
				2148	}
				2149
				2150	if (fNumericSet->contains(c1) &&
				2151	(fALetterSet->contains(c2) \|\| fHebrew_LetterSet->contains(c2))) {
				2152	setAppliedRule(p2, "WB10 Numeric x (ALetter \| Hebrew_Letter)");
				2153	continue;
				2154	}
				2155
				2156	if (fNumericSet->contains(c0) &&
				2157	(fMidNumSet->contains(c1) \|\| fMidNumLetSet->contains(c1) \|\| fSingle_QuoteSet->contains(c1)) &&
				2158	fNumericSet->contains(c2)) {
				2159	setAppliedRule(p2, "WB11 Numeric (MidNum \| MidNumLet \| Single_Quote) x Numeric");
				2160	continue;
				2161	}
				2162
				2163	if (fNumericSet->contains(c1) &&
				2164	(fMidNumSet->contains(c2) \|\| fMidNumLetSet->contains(c2) \|\| fSingle_QuoteSet->contains(c2)) &&
				2165	fNumericSet->contains(c3)) {
				2166	setAppliedRule(p2, "WB12 Numeric x (MidNum \| MidNumLet \| SingleQuote) Numeric");
				2167	continue;
				2168	}
				2169
				2170	// Note: matches UAX 29 rules, but doesn't come into play for ICU because
				2171	// all Katakana are handled by the dictionary breaker.
				2172	if (fKatakanaSet->contains(c1) &&
				2173	fKatakanaSet->contains(c2)) {
				2174	setAppliedRule(p2, "WB13 Katakana x Katakana");
				2175	continue;
				2176	}
				2177
				2178	if ((fALetterSet->contains(c1) \|\| fHebrew_LetterSet->contains(c1) \|\|fNumericSet->contains(c1) \|\|
				2179	fKatakanaSet->contains(c1) \|\| fExtendNumLetSet->contains(c1)) &&
				2180	fExtendNumLetSet->contains(c2)) {
				2181	setAppliedRule(p2,
				2182	"WB13a (ALetter \| Hebrew_Letter \| Numeric \| KataKana \| ExtendNumLet) x ExtendNumLet");
				2183	continue;
				2184	}
				2185
				2186	if (fExtendNumLetSet->contains(c1) &&
				2187	(fALetterSet->contains(c2) \|\| fHebrew_LetterSet->contains(c2) \|\|
				2188	fNumericSet->contains(c2) \|\| fKatakanaSet->contains(c2))) {
				2189	setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter \| Hebrew_Letter \| Numeric \| Katakana)");
				2190	continue;
				2191	}
				2192
				2193	if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
				2194	setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
				2195	break;
				2196	}
				2197	if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
				2198	setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
				2199	continue;
				2200	}
				2201
				2202	setAppliedRule(p2, "WB999");
				2203	break;
				2204	}
				2205
				2206	breakPos = p2;
				2207	return breakPos;
				2208	}
				2209
				2210
				2211	UVector *RBBIWordMonkey::charClasses() {
				2212	return fSets;
				2213	}
				2214
				2215	RBBIWordMonkey::~RBBIWordMonkey() {
				2216	delete fSets;
				2217	delete fCRSet;
				2218	delete fLFSet;
				2219	delete fNewlineSet;
				2220	delete fKatakanaSet;
				2221	delete fHebrew_LetterSet;
				2222	delete fALetterSet;
				2223	delete fSingle_QuoteSet;
				2224	delete fDouble_QuoteSet;
				2225	delete fMidNumLetSet;
				2226	delete fMidLetterSet;
				2227	delete fMidNumSet;
				2228	delete fNumericSet;
				2229	delete fFormatSet;
				2230	delete fExtendSet;
				2231	delete fExtendNumLetSet;
				2232	delete fWSegSpaceSet;
				2233	delete fRegionalIndicatorSet;
				2234	delete fDictionarySet;
				2235	delete fOtherSet;
				2236	delete fZWJSet;
				2237	delete fExtendedPictSet;
				2238	}
				2239
				2240
				2241
				2242
				2243	//------------------------------------------------------------------------------------------
				2244	//
				2245	// class RBBISentMonkey Sentence Break specific implementation
				2246	// of RBBIMonkeyKind.
				2247	//
				2248	//------------------------------------------------------------------------------------------
				2249	class RBBISentMonkey: public RBBIMonkeyKind {
				2250	public:
				2251	RBBISentMonkey();
				2252	virtual ~RBBISentMonkey();
				2253	virtual UVector *charClasses() override;
				2254	virtual void setText(const UnicodeString &s) override;
				2255	virtual int32_t next(int32_t i) override;
				2256	private:
				2257	int moveBack(int posFrom);
				2258	int moveForward(int posFrom);
				2259	UChar32 cAt(int pos);
				2260
				2261	UVector *fSets;
				2262
				2263	UnicodeSet *fSepSet;
				2264	UnicodeSet *fFormatSet;
				2265	UnicodeSet *fSpSet;
				2266	UnicodeSet *fLowerSet;
				2267	UnicodeSet *fUpperSet;
				2268	UnicodeSet *fOLetterSet;
				2269	UnicodeSet *fNumericSet;
				2270	UnicodeSet *fATermSet;
				2271	UnicodeSet *fSContinueSet;
				2272	UnicodeSet *fSTermSet;
				2273	UnicodeSet *fCloseSet;
				2274	UnicodeSet *fOtherSet;
				2275	UnicodeSet *fExtendSet;
				2276
				2277	const UnicodeString *fText;
				2278	};
				2279
				2280	RBBISentMonkey::RBBISentMonkey()
				2281	{
				2282	UErrorCode status = U_ZERO_ERROR;
				2283
				2284	fSets = new UVector(status);
				2285
				2286	// Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
				2287	// set and made into character classes of their own. For the monkey impl,
				2288	// they remain in SEP, since Sep always appears with CR and LF in the rules.
				2289	fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
				2290	fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
				2291	fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
				2292	fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
				2293	fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
				2294	fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
				2295	fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
				2296	fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
				2297	fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
				2298	fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
				2299	fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
				2300	fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
				2301	fOtherSet = new UnicodeSet();
				2302
				2303	if(U_FAILURE(status)) {
				2304	deferredStatus = status;
				2305	return;
				2306	}
				2307
				2308	fOtherSet->complement();
				2309	fOtherSet->removeAll(*fSepSet);
				2310	fOtherSet->removeAll(*fFormatSet);
				2311	fOtherSet->removeAll(*fSpSet);
				2312	fOtherSet->removeAll(*fLowerSet);
				2313	fOtherSet->removeAll(*fUpperSet);
				2314	fOtherSet->removeAll(*fOLetterSet);
				2315	fOtherSet->removeAll(*fNumericSet);
				2316	fOtherSet->removeAll(*fATermSet);
				2317	fOtherSet->removeAll(*fSContinueSet);
				2318	fOtherSet->removeAll(*fSTermSet);
				2319	fOtherSet->removeAll(*fCloseSet);
				2320	fOtherSet->removeAll(*fExtendSet);
				2321
				2322	fSets->addElement(fSepSet, status); classNames.push_back("Sep");
				2323	fSets->addElement(fFormatSet, status); classNames.push_back("Format");
				2324	fSets->addElement(fSpSet, status); classNames.push_back("Sp");
				2325	fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
				2326	fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
				2327	fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
				2328	fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
				2329	fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
				2330	fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
				2331	fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
				2332	fSets->addElement(fCloseSet, status); classNames.push_back("Close");
				2333	fSets->addElement(fOtherSet, status); classNames.push_back("Other");
				2334	fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
				2335
				2336	if (U_FAILURE(status)) {
				2337	deferredStatus = status;
				2338	}
				2339	}
				2340
				2341
				2342
				2343	void RBBISentMonkey::setText(const UnicodeString &s) {
				2344	fText = &s;
				2345	prepareAppliedRules(s.length());
				2346	}
				2347
				2348	UVector *RBBISentMonkey::charClasses() {
				2349	return fSets;
				2350	}
				2351
				2352	// moveBack() Find the "significant" code point preceding the index i.
				2353	// Skips over ($Extend \| $Format)* .
				2354	//
				2355	int RBBISentMonkey::moveBack(int i) {
				2356	if (i <= 0) {
				2357	return -1;
				2358	}
				2359	UChar32 c;
				2360	int32_t j = i;
				2361	do {
				2362	j = fText->moveIndex32(j, -1);
				2363	c = fText->char32At(j);
				2364	}
				2365	while (j>0 &&(fFormatSet->contains(c) \|\| fExtendSet->contains(c)));
				2366	return j;
				2367
				2368	}
				2369
				2370
				2371	int RBBISentMonkey::moveForward(int i) {
				2372	if (i>=fText->length()) {
				2373	return fText->length();
				2374	}
				2375	UChar32 c;
				2376	int32_t j = i;
				2377	do {
				2378	j = fText->moveIndex32(j, 1);
				2379	c = cAt(j);
				2380	}
				2381	while (fFormatSet->contains(c) \|\| fExtendSet->contains(c));
				2382	return j;
				2383	}
				2384
				2385	UChar32 RBBISentMonkey::cAt(int pos) {
				2386	if (pos<0 \|\| pos>=fText->length()) {
				2387	return -1;
				2388	} else {
				2389	return fText->char32At(pos);
				2390	}
				2391	}
				2392
				2393	int32_t RBBISentMonkey::next(int32_t prevPos) {
				2394	int p0, p1, p2, p3; // Indices of the significant code points around the
				2395	// break position being tested. The candidate break
				2396	// location is before p2.
				2397
				2398	int breakPos = -1;
				2399
				2400	UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
				2401	UChar32 c;
				2402
				2403	if (U_FAILURE(deferredStatus)) {
				2404	return -1;
				2405	}
				2406
				2407	// Prev break at end of string. return DONE.
				2408	if (prevPos >= fText->length()) {
				2409	return -1;
				2410	}
				2411	p0 = p1 = p2 = p3 = prevPos;
				2412	c3 = fText->char32At(prevPos);
				2413	c0 = c1 = c2 = 0;
				2414	(void)p0; // Suppress set but not used warning.
				2415
				2416	// Loop runs once per "significant" character position in the input text.
				2417	for (;;) {
				2418	// Move all of the positions forward in the input string.
				2419	p0 = p1; c0 = c1;
				2420	p1 = p2; c1 = c2;
				2421	p2 = p3; c2 = c3;
				2422
				2423	// Advance p3 by X(Extend \| Format)* Rule 4
				2424	p3 = moveForward(p3);
				2425	c3 = cAt(p3);
				2426
				2427	if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
				2428	setAppliedRule(p2, "SB3 CR x LF");
				2429	continue;
				2430	}
				2431
				2432	if (fSepSet->contains(c1)) {
				2433	p2 = p1+1; // Separators don't combine with Extend or Format.
				2434
				2435	setAppliedRule(p2, "SB4 Sep <break>");
				2436	break;
				2437	}
				2438
				2439	if (p2 >= fText->length()) {
				2440	// Reached end of string. Always a break position.
				2441	setAppliedRule(p2, "SB4 Sep <break>");
				2442	break;
				2443	}
				2444
				2445	if (p2 == prevPos) {
				2446	// Still warming up the loop. (won't work with zero length strings, but we don't care)
				2447	setAppliedRule(p2, "SB4 Sep <break>");
				2448	continue;
				2449	}
				2450
				2451	if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
				2452	setAppliedRule(p2, "SB6 ATerm x Numeric");
				2453	continue;
				2454	}
				2455
				2456	if ((fUpperSet->contains(c0) \|\| fLowerSet->contains(c0)) &&
				2457	fATermSet->contains(c1) && fUpperSet->contains(c2)) {
				2458	setAppliedRule(p2, "SB7 (Upper \| Lower) ATerm x Uppper");
				2459	continue;
				2460	}
				2461
				2462	// Note: STerm \| ATerm are added to the negated part of the expression by a
				2463	// note to the Unicode 5.0 documents.
				2464	int p8 = p1;
				2465	while (fSpSet->contains(cAt(p8))) {
				2466	p8 = moveBack(p8);
				2467	}
				2468	while (fCloseSet->contains(cAt(p8))) {
				2469	p8 = moveBack(p8);
				2470	}
				2471	if (fATermSet->contains(cAt(p8))) {
				2472	p8=p2;
				2473	for (;;) {
				2474	c = cAt(p8);
				2475	if (c==-1 \|\| fOLetterSet->contains(c) \|\| fUpperSet->contains(c) \|\|
				2476	fLowerSet->contains(c) \|\| fSepSet->contains(c) \|\|
				2477	fATermSet->contains(c) \|\| fSTermSet->contains(c)) {
				2478
				2479	setAppliedRule(p2,
				2480	"SB8 ATerm Close* Sp* x (not (OLettter \| Upper \| Lower \| Sep \| STerm \| ATerm))* ");
				2481	break;
				2482	}
				2483	p8 = moveForward(p8);
				2484	}
				2485	if (fLowerSet->contains(cAt(p8))) {
				2486
				2487	setAppliedRule(p2,
				2488	"SB8 ATerm Close* Sp* x (not (OLettter \| Upper \| Lower \| Sep \| STerm \| ATerm))* ");
				2489	continue;
				2490	}
				2491	}
				2492
				2493	if (fSContinueSet->contains(c2) \|\| fSTermSet->contains(c2) \|\| fATermSet->contains(c2)) {
				2494	p8 = p1;
				2495	while (fSpSet->contains(cAt(p8))) {
				2496	p8 = moveBack(p8);
				2497	}
				2498	while (fCloseSet->contains(cAt(p8))) {
				2499	p8 = moveBack(p8);
				2500	}
				2501	c = cAt(p8);
				2502	if (fSTermSet->contains(c) \|\| fATermSet->contains(c)) {
				2503	setAppliedRule(p2, "SB8a (STerm \| ATerm) Close* Sp* x (SContinue \| STerm \| ATerm)");
				2504	continue;
				2505	}
				2506	}
				2507
				2508	int p9 = p1;
				2509	while (fCloseSet->contains(cAt(p9))) {
				2510	p9 = moveBack(p9);
				2511	}
				2512	c = cAt(p9);
				2513	if ((fSTermSet->contains(c) \|\| fATermSet->contains(c))) {
				2514	if (fCloseSet->contains(c2) \|\| fSpSet->contains(c2) \|\| fSepSet->contains(c2)) {
				2515
				2516	setAppliedRule(p2, "SB9 (STerm \| ATerm) Close* x (Close \| Sp \| Sep \| CR \| LF)");
				2517	continue;
				2518	}
				2519	}
				2520
				2521	int p10 = p1;
				2522	while (fSpSet->contains(cAt(p10))) {
				2523	p10 = moveBack(p10);
				2524	}
				2525	while (fCloseSet->contains(cAt(p10))) {
				2526	p10 = moveBack(p10);
				2527	}
				2528	if (fSTermSet->contains(cAt(p10)) \|\| fATermSet->contains(cAt(p10))) {
				2529	if (fSpSet->contains(c2) \|\| fSepSet->contains(c2)) {
				2530	setAppliedRule(p2, "SB10 (Sterm \| ATerm) Close* Sp* x (Sp \| Sep \| CR \| LF)");
				2531	continue;
				2532	}
				2533	}
				2534
				2535	int p11 = p1;
				2536	if (fSepSet->contains(cAt(p11))) {
				2537	p11 = moveBack(p11);
				2538	}
				2539	while (fSpSet->contains(cAt(p11))) {
				2540	p11 = moveBack(p11);
				2541	}
				2542	while (fCloseSet->contains(cAt(p11))) {
				2543	p11 = moveBack(p11);
				2544	}
				2545	if (fSTermSet->contains(cAt(p11)) \|\| fATermSet->contains(cAt(p11))) {
				2546	setAppliedRule(p2, "SB11 (STerm \| ATerm) Close* Sp* (Sep \| CR \| LF)? <break>");
				2547	break;
				2548	}
				2549
				2550	setAppliedRule(p2, "SB12 Any x Any");
				2551	continue;
				2552	}
				2553
				2554	breakPos = p2;
				2555	return breakPos;
				2556	}
				2557
				2558	RBBISentMonkey::~RBBISentMonkey() {
				2559	delete fSets;
				2560	delete fSepSet;
				2561	delete fFormatSet;
				2562	delete fSpSet;
				2563	delete fLowerSet;
				2564	delete fUpperSet;
				2565	delete fOLetterSet;
				2566	delete fNumericSet;
				2567	delete fATermSet;
				2568	delete fSContinueSet;
				2569	delete fSTermSet;
				2570	delete fCloseSet;
				2571	delete fOtherSet;
				2572	delete fExtendSet;
				2573	}
				2574
				2575
				2576
				2577	//-------------------------------------------------------------------------------------------
				2578	//
				2579	// RBBILineMonkey
				2580	//
				2581	//-------------------------------------------------------------------------------------------
				2582
				2583	class RBBILineMonkey: public RBBIMonkeyKind {
				2584	public:
				2585	RBBILineMonkey();
				2586	virtual ~RBBILineMonkey();
				2587	virtual UVector *charClasses() override;
				2588	virtual void setText(const UnicodeString &s) override;
				2589	virtual int32_t next(int32_t i) override;
				2590	virtual void rule9Adjust(int32_t pos, UChar32 posChar, int32_t nextPos, UChar32 *nextChar);
				2591	private:
				2592	UVector *fSets;
				2593
				2594	UnicodeSet *fBK;
				2595	UnicodeSet *fCR;
				2596	UnicodeSet *fLF;
				2597	UnicodeSet *fCM;
				2598	UnicodeSet *fNL;
				2599	UnicodeSet *fSG;
				2600	UnicodeSet *fWJ;
				2601	UnicodeSet *fZW;
				2602	UnicodeSet *fGL;
				2603	UnicodeSet *fCB;
				2604	UnicodeSet *fSP;
				2605	UnicodeSet *fB2;
				2606	UnicodeSet *fBA;
				2607	UnicodeSet *fBB;
				2608	UnicodeSet *fHH;
				2609	UnicodeSet *fHY;
				2610	UnicodeSet *fH2;
				2611	UnicodeSet *fH3;
				2612	UnicodeSet *fCL;
				2613	UnicodeSet *fCP;
				2614	UnicodeSet *fEX;
				2615	UnicodeSet *fIN;
				2616	UnicodeSet *fJL;
				2617	UnicodeSet *fJV;
				2618	UnicodeSet *fJT;
				2619	UnicodeSet *fNS;
				2620	UnicodeSet *fOP;
				2621	UnicodeSet *fQU;
				2622	UnicodeSet *fIS;
				2623	UnicodeSet *fNU;
				2624	UnicodeSet *fPO;
				2625	UnicodeSet *fPR;
				2626	UnicodeSet *fSY;
				2627	UnicodeSet *fAI;
				2628	UnicodeSet *fAL;
				2629	UnicodeSet *fCJ;
				2630	UnicodeSet *fHL;
				2631	UnicodeSet *fID;
				2632	UnicodeSet *fRI;
				2633	UnicodeSet *fXX;
				2634	UnicodeSet *fEB;
				2635	UnicodeSet *fEM;
				2636	UnicodeSet *fZWJ;
				2637	UnicodeSet *fOP30;
				2638	UnicodeSet *fCP30;
				2639	UnicodeSet *fExtPictUnassigned;
				2640
				2641	BreakIterator *fCharBI;
				2642	const UnicodeString *fText;
				2643	RegexMatcher *fNumberMatcher;
				2644	};
				2645
				2646	RBBILineMonkey::RBBILineMonkey() :
				2647	RBBIMonkeyKind(),
				2648	fSets(NULL),
				2649
				2650	fCharBI(NULL),
				2651	fText(NULL),
				2652	fNumberMatcher(NULL)
				2653
				2654	{
				2655	if (U_FAILURE(deferredStatus)) {
				2656	return;
				2657	}
				2658
				2659	UErrorCode status = U_ZERO_ERROR;
				2660
				2661	fSets = new UVector(status);
				2662
				2663	fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
				2664	fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
				2665	fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
				2666	fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
				2667	fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
				2668	fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
				2669	fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
				2670	fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
				2671	fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
				2672	fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
				2673	fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
				2674	fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
				2675	fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
				2676	fHH = new UnicodeSet();
				2677	fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
				2678	fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
				2679	fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
				2680	fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
				2681	fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
				2682	fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
				2683	fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
				2684	fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
				2685	fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
				2686	fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
				2687	fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
				2688	fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
				2689	fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
				2690	fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
				2691	fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
				2692	fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
				2693	fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
				2694	fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
				2695	fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
				2696	fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
				2697	fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
				2698	fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
				2699	fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
				2700	fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
				2701	fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
				2702	fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
				2703	fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
				2704	fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
				2705	fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
				2706	fOP30 = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
				2707	fCP30 = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
				2708	fExtPictUnassigned = new UnicodeSet(u"[\\p{Extended_Pictographic}&\\p{Cn}]", status);
				2709
				2710	if (U_FAILURE(status)) {
				2711	deferredStatus = status;
				2712	return;
				2713	}
				2714
				2715	fAL->addAll(*fXX); // Default behavior for XX is identical to AL
				2716	fAL->addAll(*fAI); // Default behavior for AI is identical to AL
				2717	fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
				2718
				2719	fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
				2720	fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
				2721
				2722	fHH->add(u'\u2010'); // Hyphen, '‐'
				2723
				2724	// Sets and names.
				2725	fSets->addElement(fBK, status); classNames.push_back("fBK");
				2726	fSets->addElement(fCR, status); classNames.push_back("fCR");
				2727	fSets->addElement(fLF, status); classNames.push_back("fLF");
				2728	fSets->addElement(fCM, status); classNames.push_back("fCM");
				2729	fSets->addElement(fNL, status); classNames.push_back("fNL");
				2730	fSets->addElement(fWJ, status); classNames.push_back("fWJ");
				2731	fSets->addElement(fZW, status); classNames.push_back("fZW");
				2732	fSets->addElement(fGL, status); classNames.push_back("fGL");
				2733	fSets->addElement(fCB, status); classNames.push_back("fCB");
				2734	fSets->addElement(fSP, status); classNames.push_back("fSP");
				2735	fSets->addElement(fB2, status); classNames.push_back("fB2");
				2736	fSets->addElement(fBA, status); classNames.push_back("fBA");
				2737	fSets->addElement(fBB, status); classNames.push_back("fBB");
				2738	fSets->addElement(fHY, status); classNames.push_back("fHY");
				2739	fSets->addElement(fH2, status); classNames.push_back("fH2");
				2740	fSets->addElement(fH3, status); classNames.push_back("fH3");
				2741	fSets->addElement(fCL, status); classNames.push_back("fCL");
				2742	fSets->addElement(fCP, status); classNames.push_back("fCP");
				2743	fSets->addElement(fEX, status); classNames.push_back("fEX");
				2744	fSets->addElement(fIN, status); classNames.push_back("fIN");
				2745	fSets->addElement(fJL, status); classNames.push_back("fJL");
				2746	fSets->addElement(fJT, status); classNames.push_back("fJT");
				2747	fSets->addElement(fJV, status); classNames.push_back("fJV");
				2748	fSets->addElement(fNS, status); classNames.push_back("fNS");
				2749	fSets->addElement(fOP, status); classNames.push_back("fOP");
				2750	fSets->addElement(fQU, status); classNames.push_back("fQU");
				2751	fSets->addElement(fIS, status); classNames.push_back("fIS");
				2752	fSets->addElement(fNU, status); classNames.push_back("fNU");
				2753	fSets->addElement(fPO, status); classNames.push_back("fPO");
				2754	fSets->addElement(fPR, status); classNames.push_back("fPR");
				2755	fSets->addElement(fSY, status); classNames.push_back("fSY");
				2756	fSets->addElement(fAI, status); classNames.push_back("fAI");
				2757	fSets->addElement(fAL, status); classNames.push_back("fAL");
				2758	fSets->addElement(fHL, status); classNames.push_back("fHL");
				2759	fSets->addElement(fID, status); classNames.push_back("fID");
				2760	fSets->addElement(fRI, status); classNames.push_back("fRI");
				2761	fSets->addElement(fSG, status); classNames.push_back("fSG");
				2762	fSets->addElement(fEB, status); classNames.push_back("fEB");
				2763	fSets->addElement(fEM, status); classNames.push_back("fEM");
				2764	fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
				2765	// TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
				2766	fSets->addElement(fOP30, status); classNames.push_back("fOP30");
				2767	fSets->addElement(fCP30, status); classNames.push_back("fCP30");
				2768	fSets->addElement(fExtPictUnassigned, status); classNames.push_back("fExtPictUnassigned");
				2769
				2770	const char *rules =
				2771	"((\\p{Line_Break=PR}\|\\p{Line_Break=PO})(\\p{Line_Break=CM}\|\\u200d)*)?"
				2772	"((\\p{Line_Break=OP}\|\\p{Line_Break=HY})(\\p{Line_Break=CM}\|\\u200d)*)?"
				2773	"((\\p{Line_Break=IS})(\\p{Line_Break=CM}\|\\u200d)*)?"
				2774	"\\p{Line_Break=NU}(\\p{Line_Break=CM}\|\\u200d)*"
				2775	"((\\p{Line_Break=NU}\|\\p{Line_Break=IS}\|\\p{Line_Break=SY})(\\p{Line_Break=CM}\|\\u200d))"
				2776	"((\\p{Line_Break=CL}\|\\p{Line_Break=CP})(\\p{Line_Break=CM}\|\\u200d)*)?"
				2777	"((\\p{Line_Break=PR}\|\\p{Line_Break=PO})(\\p{Line_Break=CM}\|\\u200d)*)?";
				2778
				2779	fNumberMatcher = new RegexMatcher(
				2780	UnicodeString(rules, -1, US_INV), 0, status);
				2781
				2782	fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
				2783
				2784	if (U_FAILURE(status)) {
				2785	deferredStatus = status;
				2786	}
				2787
				2788	}
				2789
				2790
				2791	void RBBILineMonkey::setText(const UnicodeString &s) {
				2792	fText = &s;
				2793	fCharBI->setText(s);
				2794	prepareAppliedRules(s.length());
				2795	fNumberMatcher->reset(s);
				2796	}
				2797
				2798	//
				2799	// rule9Adjust
				2800	// Line Break TR rules 9 and 10 implementation.
				2801	// This deals with combining marks and other sequences that
				2802	// that must be treated as if they were something other than what they actually are.
				2803	//
				2804	// This is factored out into a separate function because it must be applied twice for
				2805	// each potential break, once to the chars before the position being checked, then
				2806	// again to the text following the possible break.
				2807	//
				2808	void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 posChar, int32_t nextPos, UChar32 *nextChar) {
				2809	if (pos == -1) {
				2810	// Invalid initial position. Happens during the warmup iteration of the
				2811	// main loop in next().
				2812	return;
				2813	}
				2814
				2815	int32_t nPos = *nextPos;
				2816
				2817	// LB 9 Keep combining sequences together.
				2818	// advance over any CM class chars. Note that Line Break CM is different
				2819	// from the normal Grapheme Extend property.
				2820	if (!(fSP->contains(posChar) \|\| fBK->contains(posChar) \|\| *posChar==0x0d \|\|
				2821	posChar==0x0a \|\|fNL->contains(posChar) \|\| fZW->contains(*posChar))) {
				2822	for (;;) {
				2823	*nextChar = fText->char32At(nPos);
				2824	if (!fCM->contains(*nextChar)) {
				2825	break;
				2826	}
				2827	nPos = fText->moveIndex32(nPos, 1);
				2828	}
				2829	}
				2830
				2831
				2832	// LB 9 Treat X CM* as if it were x.
				2833	// No explicit action required.
				2834
				2835	// LB 10 Treat any remaining combining mark as AL
				2836	if (fCM->contains(*posChar)) {
				2837	*posChar = u'A';
				2838	}
				2839
				2840	// Push the updated nextPos and nextChar back to our caller.
				2841	// This only makes a difference if posChar got bigger by consuming a
				2842	// combining sequence.
				2843	*nextPos = nPos;
				2844	*nextChar = fText->char32At(nPos);
				2845	}
				2846
				2847
				2848
				2849	int32_t RBBILineMonkey::next(int32_t startPos) {
				2850	UErrorCode status = U_ZERO_ERROR;
				2851	int32_t pos; // Index of the char following a potential break position
				2852	UChar32 thisChar; // Character at above position "pos"
				2853
				2854	int32_t prevPos; // Index of the char preceding a potential break position
				2855	UChar32 prevChar; // Character at above position. Note that prevChar
				2856	// and thisChar may not be adjacent because combining
				2857	// characters between them will be ignored.
				2858
				2859	int32_t prevPosX2; // Second previous character. Wider context for LB21a.
				2860	UChar32 prevCharX2;
				2861
				2862	int32_t nextPos; // Index of the next character following pos.
				2863	// Usually skips over combining marks.
				2864	int32_t nextCPPos; // Index of the code point following "pos."
				2865	// May point to a combining mark.
				2866	int32_t tPos; // temp value.
				2867	UChar32 c;
				2868
				2869	if (U_FAILURE(deferredStatus)) {
				2870	return -1;
				2871	}
				2872
				2873	if (startPos >= fText->length()) {
				2874	return -1;
				2875	}
				2876
				2877
				2878	// Initial values for loop. Loop will run the first time without finding breaks,
				2879	// while the invalid values shift out and the "this" and
				2880	// "prev" positions are filled in with good values.
				2881	pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
				2882	thisChar = prevChar = prevCharX2 = 0;
				2883	nextPos = nextCPPos = startPos;
				2884
				2885
				2886	// Loop runs once per position in the test text, until a break position
				2887	// is found.
				2888	for (;;) {
				2889	prevPosX2 = prevPos;
				2890	prevCharX2 = prevChar;
				2891
				2892	prevPos = pos;
				2893	prevChar = thisChar;
				2894
				2895	pos = nextPos;
				2896	thisChar = fText->char32At(pos);
				2897
				2898	nextCPPos = fText->moveIndex32(pos, 1);
				2899	nextPos = nextCPPos;
				2900
				2901
				2902	if (pos >= fText->length()) {
				2903	setAppliedRule(pos, "LB2 - Break at end of text.");
				2904	break;
				2905	}
				2906
				2907
				2908	// We do this one out-of-order because the adjustment does not change anything
				2909	// that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
				2910	// be applied.
				2911	rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
				2912	nextCPPos = nextPos = fText->moveIndex32(pos, 1);
				2913	c = fText->char32At(nextPos);
				2914	rule9Adjust(pos, &thisChar, &nextPos, &c);
				2915
				2916	// If the loop is still warming up - if we haven't shifted the initial
				2917	// -1 positions out of prevPos yet - loop back to advance the
				2918	// position in the input without any further looking for breaks.
				2919	if (prevPos == -1) {
				2920	setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
				2921	continue;
				2922	}
				2923
				2924
				2925	if (fBK->contains(prevChar)) {
				2926	setAppliedRule(pos, "LB 4 Always break after hard line breaks");
				2927	break;
				2928	}
				2929
				2930
				2931	if (prevChar == 0x0d && thisChar == 0x0a) {
				2932	setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
				2933	continue;
				2934	}
				2935	if (prevChar == 0x0d \|\|
				2936	prevChar == 0x0a \|\|
				2937	prevChar == 0x85) {
				2938	setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
				2939	break;
				2940	}
				2941
				2942
				2943	if (thisChar == 0x0d \|\| thisChar == 0x0a \|\| thisChar == 0x85 \|\|
				2944	fBK->contains(thisChar)) {
				2945	setAppliedRule(pos, "LB 6 Don't break before hard line breaks");
				2946	continue;
				2947	}
				2948
				2949
				2950	if (fSP->contains(thisChar)) {
				2951	setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
				2952	continue;
				2953	}
				2954
				2955	// !!! ??? Is this the right text for the applied rule?
				2956	if (fZW->contains(thisChar)) {
				2957	setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
				2958	continue;
				2959	}
				2960
				2961
				2962	// ZW SP* ÷
				2963	// Scan backwards from prevChar for SP* ZW
				2964	tPos = prevPos;
				2965	while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
				2966	tPos = fText->moveIndex32(tPos, -1);
				2967	}
				2968	if (fZW->contains(fText->char32At(tPos))) {
				2969	setAppliedRule(pos, "LB 8 Break after zero width space");
				2970	break;
				2971	}
				2972
				2973
				2974	// Move this test up, before LB8a, because numbers can match a longer sequence that would
				2975	// also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
				2976	if (fNumberMatcher->lookingAt(prevPos, status)) {
				2977	if (U_FAILURE(status)) {
				2978	setAppliedRule(pos, "LB 25 Numbers");
				2979	break;
				2980	}
				2981	// Matched a number. But could have been just a single digit, which would
				2982	// not represent a "no break here" between prevChar and thisChar
				2983	int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
				2984	if (numEndIdx > pos) {
				2985	// Number match includes at least our two chars being checked
				2986	if (numEndIdx > nextPos) {
				2987	// Number match includes additional chars. Update pos and nextPos
				2988	// so that next loop iteration will continue at the end of the number,
				2989	// checking for breaks between last char in number & whatever follows.
				2990	pos = nextPos = numEndIdx;
				2991	do {
				2992	pos = fText->moveIndex32(pos, -1);
				2993	thisChar = fText->char32At(pos);
				2994	} while (fCM->contains(thisChar));
				2995	}
				2996	setAppliedRule(pos, "LB 25 Numbers");
				2997	continue;
				2998	}
				2999	}
				3000
				3001
				3002	// The monkey test's way of ignoring combining characters doesn't work
				3003	// for this rule. ZJ is also a CM. Need to get the actual character
				3004	// preceding "thisChar", not ignoring combining marks, possibly ZJ.
				3005	{
				3006	int32_t prevIdx = fText->moveIndex32(pos, -1);
				3007	UChar32 prevC = fText->char32At(prevIdx);
				3008	if (fZWJ->contains(prevC)) {
				3009	setAppliedRule(pos, "LB 8a ZWJ x");
				3010	continue;
				3011	}
				3012	}
				3013
				3014
				3015	// appliedRule: "LB 9, 10"; // Already done, at top of loop.";
				3016	//
				3017
				3018
				3019	// x WJ
				3020	// WJ x
				3021	//
				3022	if (fWJ->contains(thisChar) \|\| fWJ->contains(prevChar)) {
				3023	setAppliedRule(pos, "LB 11 Do not break before or after WORD JOINER and related characters.");
				3024	continue;
				3025	}
				3026
				3027
				3028	if (fGL->contains(prevChar)) {
				3029	setAppliedRule(pos, "LB 12 GL x");
				3030	continue;
				3031	}
				3032
				3033
				3034	if (!(fSP->contains(prevChar) \|\|
				3035	fBA->contains(prevChar) \|\|
				3036	fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
				3037	setAppliedRule(pos, "LB 12a [^SP BA HY] x GL");
				3038	continue;
				3039	}
				3040
				3041
				3042	if (fCL->contains(thisChar) \|\|
				3043	fCP->contains(thisChar) \|\|
				3044	fEX->contains(thisChar) \|\|
				3045	fSY->contains(thisChar)) {
				3046	setAppliedRule(pos, "LB 13 Don't break before closings.");
				3047	continue;
				3048	}
				3049
				3050
				3051	// Scan backwards, checking for this sequence.
				3052	// The OP char could include combining marks, so we actually check for
				3053	// OP CM* SP*
				3054	// Another Twist: The Rule 9 fixes may have changed a SP CM
				3055	// sequence into a ID char, so before scanning back through spaces,
				3056	// verify that prevChar is indeed a space. The prevChar variable
				3057	// may differ from fText[prevPos]
				3058	tPos = prevPos;
				3059	if (fSP->contains(prevChar)) {
				3060	while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
				3061	tPos=fText->moveIndex32(tPos, -1);
				3062	}
				3063	}
				3064	while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
				3065	tPos=fText->moveIndex32(tPos, -1);
				3066	}
				3067	if (fOP->contains(fText->char32At(tPos))) {
				3068	setAppliedRule(pos, "LB 14 Don't break after OP SP*");
				3069	continue;
				3070	}
				3071
				3072
				3073	if (nextPos < fText->length()) {
				3074	// note: UnicodeString::char32At(length) returns ffff, not distinguishable
				3075	// from a legit ffff character. So test length separately.
				3076	UChar32 nextChar = fText->char32At(nextPos);
				3077	if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
				3078	setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
				3079	break;
				3080	}
				3081	}
				3082
				3083
				3084	if (fIS->contains(thisChar)) {
				3085	setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces.");
				3086	continue;
				3087	}
				3088
				3089
				3090	if (fOP->contains(thisChar)) {
				3091	// Scan backwards from prevChar to see if it is preceded by QU CM* SP*
				3092	int tPos = prevPos;
				3093	while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
				3094	tPos = fText->moveIndex32(tPos, -1);
				3095	}
				3096	while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
				3097	tPos = fText->moveIndex32(tPos, -1);
				3098	}
				3099	if (fQU->contains(fText->char32At(tPos))) {
				3100	setAppliedRule(pos, "LB 15 QU SP* x OP");
				3101	continue;
				3102	}
				3103	}
				3104
				3105
				3106	// Scan backwards for SP* CM* (CL \| CP)
				3107	if (fNS->contains(thisChar)) {
				3108	int tPos = prevPos;
				3109	while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
				3110	tPos = fText->moveIndex32(tPos, -1);
				3111	}
				3112	while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
				3113	tPos = fText->moveIndex32(tPos, -1);
				3114	}
				3115	if (fCL->contains(fText->char32At(tPos)) \|\| fCP->contains(fText->char32At(tPos))) {
				3116	setAppliedRule(pos, "LB 16 (CL \| CP) SP* x NS");
				3117	continue;
				3118	}
				3119	}
				3120
				3121
				3122	if (fB2->contains(thisChar)) {
				3123	// Scan backwards, checking for the B2 CM* SP* sequence.
				3124	tPos = prevPos;
				3125	if (fSP->contains(prevChar)) {
				3126	while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
				3127	tPos=fText->moveIndex32(tPos, -1);
				3128	}
				3129	}
				3130	while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
				3131	tPos=fText->moveIndex32(tPos, -1);
				3132	}
				3133	if (fB2->contains(fText->char32At(tPos))) {
				3134	setAppliedRule(pos, "LB 17 B2 SP* x B2");
				3135	continue;
				3136	}
				3137	}
				3138
				3139
				3140	if (fSP->contains(prevChar)) {
				3141	setAppliedRule(pos, "LB 18 break after space");
				3142	break;
				3143	}
				3144
				3145	// x QU
				3146	// QU x
				3147	if (fQU->contains(thisChar) \|\| fQU->contains(prevChar)) {
				3148	setAppliedRule(pos, "LB 19");
				3149	continue;
				3150	}
				3151
				3152	if (fCB->contains(thisChar) \|\| fCB->contains(prevChar)) {
				3153	setAppliedRule(pos, "LB 20 Break around a CB");
				3154	break;
				3155	}
				3156
				3157	// Don't break between Hyphens and letters if a break precedes the hyphen.
				3158	// Formerly this was a Finnish tailoring.
				3159	// Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
				3160	// ^($HY \| $HH) $AL;
				3161	if (fAL->contains(thisChar) && (fHY->contains(prevChar) \|\| fHH->contains(prevChar)) &&
				3162	prevPosX2 == -1) {
				3163	setAppliedRule(pos, "LB 20.09");
				3164	continue;
				3165	}
				3166
				3167	if (fBA->contains(thisChar) \|\|
				3168	fHY->contains(thisChar) \|\|
				3169	fNS->contains(thisChar) \|\|
				3170	fBB->contains(prevChar) ) {
				3171	setAppliedRule(pos, "LB 21");
				3172	continue;
				3173	}
				3174
				3175	if (fHL->contains(prevCharX2) &&
				3176	(fHY->contains(prevChar) \|\| fBA->contains(prevChar))) {
				3177	setAppliedRule(pos, "LB 21a HL (HY \| BA) x");
				3178	continue;
				3179	}
				3180
				3181	if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
				3182	setAppliedRule(pos, "LB 21b SY x HL");
				3183	continue;
				3184	}
				3185
				3186	if (fIN->contains(thisChar)) {
				3187	setAppliedRule(pos, "LB 22");
				3188	continue;
				3189	}
				3190
				3191
				3192	// (AL \| HL) x NU
				3193	// NU x (AL \| HL)
				3194	if ((fAL->contains(prevChar) \|\| fHL->contains(prevChar)) && fNU->contains(thisChar)) {
				3195	setAppliedRule(pos, "LB 23");
				3196	continue;
				3197	}
				3198	if (fNU->contains(prevChar) && (fAL->contains(thisChar) \|\| fHL->contains(thisChar))) {
				3199	setAppliedRule(pos, "LB 23");
				3200	continue;
				3201	}
				3202
				3203	// Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
				3204	// PR x (ID \| EB \| EM)
				3205	// (ID \| EB \| EM) x PO
				3206	if (fPR->contains(prevChar) &&
				3207	(fID->contains(thisChar) \|\| fEB->contains(thisChar) \|\| fEM->contains(thisChar))) {
				3208	setAppliedRule(pos, "LB 23a");
				3209	continue;
				3210	}
				3211	if ((fID->contains(prevChar) \|\| fEB->contains(prevChar) \|\| fEM->contains(prevChar)) &&
				3212	fPO->contains(thisChar)) {
				3213	setAppliedRule(pos, "LB 23a");
				3214	continue;
				3215	}
				3216
				3217	// Do not break between prefix and letters or ideographs.
				3218	// (PR \| PO) x (AL \| HL)
				3219	// (AL \| HL) x (PR \| PO)
				3220	if ((fPR->contains(prevChar) \|\| fPO->contains(prevChar)) &&
				3221	(fAL->contains(thisChar) \|\| fHL->contains(thisChar))) {
				3222	setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
				3223	continue;
				3224	}
				3225	if ((fAL->contains(prevChar) \|\| fHL->contains(prevChar)) &&
				3226	(fPR->contains(thisChar) \|\| fPO->contains(thisChar))) {
				3227	setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
				3228	continue;
				3229	}
				3230
				3231	// appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
				3232
				3233	if (fJL->contains(prevChar) && (fJL->contains(thisChar) \|\|
				3234	fJV->contains(thisChar) \|\|
				3235	fH2->contains(thisChar) \|\|
				3236	fH3->contains(thisChar))) {
				3237	setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
				3238	continue;
				3239	}
				3240
				3241	if ((fJV->contains(prevChar) \|\| fH2->contains(prevChar)) &&
				3242	(fJV->contains(thisChar) \|\| fJT->contains(thisChar))) {
				3243	setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
				3244	continue;
				3245	}
				3246
				3247	if ((fJT->contains(prevChar) \|\| fH3->contains(prevChar)) &&
				3248	fJT->contains(thisChar)) {
				3249	setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
				3250	continue;
				3251	}
				3252
				3253	if ((fJL->contains(prevChar) \|\| fJV->contains(prevChar) \|\|
				3254	fJT->contains(prevChar) \|\| fH2->contains(prevChar) \|\| fH3->contains(prevChar)) &&
				3255	fPO->contains(thisChar)) {
				3256	setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
				3257	continue;
				3258	}
				3259	if (fPR->contains(prevChar) && (fJL->contains(thisChar) \|\| fJV->contains(thisChar) \|\|
				3260	fJT->contains(thisChar) \|\| fH2->contains(thisChar) \|\| fH3->contains(thisChar))) {
				3261	setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
				3262	continue;
				3263	}
				3264
				3265
				3266
				3267	if ((fAL->contains(prevChar) \|\| fHL->contains(prevChar)) && (fAL->contains(thisChar) \|\| fHL->contains(thisChar))) {
				3268	setAppliedRule(pos, "LB 28 Do not break between alphabetics (\"at\").");
				3269	continue;
				3270	}
				3271
				3272	if (fIS->contains(prevChar) && (fAL->contains(thisChar) \|\| fHL->contains(thisChar))) {
				3273	setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").");
				3274	continue;
				3275	}
				3276
				3277	// (AL \| NU) x OP
				3278	// CP x (AL \| NU)
				3279	if ((fAL->contains(prevChar) \|\| fHL->contains(prevChar) \|\| fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
				3280	setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
				3281	continue;
				3282	}
				3283	if (fCP30->contains(prevChar) && (fAL->contains(thisChar) \|\| fHL->contains(thisChar) \|\| fNU->contains(thisChar))) {
				3284	setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
				3285	continue;
				3286	}
				3287
				3288	// RI x RI
				3289	if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
				3290	setAppliedRule(pos, "LB30a RI RI ÷ RI");
				3291	break;
				3292	}
				3293	if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
				3294	// Two Regional Indicators have been paired.
				3295	// Over-write the trailing one (thisChar) to prevent it from forming another pair with a
				3296	// following RI. This is a hack.
				3297	thisChar = -1;
				3298	setAppliedRule(pos, "LB30a RI RI ÷ RI");
				3299	continue;
				3300	}
				3301
				3302	// LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
				3303	if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
				3304	setAppliedRule(pos, "LB30b Emoji Base x Emoji Modifier");
				3305	continue;
				3306	}
				3307
				3308	if (fExtPictUnassigned->contains(prevChar) && fEM->contains(thisChar)) {
				3309	setAppliedRule(pos, "LB30b [\\p{Extended_Pictographic}&\\p{Cn}] × EM");
				3310	continue;
				3311	}
				3312
				3313	setAppliedRule(pos, "LB 31 Break everywhere else");
				3314	break;
				3315	}
				3316
				3317	return pos;
				3318	}
				3319
				3320
				3321	UVector *RBBILineMonkey::charClasses() {
				3322	return fSets;
				3323	}
				3324
				3325
				3326	RBBILineMonkey::~RBBILineMonkey() {
				3327	delete fSets;
				3328
				3329	delete fBK;
				3330	delete fCR;
				3331	delete fLF;
				3332	delete fCM;
				3333	delete fNL;
				3334	delete fWJ;
				3335	delete fZW;
				3336	delete fGL;
				3337	delete fCB;
				3338	delete fSP;
				3339	delete fB2;
				3340	delete fBA;
				3341	delete fBB;
				3342	delete fHH;
				3343	delete fHY;
				3344	delete fH2;
				3345	delete fH3;
				3346	delete fCL;
				3347	delete fCP;
				3348	delete fEX;
				3349	delete fIN;
				3350	delete fJL;
				3351	delete fJV;
				3352	delete fJT;
				3353	delete fNS;
				3354	delete fOP;
				3355	delete fQU;
				3356	delete fIS;
				3357	delete fNU;
				3358	delete fPO;
				3359	delete fPR;
				3360	delete fSY;
				3361	delete fAI;
				3362	delete fAL;
				3363	delete fCJ;
				3364	delete fHL;
				3365	delete fID;
				3366	delete fRI;
				3367	delete fSG;
				3368	delete fXX;
				3369	delete fEB;
				3370	delete fEM;
				3371	delete fZWJ;
				3372	delete fOP30;
				3373	delete fCP30;
				3374	delete fExtPictUnassigned;
				3375
				3376	delete fCharBI;
				3377	delete fNumberMatcher;
				3378	}
				3379
				3380
				3381	//-------------------------------------------------------------------------------------------
				3382	//
				3383	// TestMonkey
				3384	//
				3385	// params
				3386	// seed=nnnnn Random number starting seed.
				3387	// Setting the seed allows errors to be reproduced.
				3388	// loop=nnn Looping count. Controls running time.
				3389	// -1: run forever.
				3390	// 0 or greater: run length.
				3391	//
				3392	// type = char \| word \| line \| sent \| title
				3393	//
				3394	// Example:
				3395	// intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
				3396	//
				3397	//-------------------------------------------------------------------------------------------
				3398
				3399	static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
				3400	int32_t val = defaultVal;
				3401	name.append(" = (-?\\d+)");
				3402	UErrorCode status = U_ZERO_ERROR;
				3403	RegexMatcher m(name, params, 0, status);
				3404	if (m.find()) {
				3405	// The param exists. Convert the string to an int.
				3406	char valString[100];
				3407	int32_t paramLength = m.end(1, status) - m.start(1, status);
				3408	if (paramLength >= (int32_t)(sizeof(valString)-1)) {
				3409	paramLength = (int32_t)(sizeof(valString)-2);
				3410	}
				3411	params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
				3412	val = strtol(valString, NULL, 10);
				3413
				3414	// Delete this parameter from the params string.
				3415	m.reset();
				3416	params = m.replaceFirst("", status);
				3417	}
				3418	U_ASSERT(U_SUCCESS(status));
				3419	return val;
				3420	}
				3421	#endif
				3422
				3423	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
				3424	static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
				3425	BreakIterator *bi,
				3426	int expected[],
				3427	int expectedcount)
				3428	{
				3429	int count = 0;
				3430	int i = 0;
				3431	int forward[50];
				3432	bi->setText(ustr);
				3433	for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
				3434	forward[count] = i;
				3435	if (count < expectedcount && expected[count] != i) {
				3436	test->errln("%s:%d break forward test failed: expected %d but got %d",
				3437	__FILE__, __LINE__, expected[count], i);
				3438	break;
				3439	}
				3440	count ++;
				3441	}
				3442	if (count != expectedcount) {
				3443	printStringBreaks(ustr, expected, expectedcount);
				3444	test->errln("%s:%d break forward test failed: missed %d match",
				3445	__FILE__, __LINE__, expectedcount - count);
				3446	return;
				3447	}
				3448	// testing boundaries
				3449	for (i = 1; i < expectedcount; i ++) {
				3450	int j = expected[i - 1];
				3451	if (!bi->isBoundary(j)) {
				3452	printStringBreaks(ustr, expected, expectedcount);
				3453	test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
				3454	__FILE__, __LINE__, j);
				3455	return;
				3456	}
				3457	for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
				3458	if (bi->isBoundary(j)) {
				3459	printStringBreaks(ustr, expected, expectedcount);
				3460	test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
				3461	__FILE__, __LINE__, j);
				3462	return;
				3463	}
				3464	}
				3465	}
				3466
				3467	for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
				3468	count --;
				3469	if (forward[count] != i) {
				3470	printStringBreaks(ustr, expected, expectedcount);
				3471	test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
				3472	__FILE__, __LINE__, forward[count], i);
				3473	break;
				3474	}
				3475	}
				3476	if (count != 0) {
				3477	printStringBreaks(ustr, expected, expectedcount);
				3478	test->errln("break test previous() failed: missed a match");
				3479	return;
				3480	}
				3481
				3482	// testing preceding
				3483	for (i = 0; i < expectedcount - 1; i ++) {
				3484	// int j = expected[i] + 1;
				3485	int j = ustr.moveIndex32(expected[i], 1);
				3486	for (; j <= expected[i + 1]; j ++) {
				3487	int32_t expectedPreceding = expected[i];
				3488	int32_t actualPreceding = bi->preceding(j);
				3489	if (actualPreceding != expectedPreceding) {
				3490	printStringBreaks(ustr, expected, expectedcount);
				3491	test->errln("%s:%d preceding(%d): expected %d, got %d",
				3492	__FILE__, __LINE__, j, expectedPreceding, actualPreceding);
				3493	return;
				3494	}
				3495	}
				3496	}
				3497	}
				3498	#endif
				3499
				3500	void RBBITest::TestWordBreaks(void)
				3501	{
				3502	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
				3503
				3504	Locale locale("en");
				3505	UErrorCode status = U_ZERO_ERROR;
				3506	// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
				3507	BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
				3508	// Replaced any C+J characters in a row with a random sequence of characters
				3509	// of the same length to make our C+J segmentation not get in the way.
				3510	static const char *strlist[] =
				3511	{
				3512	"\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
				3513	"\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
				3514	"\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
				3515	"\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
				3516	"\\uac00\\u3588\\u009c\\u0953\\u194b",
				3517	"\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
				3518	"\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
				3519	"\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
				3520	"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
				3521	"\\u003b\\u024a\\u102e\\U000e0071\\u0600",
				3522	"\\u2027\\U000e0067\\u0a47\\u00b7",
				3523	"\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
				3524	"\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
				3525	"\\u0589\\U000e006e\\u0a42\\U000104a5",
				3526	"\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
				3527	"\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
				3528	"\\u0027\\u11af\\U000e0057\\u0602",
				3529	"\\U0001d7f2\\U000e007\\u0004\\u0589",
				3530	"\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
				3531	"\\U0001d7f2\\U000e007d\\u0004\\u0589",
				3532	"\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
				3533	"\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
				3534	"\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
				3535	"\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
				3536	"\\u0233\\U000e0020\\u0a69\\u0d6a",
				3537	"\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
				3538	"\\u18f4\\U000e0049\\u20e7\\u2027",
				3539	"\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
				3540	"\\ua183\\u102d\\u0bec\\u003a",
				3541	"\\u17e8\\u06e7\\u002e\\u096d\\u003b",
				3542	"\\u003a\\u0e57\\u0fad\\u002e",
				3543	"\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
				3544	"\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
				3545	"\\U000e005d\\u2044\\u0731\\u0650\\u0061",
				3546	"\\u003a\\u0664\\u00b7\\u1fba",
				3547	"\\u003b\\u0027\\u00b7\\u47a3",
				3548	"\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
				3549	"\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
				3550	"\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
				3551	};
				3552	int loop;
				3553	if (U_FAILURE(status)) {
				3554	errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
				3555	return;
				3556	}
				3557	for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
				3558	// printf("looping %d\n", loop);
				3559	UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
				3560	// RBBICharMonkey monkey;
				3561	RBBIWordMonkey monkey;
				3562
				3563	int expected[50];
				3564	int expectedcount = 0;
				3565
				3566	monkey.setText(ustr);
				3567	int i;
				3568	for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
				3569	expected[expectedcount ++] = i;
				3570	}
				3571
				3572	testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
				3573	}
				3574	delete bi;
				3575	#endif
				3576	}
				3577
				3578	void RBBITest::TestWordBoundary(void)
				3579	{
				3580	// <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
				3581	Locale locale("en");
				3582	UErrorCode status = U_ZERO_ERROR;
				3583	// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
				3584	LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
				3585	if (U_FAILURE(status)) {
				3586	errcheckln(status, "%s:%d Creation of break iterator failed %s",
				3587	__FILE__, __LINE__, u_errorName(status));
				3588	return;
				3589	}
				3590	UChar str[50];
				3591	static const char *strlist[] =
				3592	{
				3593	"\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
				3594	"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
				3595	"\\u003b\\u024a\\u102e\\U000e0071\\u0600",
				3596	"\\u2027\\U000e0067\\u0a47\\u00b7",
				3597	"\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
				3598	"\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
				3599	"\\u0589\\U000e006e\\u0a42\\U000104a5",
				3600	"\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
				3601	"\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
				3602	"\\u0027\\u11af\\U000e0057\\u0602",
				3603	"\\U0001d7f2\\U000e007\\u0004\\u0589",
				3604	"\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
				3605	"\\U0001d7f2\\U000e007d\\u0004\\u0589",
				3606	"\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
				3607	"\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
				3608	"\\U000e0065\\u302c\\u09ee\\U000e0068",
				3609	"\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
				3610	"\\u0233\\U000e0020\\u0a69\\u0d6a",
				3611	"\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
				3612	"\\u58f4\\U000e0049\\u20e7\\u2027",
				3613	"\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
				3614	"\\ua183\\u102d\\u0bec\\u003a",
				3615	"\\u17e8\\u06e7\\u002e\\u096d\\u003b",
				3616	"\\u003a\\u0e57\\u0fad\\u002e",
				3617	"\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
				3618	"\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
				3619	"\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
				3620	"\\u003a\\u0664\\u00b7\\u1fba",
				3621	"\\u003b\\u0027\\u00b7\\u47a3",
				3622	};
				3623	int loop;
				3624	for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
				3625	u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
				3626	UnicodeString ustr(str);
				3627	int forward[50];
				3628	int count = 0;
				3629
				3630	bi->setText(ustr);
				3631	int prev = -1;
				3632	for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
				3633	++count;
				3634	if (count >= UPRV_LENGTHOF(forward)) {
				3635	errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
				3636	__FILE__, __LINE__, loop, count, boundary);
				3637	return;
				3638	}
				3639	forward[count] = boundary;
				3640	if (boundary <= prev) {
				3641	errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
				3642	__FILE__, __LINE__, loop, prev, boundary);
				3643	break;
				3644	}
				3645	for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
				3646	if (bi->isBoundary(nonBoundary)) {
				3647	printStringBreaks(ustr, forward, count);
				3648	errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
				3649	__FILE__, __LINE__, loop, prev, nonBoundary, boundary);
				3650	return;
				3651	}
				3652	}
				3653	if (!bi->isBoundary(boundary)) {
				3654	printStringBreaks(ustr, forward, count);
				3655	errln("%s:%d happy boundary test failed: expected %d a boundary",
				3656	__FILE__, __LINE__, boundary);
				3657	return;
				3658	}
				3659	prev = boundary;
				3660	}
				3661	}
				3662	}
				3663
				3664	void RBBITest::TestLineBreaks(void)
				3665	{
				3666	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
				3667	Locale locale("en");
				3668	UErrorCode status = U_ZERO_ERROR;
				3669	BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
				3670	const int32_t STRSIZE = 50;
				3671	UChar str[STRSIZE];
				3672	static const char *strlist[] =
				3673	{
				3674	"\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
				3675	"\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
				3676	"U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
				3677	"\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
				3678	"u2014\\U000e0105\\u118c\\u000a\\u07f8",
				3679	"\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
				3680	"\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
				3681	"\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
				3682	"\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
				3683	"\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
				3684	"\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
				3685	"\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
				3686	"\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
				3687	"\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
				3688	"\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
				3689	"\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
				3690	"\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
				3691	"\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
				3692	"\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
				3693	"\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
				3694	"\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
				3695	"\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
				3696	"\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
				3697	"\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
				3698	"\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
				3699	"\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
				3700	"\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
				3701	"\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
				3702	"\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
				3703	"\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
				3704	"\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
				3705	"\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
				3706	"\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
				3707	"\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
				3708	"\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
				3709	"\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
				3710	"\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
				3711	"\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
				3712	"\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
				3713	"\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
				3714	};
				3715	int loop;
				3716	TEST_ASSERT_SUCCESS(status);
				3717	if (U_FAILURE(status)) {
				3718	return;
				3719	}
				3720	for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
				3721	// printf("looping %d\n", loop);
				3722	int32_t t = u_unescape(strlist[loop], str, STRSIZE);
				3723	if (t >= STRSIZE) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3724	TEST_ASSERT(false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3725	continue;
				3726	}
				3727
				3728
				3729	UnicodeString ustr(str);
				3730	RBBILineMonkey monkey;
				3731	if (U_FAILURE(monkey.deferredStatus)) {
				3732	continue;
				3733	}
				3734
				3735	const int EXPECTEDSIZE = 50;
				3736	int expected[EXPECTEDSIZE];
				3737	int expectedcount = 0;
				3738
				3739	monkey.setText(ustr);
				3740
				3741	int i;
				3742	for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
				3743	if (expectedcount >= EXPECTEDSIZE) {
				3744	TEST_ASSERT(expectedcount < EXPECTEDSIZE);
				3745	return;
				3746	}
				3747	expected[expectedcount ++] = i;
				3748	}
				3749
				3750	testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
				3751	}
				3752	delete bi;
				3753	#endif
				3754	}
				3755
				3756	void RBBITest::TestSentBreaks(void)
				3757	{
				3758	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
				3759	Locale locale("en");
				3760	UErrorCode status = U_ZERO_ERROR;
				3761	BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
				3762	UChar str[200];
				3763	static const char *strlist[] =
				3764	{
				3765	"Now\ris\nthe\r\ntime\n\rfor\r\r",
				3766	"This\n",
				3767	"Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
				3768	"\"Sentence ending with a quote.\" Bye.",
				3769	" (This is it). Testing the sentence iterator. \"This isn't it.\"",
				3770	"Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
				3771	"Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
				3772	"Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
				3773	"Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
				3774	"Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
				3775	"\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
				3776	"\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
				3777	"\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
				3778	"\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
				3779	"\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
				3780	"\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
				3781	"\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
				3782	"\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
				3783	"\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
				3784	"\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
				3785	};
				3786	int loop;
				3787	if (U_FAILURE(status)) {
				3788	errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
				3789	return;
				3790	}
				3791	for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
				3792	u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
				3793	UnicodeString ustr(str);
				3794
				3795	RBBISentMonkey monkey;
				3796	if (U_FAILURE(monkey.deferredStatus)) {
				3797	continue;
				3798	}
				3799
				3800	const int EXPECTEDSIZE = 50;
				3801	int expected[EXPECTEDSIZE];
				3802	int expectedcount = 0;
				3803
				3804	monkey.setText(ustr);
				3805
				3806	int i;
				3807	for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
				3808	if (expectedcount >= EXPECTEDSIZE) {
				3809	TEST_ASSERT(expectedcount < EXPECTEDSIZE);
				3810	return;
				3811	}
				3812	expected[expectedcount ++] = i;
				3813	}
				3814
				3815	testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
				3816	}
				3817	delete bi;
				3818	#endif
				3819	}
				3820
				3821	void RBBITest::TestMonkey() {
				3822	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
				3823
				3824	UErrorCode status = U_ZERO_ERROR;
				3825	int32_t loopCount = 500;
				3826	int32_t seed = 1;
				3827	UnicodeString breakType = "all";
				3828	Locale locale("en");
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3829	UBool useUText = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3830
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3831	if (quick == false) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3832	loopCount = 10000;
				3833	}
				3834
				3835	if (fTestParams) {
				3836	UnicodeString p(fTestParams);
				3837	loopCount = getIntParam("loop", p, loopCount);
				3838	seed = getIntParam("seed", p, seed);
				3839
				3840	RegexMatcher m(" type = (char\|word\|line\|sent\|title) ", p, 0, status);
				3841	if (m.find()) {
				3842	breakType = m.group(1, status);
				3843	m.reset();
				3844	p = m.replaceFirst("", status);
				3845	}
				3846
				3847	RegexMatcher u(" *utext", p, 0, status);
				3848	if (u.find()) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3849	useUText = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3850	u.reset();
				3851	p = u.replaceFirst("", status);
				3852	}
				3853
				3854
				3855	// m.reset(p);
				3856	if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
				3857	// Each option is stripped out of the option string as it is processed.
				3858	// All options have been checked. The option string should have been completely emptied..
				3859	char buf[100];
				3860	p.extract(buf, sizeof(buf), NULL, status);
				3861	buf[sizeof(buf)-1] = 0;
				3862	errln("Unrecognized or extra parameter: %s\n", buf);
				3863	return;
				3864	}
				3865
				3866	}
				3867
				3868	if (breakType == "char" \|\| breakType == "all") {
				3869	RBBICharMonkey m;
				3870	BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
				3871	if (U_SUCCESS(status)) {
				3872	RunMonkey(bi, m, "char", seed, loopCount, useUText);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3873	if (breakType == "all" && useUText==false) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3874	// Also run a quick test with UText when "all" is specified
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3875	RunMonkey(bi, m, "char", seed, loopCount, true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3876	}
				3877	}
				3878	else {
				3879	errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
				3880	}
				3881	delete bi;
				3882	}
				3883
				3884	if (breakType == "word" \|\| breakType == "all") {
				3885	logln("Word Break Monkey Test");
				3886	RBBIWordMonkey m;
				3887	BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
				3888	if (U_SUCCESS(status)) {
				3889	RunMonkey(bi, m, "word", seed, loopCount, useUText);
				3890	}
				3891	else {
				3892	errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
				3893	}
				3894	delete bi;
				3895	}
				3896
				3897	if (breakType == "line" \|\| breakType == "all") {
				3898	logln("Line Break Monkey Test");
				3899	RBBILineMonkey m;
				3900	BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
				3901	if (loopCount >= 10) {
				3902	loopCount = loopCount / 5; // Line break runs slower than the others.
				3903	}
				3904	if (U_SUCCESS(status)) {
				3905	RunMonkey(bi, m, "line", seed, loopCount, useUText);
				3906	}
				3907	else {
				3908	errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
				3909	}
				3910	delete bi;
				3911	}
				3912
				3913	if (breakType == "sent" \|\| breakType == "all" ) {
				3914	logln("Sentence Break Monkey Test");
				3915	RBBISentMonkey m;
				3916	BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
				3917	if (loopCount >= 10) {
				3918	loopCount = loopCount / 10; // Sentence runs slower than the other break types
				3919	}
				3920	if (U_SUCCESS(status)) {
				3921	RunMonkey(bi, m, "sent", seed, loopCount, useUText);
				3922	}
				3923	else {
				3924	errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
				3925	}
				3926	delete bi;
				3927	}
				3928
				3929	#endif
				3930	}
				3931
				3932	//
				3933	// Run a RBBI monkey test. Common routine, for all break iterator types.
				3934	// Parameters:
				3935	// bi - the break iterator to use
				3936	// mk - MonkeyKind, abstraction for obtaining expected results
				3937	// name - Name of test (char, word, etc.) for use in error messages
				3938	// seed - Seed for starting random number generator (parameter from user)
				3939	// numIterations
				3940	//
				3941	void RBBITest::RunMonkey(BreakIterator bi, RBBIMonkeyKind &mk, const char name, uint32_t seed,
				3942	int32_t numIterations, UBool useUText) {
				3943
				3944	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
				3945
				3946	const int32_t TESTSTRINGLEN = 500;
				3947	UnicodeString testText;
				3948	int32_t numCharClasses;
				3949	UVector *chClasses;
				3950	int expectedCount = 0;
				3951	char expectedBreaks[TESTSTRINGLEN*2 + 1];
				3952	char forwardBreaks[TESTSTRINGLEN*2 + 1];
				3953	char reverseBreaks[TESTSTRINGLEN*2+1];
				3954	char isBoundaryBreaks[TESTSTRINGLEN*2+1];
				3955	char followingBreaks[TESTSTRINGLEN*2+1];
				3956	char precedingBreaks[TESTSTRINGLEN*2+1];
				3957	int i;
				3958	int loopCount = 0;
				3959
				3960
				3961	m_seed = seed;
				3962
				3963	numCharClasses = mk.charClasses()->size();
				3964	chClasses = mk.charClasses();
				3965
				3966	// Check for errors that occurred during the construction of the MonkeyKind object.
				3967	// Can't report them where they occurred because errln() is a method coming from intlTest,
				3968	// and is not visible outside of RBBITest :-(
				3969	if (U_FAILURE(mk.deferredStatus)) {
				3970	errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
				3971	return;
				3972	}
				3973
				3974	// Verify that the character classes all have at least one member.
				3975	for (i=0; i<numCharClasses; i++) {
				3976	UnicodeSet s = (UnicodeSet )chClasses->elementAt(i);
				3977	if (s == NULL \|\| s->size() == 0) {
				3978	errln("Character Class #%d is null or of zero size.", i);
				3979	return;
				3980	}
				3981	}
				3982
				3983	// For minimizing width of class name output.
				3984	int classNameSize = mk.maxClassNameSize();
				3985
				3986	while (loopCount < numIterations \|\| numIterations == -1) {
				3987	if (numIterations == -1 && loopCount % 10 == 0) {
				3988	// If test is running in an infinite loop, display a periodic tic so
				3989	// we can tell that it is making progress.
				3990	fprintf(stderr, ".");
				3991	}
				3992	// Save current random number seed, so that we can recreate the random numbers
				3993	// for this loop iteration in event of an error.
				3994	seed = m_seed;
				3995
				3996	// Populate a test string with data.
				3997	testText.truncate(0);
				3998	for (i=0; i<TESTSTRINGLEN; i++) {
				3999	int32_t aClassNum = m_rand() % numCharClasses;
				4000	UnicodeSet classSet = (UnicodeSet )chClasses->elementAt(aClassNum);
				4001	int32_t charIdx = m_rand() % classSet->size();
				4002	UChar32 c = classSet->charAt(charIdx);
				4003	if (c < 0) { // TODO: deal with sets containing strings.
				4004	errln("%s:%d c < 0", __FILE__, __LINE__);
				4005	break;
				4006	}
				4007	// Do not assemble a supplementary character from randomly generated separate surrogates.
				4008	// (It could be a dictionary character)
				4009	if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
				4010	continue;
				4011	}
				4012
				4013	testText.append(c);
				4014	}
				4015
				4016	// Calculate the expected results for this test string and reset applied rules.
				4017	mk.setText(testText);
				4018
				4019	memset(expectedBreaks, 0, sizeof(expectedBreaks));
				4020	expectedBreaks[0] = 1;
				4021	int32_t breakPos = 0;
				4022	expectedCount = 0;
				4023	for (;;) {
				4024	breakPos = mk.next(breakPos);
				4025	if (breakPos == -1) {
				4026	break;
				4027	}
				4028	if (breakPos > testText.length()) {
				4029	errln("breakPos > testText.length()");
				4030	}
				4031	expectedBreaks[breakPos] = 1;
				4032	expectedCount++;
				4033	U_ASSERT(expectedCount<testText.length());
				4034	}
				4035
				4036	// Find the break positions using forward iteration
				4037	memset(forwardBreaks, 0, sizeof(forwardBreaks));
				4038	if (useUText) {
				4039	UErrorCode status = U_ZERO_ERROR;
				4040	UText *testUText = utext_openReplaceable(NULL, &testText, &status);
				4041	// testUText = utext_openUnicodeString(testUText, &testText, &status);
				4042	bi->setText(testUText, status);
				4043	TEST_ASSERT_SUCCESS(status);
				4044	utext_close(testUText); // The break iterator does a shallow clone of the UText
				4045	// This UText can be closed immediately, so long as the
				4046	// testText string continues to exist.
				4047	} else {
				4048	bi->setText(testText);
				4049	}
				4050
				4051	for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
				4052	if (i < 0 \|\| i > testText.length()) {
				4053	errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
				4054	break;
				4055	}
				4056	forwardBreaks[i] = 1;
				4057	}
				4058
				4059	// Find the break positions using reverse iteration
				4060	memset(reverseBreaks, 0, sizeof(reverseBreaks));
				4061	for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
				4062	if (i < 0 \|\| i > testText.length()) {
				4063	errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
				4064	break;
				4065	}
				4066	reverseBreaks[i] = 1;
				4067	}
				4068
				4069	// Find the break positions using isBoundary() tests.
				4070	memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
				4071	U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
				4072	for (i=0; i<=testText.length(); i++) {
				4073	isBoundaryBreaks[i] = bi->isBoundary(i);
				4074	}
				4075
				4076
				4077	// Find the break positions using the following() function.
				4078	// printf(".");
				4079	memset(followingBreaks, 0, sizeof(followingBreaks));
				4080	int32_t lastBreakPos = 0;
				4081	followingBreaks[0] = 1;
				4082	for (i=0; i<testText.length(); i++) {
				4083	breakPos = bi->following(i);
				4084	if (breakPos <= i \|\|
				4085	breakPos < lastBreakPos \|\|
				4086	breakPos > testText.length() \|\|
				4087	(breakPos > lastBreakPos && lastBreakPos > i)) {
				4088	errln("%s break monkey test: "
				4089	"Out of range value returned by BreakIterator::following().\n"
				4090	"Random seed=%d index=%d; following returned %d; lastbreak=%d",
				4091	name, seed, i, breakPos, lastBreakPos);
				4092	break;
				4093	}
				4094	followingBreaks[breakPos] = 1;
				4095	lastBreakPos = breakPos;
				4096	}
				4097
				4098	// Find the break positions using the preceding() function.
				4099	memset(precedingBreaks, 0, sizeof(precedingBreaks));
				4100	lastBreakPos = testText.length();
				4101	precedingBreaks[testText.length()] = 1;
				4102	for (i=testText.length(); i>0; i--) {
				4103	breakPos = bi->preceding(i);
				4104	if (breakPos >= i \|\|
				4105	breakPos > lastBreakPos \|\|
				4106	(breakPos < 0 && testText.getChar32Start(i)>0) \|\|
				4107	(breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
				4108	errln("%s break monkey test: "
				4109	"Out of range value returned by BreakIterator::preceding().\n"
				4110	"index=%d; prev returned %d; lastBreak=%d" ,
				4111	name, i, breakPos, lastBreakPos);
				4112	if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
				4113	precedingBreaks[i] = 2; // Forces an error.
				4114	}
				4115	} else {
				4116	if (breakPos >= 0) {
				4117	precedingBreaks[breakPos] = 1;
				4118	}
				4119	lastBreakPos = breakPos;
				4120	}
				4121	}
				4122
				4123	// Compare the expected and actual results.
				4124	for (i=0; i<=testText.length(); i++) {
				4125	const char *errorType = NULL;
				4126	const char* currentBreakData = NULL;
				4127	if (forwardBreaks[i] != expectedBreaks[i]) {
				4128	errorType = "next()";
				4129	currentBreakData = forwardBreaks;
				4130	} else if (reverseBreaks[i] != forwardBreaks[i]) {
				4131	errorType = "previous()";
				4132	currentBreakData = reverseBreaks;
				4133	} else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
				4134	errorType = "isBoundary()";
				4135	currentBreakData = isBoundaryBreaks;
				4136	} else if (followingBreaks[i] != expectedBreaks[i]) {
				4137	errorType = "following()";
				4138	currentBreakData = followingBreaks;
				4139	} else if (precedingBreaks[i] != expectedBreaks[i]) {
				4140	errorType = "preceding()";
				4141	currentBreakData = precedingBreaks;
				4142	}
				4143
				4144	if (errorType != NULL) {
				4145	// Format a range of the test text that includes the failure as
				4146	// a data item that can be included in the rbbi test data file.
				4147
				4148	// Start of the range is the last point where expected and actual results
				4149	// both agreed that there was a break position.
				4150
				4151	int startContext = i;
				4152	int32_t count = 0;
				4153	for (;;) {
				4154	if (startContext==0) { break; }
				4155	startContext --;
				4156	if (expectedBreaks[startContext] != 0) {
				4157	if (count == 2) break;
				4158	count ++;
				4159	}
				4160	}
				4161
				4162	// End of range is two expected breaks past the start position.
				4163	int endContext = i + 1;
				4164	int ci;
				4165	for (ci=0; ci<2; ci++) { // Number of items to include in error text.
				4166	for (;;) {
				4167	if (endContext >= testText.length()) {break;}
				4168	if (expectedBreaks[endContext-1] != 0) {
				4169	if (count == 0) break;
				4170	count --;
				4171	}
				4172	endContext ++;
				4173	}
				4174	}
				4175
				4176	// Formatting of each line includes:
				4177	// character code
				4178	// reference break: '\|' -> a break, '.' -> no break
				4179	// actual break: '\|' -> a break, '.' -> no break
				4180	// (name of character clase)
				4181	// Unicode name of character
				4182	// '-->' indicates location of the difference.
				4183
				4184	MONKEY_ERROR(
				4185	(expectedBreaks[i] ? "Break expected but not found" :
				4186	"Break found but not expected"),
				4187	name, i, seed);
				4188
				4189	for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
				4190	UChar32 c;
				4191	c = testText.char32At(ci);
				4192
				4193	std::string currentLineFlag = " ";
				4194	if (ci == i) {
				4195	currentLineFlag = "-->"; // Error position
				4196	}
				4197
				4198	// BMP or SMP character in hex
				4199	char hexCodePoint[12];
				4200	std::string format = " \\u%04x";
				4201	if (c >= 0x10000) {
				4202	format = "\\U%08x";
				4203	}
				4204	sprintf(hexCodePoint, format.c_str(), c);
				4205
				4206	// Get the class name and character name for the character.
				4207	char cName[200];
				4208	UErrorCode status = U_ZERO_ERROR;
				4209	u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
				4210
				4211	char buffer[200];
				4212	auto ret = snprintf(buffer, UPRV_LENGTHOF(buffer),
				4213	"%4s %3i : %1s %1s %10s %-*s %-40s %-40s",
				4214	currentLineFlag.c_str(),
				4215	ci,
				4216	expectedBreaks[ci] == 0 ? "." : "\|", // Reference break
				4217	currentBreakData[ci] == 0 ? "." : "\|", // Actual break
				4218	hexCodePoint,
				4219	classNameSize,
				4220	mk.classNameFromCodepoint(c).c_str(),
				4221	mk.getAppliedRule(ci).c_str(), cName);
				4222	(void)ret;
				4223	U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
				4224
				4225	// Output the error
				4226	if (ci == i) {
				4227	errln(buffer);
				4228	} else {
				4229	infoln(buffer);
				4230	}
				4231
				4232	if (ci >= endContext) { break; }
				4233	}
				4234	break;
				4235	}
				4236	}
				4237
				4238	loopCount++;
				4239	}
				4240	#endif
				4241	}
				4242
				4243
				4244	// Bug 5532. UTF-8 based UText fails in dictionary code.
				4245	// This test checks the initial patch,
				4246	// which is to just keep it from crashing. Correct word boundaries
				4247	// await a proper fix to the dictionary code.
				4248	//
				4249	void RBBITest::TestBug5532(void) {
				4250	// Text includes a mixture of Thai and Latin.
				4251	const unsigned char utf8Data[] = {
				4252	0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
				4253	0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
				4254	0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
				4255	0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
				4256	0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
				4257	0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
				4258	0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
				4259	0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
				4260	0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
				4261	0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
				4262	0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
				4263
				4264	UErrorCode status = U_ZERO_ERROR;
				4265	UText utext=UTEXT_INITIALIZER;
				4266	utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
				4267	TEST_ASSERT_SUCCESS(status);
				4268
				4269	BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
				4270	TEST_ASSERT_SUCCESS(status);
				4271	if (U_SUCCESS(status)) {
				4272	bi->setText(&utext, status);
				4273	TEST_ASSERT_SUCCESS(status);
				4274
				4275	int32_t breakCount = 0;
				4276	int32_t previousBreak = -1;
				4277	for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
				4278	// For now, just make sure that the break iterator doesn't hang.
				4279	TEST_ASSERT(previousBreak < bi->current());
				4280	previousBreak = bi->current();
				4281	}
				4282	TEST_ASSERT(breakCount > 0);
				4283	}
				4284	delete bi;
				4285	utext_close(&utext);
				4286	}
				4287
				4288
				4289	void RBBITest::TestBug9983(void) {
				4290	UnicodeString text = UnicodeString("\\u002A" // * Other
				4291	"\\uFF65" // Other
				4292	"\\u309C" // Katakana
				4293	"\\uFF9F" // Extend
				4294	"\\uFF65" // Other
				4295	"\\u0020" // Other
				4296	"\\u0000").unescape();
				4297
				4298	UErrorCode status = U_ZERO_ERROR;
				4299	LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
				4300	BreakIterator::createWordInstance(Locale::getRoot(), status)));
				4301	TEST_ASSERT_SUCCESS(status);
				4302	LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
				4303	BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
				4304	TEST_ASSERT_SUCCESS(status);
				4305	if (U_FAILURE(status)) {
				4306	return;
				4307	}
				4308	int32_t offset, rstatus, iterationCount;
				4309
				4310	brkiter->setText(text);
				4311	brkiter->last();
				4312	iterationCount = 0;
				4313	while ( (offset = brkiter->previous()) != UBRK_DONE ) {
				4314	iterationCount++;
				4315	rstatus = brkiter->getRuleStatus();
				4316	(void)rstatus; // Suppress set but not used warning.
				4317	if (iterationCount >= 10) {
				4318	break;
				4319	}
				4320	}
				4321	TEST_ASSERT(iterationCount == 6);
				4322
				4323	brkiterPOSIX->setText(text);
				4324	brkiterPOSIX->last();
				4325	iterationCount = 0;
				4326	while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
				4327	iterationCount++;
				4328	rstatus = brkiterPOSIX->getRuleStatus();
				4329	(void)rstatus; // Suppress set but not used warning.
				4330	if (iterationCount >= 10) {
				4331	break;
				4332	}
				4333	}
				4334	TEST_ASSERT(iterationCount == 6);
				4335	}
				4336
				4337	// Bug 7547 - verify that building a break itereator from empty rules produces an error.
				4338	//
				4339	void RBBITest::TestBug7547() {
				4340	UnicodeString rules;
				4341	UErrorCode status = U_ZERO_ERROR;
				4342	UParseError parseError;
				4343	RuleBasedBreakIterator breakIterator(rules, parseError, status);
				4344	if (status != U_BRK_RULE_SYNTAX) {
				4345	errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
				4346	}
				4347	if (parseError.line != 1 \|\| parseError.offset != 0) {
				4348	errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
				4349	}
				4350	}
				4351
				4352
				4353	void RBBITest::TestBug12797() {
				4354	UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
				4355	UErrorCode status = U_ZERO_ERROR;
				4356	UParseError parseError;
				4357	RuleBasedBreakIterator bi(rules, parseError, status);
				4358	if (U_FAILURE(status)) {
				4359	errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
				4360	return;
				4361	}
				4362	UnicodeString text = "abc";
				4363	bi.setText(text);
				4364	bi.first();
				4365	int32_t boundary = bi.next();
				4366	if (boundary != 3) {
				4367	errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
				4368	}
				4369	}
				4370
				4371	void RBBITest::TestBug12918() {
				4372	// This test triggers an assertion failure in dictbe.cpp
				4373	const UChar *crasherString = u"\u3325\u4a16";
				4374	UErrorCode status = U_ZERO_ERROR;
				4375	UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
				4376	if (U_FAILURE(status)) {
				4377	dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
				4378	return;
				4379	}
				4380	ubrk_first(iter);
				4381	int32_t pos = 0;
				4382	int32_t lastPos = -1;
				4383	while((pos = ubrk_next(iter)) != UBRK_DONE) {
				4384	if (pos <= lastPos) {
				4385	errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
				4386	break;
				4387	}
				4388	}
				4389	ubrk_close(iter);
				4390	}
				4391
				4392	void RBBITest::TestBug12932() {
				4393	// Node Stack overflow in the RBBI rule parser caused a seg fault.
				4394	UnicodeString ruleStr(
				4395	"((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
				4396	"((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
				4397	"(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
				4398	")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
				4399	")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
				4400	")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
				4401
				4402	UErrorCode status = U_ZERO_ERROR;
				4403	UParseError parseError;
				4404	RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
				4405	if (status != U_BRK_RULE_SYNTAX) {
				4406	errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
				4407	__FILE__, __LINE__, u_errorName(status));
				4408	}
				4409	}
				4410
				4411
				4412	// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
				4413	// remain undevided by ICU char, word and line break.
				4414	void RBBITest::TestEmoji() {
				4415	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
				4416	UErrorCode status = U_ZERO_ERROR;
				4417
				4418	CharString testFileName;
				4419	testFileName.append(IntlTest::getSourceTestData(status), status);
				4420	testFileName.appendPathPart("emoji-test.txt", status);
				4421	if (U_FAILURE(status)) {
				4422	errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
				4423	return;
				4424	}
				4425	logln("Opening data file %s\n", testFileName.data());
				4426
				4427	int len;
				4428	UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
				4429	if (U_FAILURE(status) \|\| testFile == NULL) {
				4430	errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
				4431	return;
				4432	}
				4433	UnicodeString testFileAsString(testFile, len);
				4434	delete [] testFile;
				4435
				4436	RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
				4437	RegexMatcher hexMatcher(u"\\s([a-f0-9])", UREGEX_CASE_INSENSITIVE, status);
				4438	// hexMatcher group(1) is a hex number, or empty string if no hex number present.
				4439	int32_t lineNumber = 0;
				4440
				4441	LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
				4442	LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
				4443	LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
				4444	if (U_FAILURE(status)) {
				4445	dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
				4446	return;
				4447	}
				4448
				4449	while (lineMatcher.find()) {
				4450	++lineNumber;
				4451	UnicodeString line = lineMatcher.group(status);
				4452	hexMatcher.reset(line);
				4453	UnicodeString testString; // accumulates the emoji sequence.
				4454	while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
				4455	UnicodeString hex = hexMatcher.group(1, status);
				4456	if (hex.length() > 8) {
				4457	errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
				4458	break;
				4459	}
				4460	CharString hex8;
				4461	hex8.appendInvariantChars(hex, status);
				4462	UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
				4463	if (c<=0x10ffff) {
				4464	testString.append(c);
				4465	} else {
				4466	errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
				4467	__FILE__, __LINE__, lineNumber, hex8.data());
				4468	break;
				4469	}
				4470	}
				4471
				4472	if (testString.length() > 1) {
				4473	charBreaks->setText(testString);
				4474	charBreaks->first();
				4475	int32_t firstBreak = charBreaks->next();
				4476	if (testString.length() != firstBreak) {
				4477	errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
				4478	__FILE__, __LINE__, lineNumber, firstBreak);
				4479	}
				4480	wordBreaks->setText(testString);
				4481	wordBreaks->first();
				4482	firstBreak = wordBreaks->next();
				4483	if (testString.length() != firstBreak) {
				4484	errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
				4485	__FILE__, __LINE__, lineNumber, firstBreak);
				4486	}
				4487	lineBreaks->setText(testString);
				4488	lineBreaks->first();
				4489	firstBreak = lineBreaks->next();
				4490	if (testString.length() != firstBreak) {
				4491	errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
				4492	__FILE__, __LINE__, lineNumber, firstBreak);
				4493	}
				4494	}
				4495	}
				4496	#endif
				4497	}
				4498
				4499
				4500	// TestBug12519 - Correct handling of Locales by assignment / copy / clone
				4501
				4502	void RBBITest::TestBug12519() {
				4503	UErrorCode status = U_ZERO_ERROR;
				4504	LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
				4505	LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
				4506	if (!assertSuccess(WHERE, status)) {
				4507	dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
				4508	return;
				4509	}
				4510	assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
				4511
				4512	assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
				4513	assertTrue(WHERE "Locales do not participate in BreakIterator equality.", biEn == biFr);
				4514
				4515	LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
				4516	assertTrue(WHERE, biEn == cloneEn);
				4517	assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
				4518
				4519	LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
				4520	assertTrue(WHERE, biFr == cloneFr);
				4521	assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
				4522
				4523	LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
				4524	UnicodeString text("Hallo Welt");
				4525	biDe->setText(text);
				4526	assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", biFr != biDe);
				4527	biDe = biFr;
				4528	assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", biFr == biDe);
				4529	}
				4530
				4531	void RBBITest::TestBug12677() {
				4532	// Check that stripping of comments from rules for getRules() is not confused by
				4533	// the presence of '#' characters in the rules that do not introduce comments.
				4534	UnicodeString rules(u"!!forward; \n"
				4535	"$x = [ab#]; # a set with a # literal. \n"
				4536	" # .; # a comment that looks sort of like a rule. \n"
				4537	" '#' '?'; # a rule with a quoted # \n"
				4538	);
				4539
				4540	UErrorCode status = U_ZERO_ERROR;
				4541	UParseError pe;
				4542	RuleBasedBreakIterator bi(rules, pe, status);
				4543	assertSuccess(WHERE, status);
				4544	UnicodeString rtRules = bi.getRules();
				4545	assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"), rtRules);
				4546	}
				4547
				4548
				4549	void RBBITest::TestTableRedundancies() {
				4550	UErrorCode status = U_ZERO_ERROR;
				4551
				4552	LocalPointer<RuleBasedBreakIterator> bi (
				4553	(RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
				4554	assertSuccess(WHERE, status);
				4555	if (U_FAILURE(status)) return;
				4556
				4557	RBBIDataWrapper *dw = bi->fData;
				4558	const RBBIStateTable *fwtbl = dw->fForwardTable;
				4559	UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
				4560	int32_t numCharClasses = dw->fHeader->fCatCount;
				4561	// printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
				4562
				4563	// Check for duplicate columns (character categories)
				4564
				4565	std::vector<UnicodeString> columns;
				4566	for (int32_t column = 0; column < numCharClasses; column++) {
				4567	UnicodeString s;
				4568	for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
				4569	RBBIStateTableRow row = (RBBIStateTableRow ) (fwtbl->fTableData + (fwtbl->fRowLen * r));
				4570	s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
				4571	}
				4572	columns.push_back(s);
				4573	}
				4574	// Ignore column (char class) 0 while checking; it's special, and may have duplicates.
				4575	for (int c1=1; c1<numCharClasses; c1++) {
				4576	int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
				4577	for (int c2 = c1+1; c2 < limit; c2++) {
				4578	if (columns.at(c1) == columns.at(c2)) {
				4579	errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
				4580	goto out;
				4581	}
				4582	}
				4583	}
				4584	out:
				4585
				4586	// Check for duplicate states
				4587	std::vector<UnicodeString> rows;
				4588	for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
				4589	UnicodeString s;
				4590	RBBIStateTableRow row = (RBBIStateTableRow ) (fwtbl->fTableData + (fwtbl->fRowLen * r));
				4591	if (in8Bits) {
				4592	s.append(row->r8.fAccepting);
				4593	s.append(row->r8.fLookAhead);
				4594	s.append(row->r8.fTagsIdx);
				4595	for (int32_t column = 0; column < numCharClasses; column++) {
				4596	s.append(row->r8.fNextState[column]);
				4597	}
				4598	} else {
				4599	s.append(row->r16.fAccepting);
				4600	s.append(row->r16.fLookAhead);
				4601	s.append(row->r16.fTagsIdx);
				4602	for (int32_t column = 0; column < numCharClasses; column++) {
				4603	s.append(row->r16.fNextState[column]);
				4604	}
				4605	}
				4606	rows.push_back(s);
				4607	}
				4608	for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
				4609	for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
				4610	if (rows.at(r1) == rows.at(r2)) {
				4611	errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
				4612	return;
				4613	}
				4614	}
				4615	}
				4616	}
				4617
				4618	// Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
				4619	// even after next() has returned DONE.
				4620
				4621	void RBBITest::TestBug13447() {
				4622	UErrorCode status = U_ZERO_ERROR;
				4623	LocalPointer<RuleBasedBreakIterator> bi(
				4624	(RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
				4625	assertSuccess(WHERE, status);
				4626	if (U_FAILURE(status)) return;
				4627	UnicodeString data(u"1234");
				4628	bi->setText(data);
				4629	assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
				4630	assertEquals(WHERE, 4, bi->next());
				4631	assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
				4632	assertEquals(WHERE, UBRK_DONE, bi->next());
				4633	assertEquals(WHERE, 4, bi->current());
				4634	assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
				4635	}
				4636
				4637	// TestReverse exercises both the synthesized safe reverse rules and the logic
				4638	// for filling the break iterator cache when starting from random positions
				4639	// in the text.
				4640	//
				4641	// It's a monkey test, working on random data, with the expected data obtained
				4642	// from forward iteration (no safe rules involved), comparing with results
				4643	// when indexing into the interior of the string (safe rules needed).
				4644
				4645	void RBBITest::TestReverse() {
				4646	UErrorCode status = U_ZERO_ERROR;
				4647
				4648	TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
				4649	BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
				4650	assertSuccess(WHERE, status, true);
				4651	status = U_ZERO_ERROR;
				4652	TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
				4653	BreakIterator::createWordInstance(Locale::getEnglish(), status)));
				4654	assertSuccess(WHERE, status, true);
				4655	status = U_ZERO_ERROR;
				4656	TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
				4657	BreakIterator::createLineInstance(Locale::getEnglish(), status)));
				4658	assertSuccess(WHERE, status, true);
				4659	status = U_ZERO_ERROR;
				4660	TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
				4661	BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
				4662	assertSuccess(WHERE, status, true);
				4663	}
				4664
				4665	void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
				4666	if (!bi) {
				4667	return;
				4668	}
				4669
				4670	// From the mapping trie in the break iterator's internal data, create a
				4671	// vector of UnicodeStrings, one for each character category, containing
				4672	// all of the code points that map to that category. Unicode planes 0 and 1 only,
				4673	// to avoid an execess of unassigned code points.
				4674
				4675	RBBIDataWrapper *data = bi->fData;
				4676	int32_t categoryCount = data->fHeader->fCatCount;
				4677	UCPTrie *trie = data->fTrie;
				4678	bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
				4679	uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
				4680
				4681	std::vector<UnicodeString> strings(categoryCount, UnicodeString());
				4682	for (int cp=0; cp<0x1fff0; ++cp) {
				4683	int cat = ucptrie_get(trie, cp);
				4684	cat &= ~dictBit; // And off the dictionary bit from the category.
				4685	assertTrue(WHERE, cat < categoryCount && cat >= 0);
				4686	if (cat < 0 \|\| cat >= categoryCount) return;
				4687	strings[cat].append(cp);
				4688	}
				4689
				4690	icu_rand randomGen;
				4691	const int testStringLength = 10000;
				4692	UnicodeString testString;
				4693
				4694	for (int i=0; i<testStringLength; ++i) {
				4695	int charClass = randomGen() % categoryCount;
				4696	if (strings[charClass].length() > 0) {
				4697	int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
				4698	testString.append(cp);
				4699	}
				4700	}
				4701
				4702	typedef std::pair<UBool, int32_t> Result;
				4703	std::vector<Result> expectedResults;
				4704	bi->setText(testString);
				4705	for (int i=0; i<testString.length(); ++i) {
				4706	bool isboundary = bi->isBoundary(i);
				4707	int ruleStatus = bi->getRuleStatus();
				4708	expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
				4709	}
				4710
				4711	for (int i=testString.length()-1; i>=0; --i) {
				4712	bi->setText(testString); // clears the internal break cache
				4713	Result expected = expectedResults[i];
				4714	assertEquals(WHERE, expected.first, bi->isBoundary(i));
				4715	assertEquals(WHERE, expected.second, bi->getRuleStatus());
				4716	}
				4717	}
				4718
				4719
				4720	// Ticket 13692 - finding word boundaries in very large numbers or words could
				4721	// be very time consuming. When the problem was present, this void test
				4722	// would run more than fifteen minutes, which is to say, the failure was noticeale.
				4723
				4724	void RBBITest::TestBug13692() {
				4725	UErrorCode status = U_ZERO_ERROR;
				4726	LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
				4727	BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
				4728	if (!assertSuccess(WHERE, status, true)) {
				4729	return;
				4730	}
				4731	constexpr int32_t LENGTH = 1000000;
				4732	UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
				4733	for (int i=0; i<20; i+=2) {
				4734	longNumber.setCharAt(i, u' ');
				4735	}
				4736	bi->setText(longNumber);
				4737	assertFalse(WHERE, bi->isBoundary(LENGTH-5));
				4738	assertSuccess(WHERE, status);
				4739	}
				4740
				4741
				4742	void RBBITest::TestProperties() {
				4743	UErrorCode errorCode = U_ZERO_ERROR;
				4744	UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
				4745	if (!prependSet.isEmpty()) {
				4746	errln(
				4747	"[:GCB=Prepend:] is not empty any more. "
				4748	"Uncomment relevant lines in source/data/brkitr/char.txt and "
				4749	"change this test to the opposite condition.");
				4750	}
				4751	}
				4752
				4753
				4754	//
				4755	// TestDebug - A place-holder test for debugging purposes.
				4756	// For putting in fragments of other tests that can be invoked
				4757	// for tracing without a lot of unwanted extra stuff happening.
				4758	//
				4759	void RBBITest::TestDebug(void) {
				4760	UErrorCode status = U_ZERO_ERROR;
				4761	LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
				4762	BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
				4763	if (!assertSuccess(WHERE, status, true)) {
				4764	return;
				4765	}
				4766	const UnicodeString &rules = bi->getRules();
				4767	UParseError pe;
				4768	LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
				4769	assertSuccess(WHERE, status);
				4770	}
				4771
				4772
				4773	//
				4774	// TestDebugRules A stub test for use in debugging rule compilation problems.
				4775	// Can be freely altered as needed or convenient.
				4776	// Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
				4777	// data files may not be available in all environments.
				4778	// Any permanent test cases should be moved to rbbitst.txt
				4779	// (see Bug 20303 in that file, for example), or to another test function in this file.
				4780	//
				4781	void RBBITest::TestDebugRules() {
				4782	#if 0
				4783	const char16_t *rules = u""
				4784	"!!quoted_literals_only; \n"
				4785	"!!chain; \n"
				4786	"!!lookAheadHardBreak; \n"
				4787	" \n"
				4788	// "[a] / ; \n"
				4789	"[a] [b] / [c] [d]; \n"
				4790	"[a] [b] / [c] [d] {100}; \n"
				4791	"[x] [a] [b] / [c] [d] {100}; \n"
				4792	"[a] [b] [c] / [d] {100}; \n"
				4793	//" [c] [d] / [e] [f]; \n"
				4794	//"[a] [b] / [c]; \n"
				4795	;
				4796
				4797	UErrorCode status = U_ZERO_ERROR;
				4798	CharString path(pathToDataDirectory(), status);
				4799	path.appendPathPart("brkitr", status);
				4800	path.appendPathPart("rules", status);
				4801	path.appendPathPart("line.txt", status);
				4802	int len;
				4803	std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
				4804	if (!assertSuccess(WHERE, status)) {
				4805	return;
				4806	}
				4807
				4808	UParseError pe;
				4809	// rules = testFile.get();
				4810	RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
				4811
				4812	if (!assertSuccess(WHERE, status)) {
				4813	delete bi;
				4814	return;
				4815	}
				4816	// bi->dumpTables();
				4817
				4818	delete bi;
				4819	#endif
				4820	}
				4821
				4822	void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
				4823	UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
				4824	int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
				4825	// Text are duplicate characters from U+4E00 to U+4FFF
				4826	UnicodeString text;
				4827	for (UChar c = 0x4e00; c < 0x5000; c++) {
				4828	text.append(c).append(c);
				4829	}
				4830	// Generate rule which will caused length+4 character classes and
				4831	// length+3 states
				4832	UnicodeString rules(u"!!quoted_literals_only;");
				4833	for (UChar c = 0x4e00; c < 0x4e00 + numChar; c++) {
				4834	rules.append(u'\'').append(c).append(c).append(u"';");
				4835	}
				4836	rules.append(u".;");
				4837	UErrorCode status = U_ZERO_ERROR;
				4838	UParseError parseError;
				4839	RuleBasedBreakIterator bi(rules, parseError, status);
				4840
				4841	assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
				4842	assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
				4843	assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
				4844	assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
				4845	assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
				4846
				4847	bi.setText(text);
				4848
				4849	int32_t pos;
				4850	int32_t i = 0;
				4851	while ((pos = bi.next()) > 0) {
				4852	// The first numChar should not break between the pair
				4853	if (i++ < numChar) {
				4854	assertEquals(WHERE, i * 2, pos);
				4855	} else {
				4856	// After the first numChar next(), break on each character.
				4857	assertEquals(WHERE, i + numChar, pos);
				4858	}
				4859	}
				4860	while ((pos = bi.previous()) > 0) {
				4861	// The first numChar should not break between the pair
				4862	if (--i < numChar) {
				4863	assertEquals(WHERE, i * 2, pos);
				4864	} else {
				4865	// After the first numChar next(), break on each character.
				4866	assertEquals(WHERE, i + numChar, pos);
				4867	}
				4868	}
				4869	}
				4870
				4871	void RBBITest::Test8BitsTrieWith8BitStateTable() {
				4872	testTrieStateTable(251, true /* expectedTrieWidthIn8Bits /, true / expectedStateRowIn8Bits */);
				4873	}
				4874
				4875	void RBBITest::Test16BitsTrieWith8BitStateTable() {
				4876	testTrieStateTable(252, false /* expectedTrieWidthIn8Bits /, true / expectedStateRowIn8Bits */);
				4877	}
				4878
				4879	void RBBITest::Test16BitsTrieWith16BitStateTable() {
				4880	testTrieStateTable(253, false /* expectedTrieWidthIn8Bits /, false / expectedStateRowIn8Bits */);
				4881	}
				4882
				4883	void RBBITest::Test8BitsTrieWith16BitStateTable() {
				4884	// Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
				4885	// create state table in 16 bits.
				4886
				4887	// Generate 510 'a' as text
				4888	UnicodeString text;
				4889	for (int32_t i = 0; i < 510; i++) {
				4890	text.append(u'a');
				4891	}
				4892
				4893	UnicodeString rules(u"!!quoted_literals_only;'");
				4894	// 254 'a' in the rule will cause 256 states
				4895	for (int32_t i = 0; i < 254; i++) {
				4896	rules.append(u'a');
				4897	}
				4898	rules.append(u"';.;");
				4899
				4900	UErrorCode status = U_ZERO_ERROR;
				4901	UParseError parseError;
				4902	LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
				4903
				4904	assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
				4905	assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
				4906	assertEquals(WHERE,
				4907	false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
				4908	bi->setText(text);
				4909
				4910	// break positions:
				4911	// 254, 508, 509, ... 510
				4912	assertEquals("next()", 254, bi->next());
				4913	int32_t i = 0;
				4914	int32_t pos;
				4915	while ((pos = bi->next()) > 0) {
				4916	assertEquals(WHERE, 508 + i , pos);
				4917	i++;
				4918	}
				4919	i = 0;
				4920	while ((pos = bi->previous()) > 0) {
				4921	i++;
				4922	if (pos >= 508) {
				4923	assertEquals(WHERE, 510 - i , pos);
				4924	} else {
				4925	assertEquals(WHERE, 254 , pos);
				4926	}
				4927	}
				4928	}
				4929
				4930	// Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
				4931	// that there are no problems with rules at the size that transitions between the two.
				4932	//
				4933	// A rule that matches a literal string, like 'abcdefghij', will require one state and
				4934	// one character class per character in the string. So we can make a rule to tickle the
				4935	// boundaries by using literal strings of various lengths.
				4936	//
				4937	// For both the number of states and the number of character classes, the eight bit format
				4938	// only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
				4939	// leaving 120 something available. This test runs the string over the range of 120 - 260,
				4940	// which allows some margin for changes to the number of values reserved by the rule builder
				4941	// without breaking the test.
				4942
				4943	void RBBITest::TestTable_8_16_Bits() {
				4944
				4945	// testStr serves as both the source of the rule string (truncated to the desired length)
				4946	// and as test data to check matching behavior. A break rule consisting of the first 120
				4947	// characters of testStr will match the first 120 chars of the full-length testStr.
				4948	UnicodeString testStr;
				4949	for (UChar c=0x3000; c<0x3200; ++c) {
				4950	testStr.append(c);
				4951	}
				4952
				4953	const int32_t startLength = 120; // The shortest rule string to test.
				4954	const int32_t endLength = 260; // The longest rule string to test
				4955	const int32_t increment = this->quick ? endLength - startLength : 1;
				4956
				4957	for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
				4958	UParseError parseError;
				4959	UErrorCode status = U_ZERO_ERROR;
				4960
				4961	UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
				4962	ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
				4963	RuleBasedBreakIterator bi(ruleString, parseError, status);
				4964	if (!assertSuccess(WHERE, status)) {
				4965	errln(ruleString);
				4966	break;
				4967	}
				4968	// bi.dumpTables();
				4969
				4970	// Verify that the break iterator is functioning - that the first boundary found
				4971	// in testStr is at the length of the rule string.
				4972	bi.setText(testStr);
				4973	assertEquals(WHERE, ruleLen, bi.next());
				4974
				4975	// Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
				4976	// of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
				4977	bi.setText(testStr);
				4978	int32_t result = bi.preceding(ruleLen);
				4979	assertEquals(WHERE, 0, result);
				4980
				4981	// Verify that the range of rule lengths being tested cover the translations
				4982	// from 8 to 16 bit data.
				4983	bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
				4984	bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
				4985
				4986	if (ruleLen == startLength) {
				4987	assertEquals(WHERE, true, has8BitRowData);
				4988	assertEquals(WHERE, true, has8BitsTrie);
				4989	}
				4990	if (ruleLen == endLength) {
				4991	assertEquals(WHERE, false, has8BitRowData);
				4992	assertEquals(WHERE, false, has8BitsTrie);
				4993	}
				4994	}
				4995	}
				4996
				4997	/* Test handling of a large number of look-ahead rules.
				4998	* The number of rules in the test exceeds the implementation limits prior to the
				4999	* improvements introduced with #13590.
				5000	*
				5001	* The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
				5002	* The text being matched is sequential, "ABCDEFGHI..."
				5003	*
				5004	* The upshot is that the look-ahead rules all match on their preceding context,
				5005	* and consequently must save a potential result, but then fail to match on their
				5006	* trailing context, so that they don't actually cause a boundary.
				5007	*
				5008	* Additionally, add a ".*" rule, so there are no boundaries unless a
				5009	* look-ahead hard-break rule forces one.
				5010	*/
				5011	void RBBITest::TestBug13590() {
				5012	UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
				5013
				5014	const int NUM_LOOKAHEAD_RULES = 50;
				5015	const char16_t STARTING_CHAR = u'\u5000';
				5016	char16_t firstChar;
				5017	for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
				5018	firstChar = STARTING_CHAR + ruleNum*2;
				5019	rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
				5020	.append(u' ') .append(u'/') .append(u' ')
				5021	.append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
				5022	.append(u';') .append(u'\n');
				5023	}
				5024
				5025	// Change the last rule added from the form "UV / WY" to "UV / WX".
				5026	// Changes the rule so that it will match - all 4 chars are in ascending sequence.
				5027	rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
				5028
				5029	UErrorCode status = U_ZERO_ERROR;
				5030	UParseError parseError;
				5031	RuleBasedBreakIterator bi(rules, parseError, status);
				5032	if (!assertSuccess(WHERE, status)) {
				5033	errln(rules);
				5034	return;
				5035	}
				5036	// bi.dumpTables();
				5037
				5038	UnicodeString testString;
				5039	for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
				5040	testString.append(c);
				5041	}
				5042	bi.setText(testString);
				5043
				5044	int breaksFound = 0;
				5045	while (bi.next() != UBRK_DONE) {
				5046	++breaksFound;
				5047	}
				5048
				5049	// Two matches are expected, one from the last rule that was explicitly modified,
				5050	// and one at the end of the text.
				5051	assertEquals(WHERE, 2, breaksFound);
				5052	}
				5053
				5054
				5055	#if U_ENABLE_TRACING
				5056	static std::vector<std::string> gData;
				5057	static std::vector<int32_t> gEntryFn;
				5058	static std::vector<int32_t> gExitFn;
				5059	static std::vector<int32_t> gDataFn;
				5060
				5061	static void U_CALLCONV traceData(
				5062	const void*,
				5063	int32_t fnNumber,
				5064	int32_t,
				5065	const char *,
				5066	va_list args) {
				5067	if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
				5068	const char* data = va_arg(args, const char*);
				5069	gDataFn.push_back(fnNumber);
				5070	gData.push_back(data);
				5071	}
				5072	}
				5073
				5074	static void traceEntry(const void *, int32_t fnNumber) {
				5075	if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
				5076	gEntryFn.push_back(fnNumber);
				5077	}
				5078	}
				5079
				5080	static void traceExit(const void , int32_t fnNumber, const char , va_list) {
				5081	if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
				5082	gExitFn.push_back(fnNumber);
				5083	}
				5084	}
				5085
				5086
				5087	void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
				5088	assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
				5089	assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
				5090	assertEquals("utrace_exit should be called ", 1, gExitFn.size());
				5091	assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
				5092
				5093	if (expectedData == nullptr) {
				5094	assertEquals("utrace_data should not be called ", 0, gDataFn.size());
				5095	assertEquals("utrace_data should not be called ", 0, gData.size());
				5096	} else {
				5097	assertEquals("utrace_data should be called ", 1, gDataFn.size());
				5098	assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
				5099	assertEquals("utrace_data should be called ", 1, gData.size());
				5100	assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
				5101	}
				5102	}
				5103
				5104	void SetupTestTrace() {
				5105	gEntryFn.clear();
				5106	gExitFn.clear();
				5107	gDataFn.clear();
				5108	gData.clear();
				5109
				5110	const void* context = nullptr;
				5111	utrace_setFunctions(context, traceEntry, traceExit, traceData);
				5112	utrace_setLevel(UTRACE_INFO);
				5113	}
				5114
				5115	void RBBITest::TestTraceCreateCharacter(void) {
				5116	SetupTestTrace();
				5117	IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
				5118	LocalPointer<BreakIterator> brkitr(
				5119	BreakIterator::createCharacterInstance("zh-CN", status));
				5120	status.errIfFailureAndReset();
				5121	assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
				5122	}
				5123
				5124	void RBBITest::TestTraceCreateTitle(void) {
				5125	SetupTestTrace();
				5126	IcuTestErrorCode status(*this, "TestTraceCreateTitle");
				5127	LocalPointer<BreakIterator> brkitr(
				5128	BreakIterator::createTitleInstance("zh-CN", status));
				5129	status.errIfFailureAndReset();
				5130	assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
				5131	}
				5132
				5133	void RBBITest::TestTraceCreateSentence(void) {
				5134	SetupTestTrace();
				5135	IcuTestErrorCode status(*this, "TestTraceCreateSentence");
				5136	LocalPointer<BreakIterator> brkitr(
				5137	BreakIterator::createSentenceInstance("zh-CN", status));
				5138	status.errIfFailureAndReset();
				5139	assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
				5140	}
				5141
				5142	void RBBITest::TestTraceCreateWord(void) {
				5143	SetupTestTrace();
				5144	IcuTestErrorCode status(*this, "TestTraceCreateWord");
				5145	LocalPointer<BreakIterator> brkitr(
				5146	BreakIterator::createWordInstance("zh-CN", status));
				5147	status.errIfFailureAndReset();
				5148	assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
				5149	}
				5150
				5151	void RBBITest::TestTraceCreateLine(void) {
				5152	SetupTestTrace();
				5153	IcuTestErrorCode status(*this, "TestTraceCreateLine");
				5154	LocalPointer<BreakIterator> brkitr(
				5155	BreakIterator::createLineInstance("zh-CN", status));
				5156	status.errIfFailureAndReset();
Frank Tang	d2858cb	2022-04-08 20:34:12 -0700	[diff] [blame]	5157	assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line");
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	5158	}
				5159
				5160	void RBBITest::TestTraceCreateLineStrict(void) {
				5161	SetupTestTrace();
				5162	IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
				5163	LocalPointer<BreakIterator> brkitr(
				5164	BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
				5165	status.errIfFailureAndReset();
Frank Tang	d2858cb	2022-04-08 20:34:12 -0700	[diff] [blame]	5166	assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict");
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	5167	}
				5168
				5169	void RBBITest::TestTraceCreateLineNormal(void) {
				5170	SetupTestTrace();
				5171	IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
				5172	LocalPointer<BreakIterator> brkitr(
				5173	BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
				5174	status.errIfFailureAndReset();
Frank Tang	d2858cb	2022-04-08 20:34:12 -0700	[diff] [blame]	5175	assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal");
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	5176	}
				5177
				5178	void RBBITest::TestTraceCreateLineLoose(void) {
				5179	SetupTestTrace();
				5180	IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
				5181	LocalPointer<BreakIterator> brkitr(
				5182	BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
				5183	status.errIfFailureAndReset();
Frank Tang	d2858cb	2022-04-08 20:34:12 -0700	[diff] [blame]	5184	assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose");
				5185	}
				5186
				5187	void RBBITest::TestTraceCreateLineLoosePhrase(void) {
				5188	SetupTestTrace();
				5189	IcuTestErrorCode status(*this, "TestTraceCreateLineLoosePhrase");
				5190	LocalPointer<BreakIterator> brkitr(
				5191	BreakIterator::createLineInstance("ja-u-lb-loose-lw-phrase", status));
				5192	status.errIfFailureAndReset();
				5193	assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose_phrase");
				5194	}
				5195
				5196	void RBBITest::TestTraceCreateLineNormalPhrase(void) {
				5197	SetupTestTrace();
				5198	IcuTestErrorCode status(*this, "TestTraceCreateLineNormalPhrase");
				5199	LocalPointer<BreakIterator> brkitr(
				5200	BreakIterator::createLineInstance("ja-u-lb-normal-lw-phrase", status));
				5201	status.errIfFailureAndReset();
				5202	assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal_phrase");
				5203	}
				5204
				5205	void RBBITest::TestTraceCreateLineStrictPhrase(void) {
				5206	SetupTestTrace();
				5207	IcuTestErrorCode status(*this, "TestTraceCreateLineStrictPhrase");
				5208	LocalPointer<BreakIterator> brkitr(
				5209	BreakIterator::createLineInstance("ja-u-lb-strict-lw-phrase", status));
				5210	status.errIfFailureAndReset();
				5211	assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict_phrase");
				5212	}
				5213
				5214	void RBBITest::TestTraceCreateLinePhrase(void) {
				5215	SetupTestTrace();
				5216	IcuTestErrorCode status(*this, "TestTraceCreateLinePhrase");
				5217	LocalPointer<BreakIterator> brkitr(
				5218	BreakIterator::createLineInstance("ja-u-lw-phrase", status));
				5219	status.errIfFailureAndReset();
				5220	assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_phrase");
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	5221	}
				5222
				5223	void RBBITest::TestTraceCreateBreakEngine(void) {
				5224	rbbi_cleanup();
				5225	SetupTestTrace();
				5226	IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
				5227	LocalPointer<BreakIterator> brkitr(
				5228	BreakIterator::createWordInstance("zh-CN", status));
				5229	status.errIfFailureAndReset();
				5230	assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
				5231
				5232	// To word break the following text, BreakIterator will create 5 dictionary
				5233	// break engine internally.
				5234	brkitr->setText(
				5235	u"test "
				5236	u"測試 " // Hani
				5237	u"សាកល្បង " // Khmr
				5238	u"ທົດສອບ " // Laoo
				5239	u"စမ်းသပ်မှု " // Mymr
				5240	u"ทดสอบ " // Thai
				5241	u"test "
				5242	);
				5243
				5244	// Loop through all the text.
				5245	while (brkitr->next() > 0) ;
				5246
				5247	assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
				5248	assertEquals("utrace_exit should be called ", 6, gExitFn.size());
				5249	assertEquals("utrace_data should be called ", 5, gDataFn.size());
				5250
				5251	for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
				5252	assertEquals("utrace_entry should be called ",
				5253	UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
				5254	assertEquals("utrace_exit should be called ",
				5255	UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
				5256	assertEquals("utrace_data should be called ",
				5257	UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
				5258	}
				5259
				5260	assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
				5261	assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
				5262	assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
				5263	assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
				5264	assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
				5265
				5266	}
				5267	#endif
				5268
				5269	void RBBITest::TestUnpairedSurrogate() {
				5270	UnicodeString rules(u"ab;");
				5271
				5272	UErrorCode status = U_ZERO_ERROR;
				5273	UParseError pe;
				5274	RuleBasedBreakIterator bi1(rules, pe, status);
				5275	assertSuccess(WHERE, status);
				5276	UnicodeString rtRules = bi1.getRules();
				5277	// make sure the simple one work first.
				5278	assertEquals(WHERE, rules, rtRules);
				5279
				5280
				5281	rules = UnicodeString(u"a\\ud800b;").unescape();
				5282	pe.line = 0;
				5283	pe.offset = 0;
				5284	RuleBasedBreakIterator bi2(rules, pe, status);
				5285	assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
				5286	if (pe.line != 1 \|\| pe.offset != 1) {
				5287	errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
				5288	}
				5289
				5290	status = U_ZERO_ERROR;
				5291	rules = UnicodeString(u"a\\ude00b;").unescape();
				5292	pe.line = 0;
				5293	pe.offset = 0;
				5294	RuleBasedBreakIterator bi3(rules, pe, status);
				5295	assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
				5296	if (pe.line != 1 \|\| pe.offset != 1) {
				5297	errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
				5298	}
				5299
				5300	// make sure the surrogate one work too.
				5301	status = U_ZERO_ERROR;
				5302	rules = UnicodeString(u"a😀b;");
				5303	RuleBasedBreakIterator bi4(rules, pe, status);
				5304	rtRules = bi4.getRules();
				5305	assertEquals(WHERE, rules, rtRules);
				5306	}
				5307
				5308	// Read file generated by
				5309	// https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
				5310	// as test cases and compare the Output.
				5311	// Format of the file
				5312	// Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')]
				5313	// Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')]
				5314	// Input:\t[source text]
				5315	// Output:\t[expected output separated by | ]
				5316	// Input: ...
				5317	// Output: ...
				5318
				5319	void RBBITest::runLSTMTestFromFile(const char* filename, UScriptCode script) {
				5320	// The expectation in this test depends on LSTM, skip the test if the
				5321	// configuration is not build with LSTM data.
				5322	if (skipLSTMTest()) {
				5323	return;
				5324	}
				5325	UErrorCode status = U_ZERO_ERROR;
				5326	LocalPointer<BreakIterator> iterator(BreakIterator::createWordInstance(Locale(), status));
				5327	if (U_FAILURE(status)) {
				5328	errln("%s:%d Error %s Cannot create Word BreakIterator", __FILE__, __LINE__, u_errorName(status));
				5329	return;
				5330	}
				5331	// Open and read the test data file.
				5332	const char *testDataDirectory = IntlTest::getSourceTestData(status);
				5333	CharString testFileName(testDataDirectory, -1, status);
				5334	testFileName.append(filename, -1, status);
				5335
				5336	int len;
				5337	UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
				5338	if (U_FAILURE(status)) {
				5339	errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename);
				5340	return;
				5341	}
				5342
				5343	// Put the test data into a UnicodeString
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	5344	UnicodeString testString(false, testFile, len);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	5345
				5346	int32_t start = 0;
				5347
				5348	UnicodeString line;
				5349	int32_t end;
				5350	std::string actual_sep_str;
				5351	int32_t caseNum = 0;
				5352	// Iterate through all the lines in the test file.
				5353	do {
				5354	int32_t cr = testString.indexOf(u'\r', start);
				5355	int32_t lf = testString.indexOf(u'\n', start);
				5356	end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf;
				5357	line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start);
				5358	if (line.length() > 0) {
				5359	// Separate each line to key and value by TAB.
				5360	int32_t tab = line.indexOf(u'\t');
				5361	UnicodeString key = line.tempSubString(0, tab);
				5362	const UnicodeString value = line.tempSubString(tab+1);
				5363
				5364	if (key == "Model:") {
				5365	// Verify the expectation in the test file match the LSTM model
				5366	// we are using now.
				5367	const LSTMData* data = CreateLSTMDataForScript(script, status);
				5368	if (U_FAILURE(status)) {
				5369	dataerrln("%s:%d Error %s Cannot create LSTM data for script %s",
				5370	__FILE__, __LINE__, u_errorName(status), uscript_getName(script));
				5371	return;
				5372	}
				5373	UnicodeString name(LSTMDataName(data));
				5374	DeleteLSTMData(data);
				5375	if (value != name) {
				5376	std::string utf8Name, utf8Value;
				5377	dataerrln("%s:%d Error %s The LSTM data for script %s is %s instead of %s",
				5378	__FILE__, __LINE__, u_errorName(status), uscript_getName(script),
				5379	name.toUTF8String<std::string>(utf8Name).c_str(),
				5380	value.toUTF8String<std::string>(utf8Value).c_str());
				5381	return;
				5382	}
				5383	} else if (key == "Input:") {
				5384	UnicodeString input("prefix ");
				5385	input += value + " suffix";
				5386	std::stringstream ss;
				5387
				5388	// Construct the UText which is expected by the the engine as
				5389	// input from the UnicodeString.
				5390	UText ut = UTEXT_INITIALIZER;
				5391	utext_openConstUnicodeString(&ut, &input, &status);
				5392	if (U_FAILURE(status)) {
				5393	dataerrln("Could not utext_openConstUnicodeString for " + value + UnicodeString(u_errorName(status)));
				5394	return;
				5395	}
				5396
				5397	iterator->setText(&ut, status);
				5398	if (U_FAILURE(status)) {
				5399	errln("%s:%d Error %s Could not setText to BreakIterator", __FILE__, __LINE__, u_errorName(status));
				5400	return;
				5401	}
				5402
				5403	int32_t bp;
				5404	for (bp = iterator->first(); bp != BreakIterator::DONE; bp = iterator->next()) {
				5405	ss << bp;
				5406	if (bp != input.length()) {
				5407	ss << ", ";
				5408	}
				5409	}
				5410
				5411	utext_close(&ut);
				5412	// Turn the break points into a string for easy comparison
				5413	// output.
				5414	actual_sep_str = "{" + ss.str() + "}";
				5415	} else if (key == "Output:" && !actual_sep_str.empty()) {
				5416	UnicodeString input("prefix\| \|");
				5417	input += value + "\| \|suffix";
				5418	std::string d;
				5419	int32_t sep;
				5420	int32_t start = 0;
				5421	int32_t curr = 0;
				5422	std::stringstream ss;
				5423	// Include 0 as the break point.
				5424	ss << "0, ";
				5425	while ((sep = input.indexOf(u'\|', start)) >= 0) {
				5426	int32_t len = sep - start;
				5427	if (len > 0) {
				5428	if (curr > 0) {
				5429	ss << ", ";
				5430	}
				5431	curr += len;
				5432	ss << curr;
				5433	}
				5434	start = sep + 1;
				5435	}
				5436	// Include end of the string as break point.
				5437	ss << ", " << curr + input.length() - start;
				5438	// Turn the break points into a string for easy comparison
				5439	// output.
				5440	std::string expected = "{" + ss.str() + "}";
				5441	std::string utf8;
				5442
				5443	assertEquals((input + " Test Case#" + caseNum).toUTF8String<std::string>(utf8).c_str(),
				5444	expected.c_str(), actual_sep_str.c_str());
				5445	actual_sep_str.clear();
				5446	}
				5447	}
				5448	start = std::max(cr, lf) + 1;
				5449	} while (end >= 0);
				5450
				5451	delete [] testFile;
				5452	}
				5453
				5454	void RBBITest::TestLSTMThai() {
				5455	runLSTMTestFromFile("Thai_graphclust_model4_heavy_Test.txt", USCRIPT_THAI);
				5456	}
				5457
				5458	void RBBITest::TestLSTMBurmese() {
				5459	runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
				5460	}
				5461
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	5462
				5463	// Test preceding(index) and following(index), with semi-random indexes.
				5464	// The random indexes are produced in clusters that are relatively closely spaced,
				5465	// to increase the occurrences of hits to the internal break cache.
				5466
				5467	void RBBITest::TestRandomAccess() {
				5468	static constexpr int32_t CACHE_SIZE = 128;
				5469
				5470	UnicodeString testData;
				5471	for (int i=0; i<CACHE_SIZE*2; ++i) {
				5472	testData.append(u"aaaa\n");
				5473	}
				5474
				5475	UErrorCode status = U_ZERO_ERROR;
				5476	LocalPointer<RuleBasedBreakIterator> bi(
				5477	(RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status),
				5478	status);
				5479	if (!assertSuccess(WHERE, status)) { return; };
				5480
				5481	bi->setText(testData);
				5482
				5483	auto expectedPreceding = [](int from) {
				5484	if (from == 0) {return UBRK_DONE;}
				5485	if (from % 5 == 0) {return from - 5;}
				5486	return from - (from % 5);
				5487	};
				5488
				5489	auto expectedFollow = [testData](int from) {
				5490	if (from >= testData.length()) {return UBRK_DONE;}
				5491	if (from % 5 == 0) {return from + 5;}
				5492	return from + (5 - (from % 5));
				5493	};
				5494
				5495	auto randomStringIndex = [testData]() {
				5496	static icu_rand randomGenerator; // produces random uint32_t values.
				5497	static int lastNum;
				5498	static int clusterCount;
				5499	static constexpr int CLUSTER_SIZE = 100;
				5500	static constexpr int CLUSTER_LENGTH = 10;
				5501
				5502	if (clusterCount < CLUSTER_LENGTH) {
				5503	++clusterCount;
				5504	lastNum += (randomGenerator() % CLUSTER_SIZE);
				5505	lastNum -= CLUSTER_SIZE / 2;
				5506	lastNum = std::max(0, lastNum);
				5507	// Deliberately test indexes > testData.length.
				5508	lastNum = std::min(testData.length() + 5, lastNum);
				5509	} else {
				5510	clusterCount = 0;
				5511	lastNum = randomGenerator() % testData.length();
				5512	}
				5513	return lastNum;
				5514	};
				5515
				5516	for (int i=0; i<5000; ++i) {
				5517	int idx = randomStringIndex();
				5518	assertEquals(WHERE, expectedFollow(idx), bi->following(idx));
				5519	idx = randomStringIndex();
				5520	assertEquals(WHERE, expectedPreceding(idx), bi->preceding(idx));
				5521	}
				5522	}
				5523
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	5524	#endif // #if !UCONFIG_NO_BREAK_ITERATION