Blame - source/test/intltest/regextst.cpp - chromium.googlesource.com/chromium/deps/icu

blob: cb8565d9339303c8b3a28e4e5b8412357d06285f [file] [log] [blame]

Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
				2	// License & terms of use: http://www.unicode.org/copyright.html
				3	/********************************************************************
				4	* COPYRIGHT:
				5	* Copyright (c) 2002-2016, International Business Machines Corporation and
				6	* others. All Rights Reserved.
				7	********************************************************************/
				8
				9	//
				10	// regextst.cpp
				11	//
				12	// ICU Regular Expressions test, part of intltest.
				13	//
				14
				15	/*
				16	NOTE!!
				17
				18	PLEASE be careful about ASCII assumptions in this test.
				19	This test is one of the worst repeat offenders.
				20	If you have questions, contact someone on the ICU PMC
				21	who has access to an EBCDIC system.
				22
				23	*/
				24
				25	#include "intltest.h"
				26	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
				27
				28	#include <stdlib.h>
				29	#include <stdio.h>
				30	#include <string.h>
				31
				32	#include "unicode/localpointer.h"
				33	#include "unicode/regex.h"
				34	#include "unicode/stringpiece.h"
				35	#include "unicode/uchar.h"
				36	#include "unicode/ucnv.h"
				37	#include "unicode/uniset.h"
				38	#include "unicode/uregex.h"
				39	#include "unicode/usetiter.h"
				40	#include "unicode/ustring.h"
				41	#include "unicode/utext.h"
				42	#include "unicode/utf16.h"
				43	#include "cstr.h"
				44	#include "regextst.h"
				45	#include "regexcmp.h"
				46	#include "uvector.h"
				47	#include "util.h"
				48	#include "cmemory.h"
				49	#include "cstring.h"
				50	#include "uinvchar.h"
				51
				52	#define SUPPORT_MUTATING_INPUT_STRING 0
				53
				54	//---------------------------------------------------------------------------
				55	//
				56	// Test class boilerplate
				57	//
				58	//---------------------------------------------------------------------------
				// Constructor. The body is intentionally empty: nothing is set up here.
				59	RegexTest::RegexTest()
				60	{
				61	}
				62
				63
				// Destructor. The body is intentionally empty: nothing is released here.
				64	RegexTest::~RegexTest()
				65	{
				66	}
				67
				68
				69
				// Test dispatcher for the RegexTest suite.
				// Every test method is listed once, via TESTCASE_AUTO, between
				// TESTCASE_AUTO_BEGIN and TESTCASE_AUTO_END; those macros consume
				// `index`, `exec` and `name`. The last parameter is not referenced
				// in this body.
				// NOTE(review): the order of the TESTCASE_AUTO lines presumably determines
				// the index assigned to each test - confirm against the macro definitions.
				70	void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /par/ )
				71	{
				// Log the suite banner only when a test is actually going to be executed.
				72	if (exec) logln("TestSuite RegexTest: ");
				73	TESTCASE_AUTO_BEGIN;
				74	TESTCASE_AUTO(Basic);
				75	TESTCASE_AUTO(API_Match);
				76	TESTCASE_AUTO(API_Replace);
				77	TESTCASE_AUTO(API_Pattern);
				// Extended is only registered when the build has file I/O enabled.
				78	#if !UCONFIG_NO_FILE_IO
				79	TESTCASE_AUTO(Extended);
				80	#endif
				81	TESTCASE_AUTO(Errors);
				82	TESTCASE_AUTO(PerlTests);
				83	TESTCASE_AUTO(Callbacks);
				84	TESTCASE_AUTO(FindProgressCallbacks);
				85	TESTCASE_AUTO(Bug6149);
				86	TESTCASE_AUTO(UTextBasic);
				87	TESTCASE_AUTO(API_Match_UTF8);
				88	TESTCASE_AUTO(API_Replace_UTF8);
				89	TESTCASE_AUTO(API_Pattern_UTF8);
				90	TESTCASE_AUTO(PerlTestsUTF8);
				91	TESTCASE_AUTO(PreAllocatedUTextCAPI);
				92	TESTCASE_AUTO(Bug7651);
				93	TESTCASE_AUTO(Bug7740);
				94	TESTCASE_AUTO(Bug8479);
				95	TESTCASE_AUTO(Bug7029);
				96	TESTCASE_AUTO(CheckInvBufSize);
				97	TESTCASE_AUTO(Bug9283);
				98	TESTCASE_AUTO(Bug10459);
				99	TESTCASE_AUTO(TestCaseInsensitiveStarters);
				100	TESTCASE_AUTO(TestBug11049);
				101	TESTCASE_AUTO(TestBug11371);
				102	TESTCASE_AUTO(TestBug11480);
				103	TESTCASE_AUTO(NamedCapture);
				104	TESTCASE_AUTO(NamedCaptureLimits);
				105	TESTCASE_AUTO(TestBug12884);
				106	TESTCASE_AUTO(TestBug13631);
				107	TESTCASE_AUTO(TestBug13632);
				108	TESTCASE_AUTO(TestBug20359);
				109	TESTCASE_AUTO(TestBug20863);
				110	TESTCASE_AUTO_END;
				111	}
				112
				113
				114	/**
				115	* Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
				116	* into ASCII. Forward declaration; the definition appears later in this file.
				*
				* @param ut     UText handed through to utext_openUTF8.
				* @param inv    Text restricted to invariant characters, in the compilation codepage.
				* @param length Length of inv; -1 means "use strlen(inv)".
				* @param status ICU error code, handed through to utext_openUTF8.
				* @return The value returned by utext_openUTF8.
				117	* @see utext_openUTF8
				118	*/
				119	static UText* regextst_openUTF8FromInvariant(UText* ut, const char inv, int64_t length, UErrorCode status);
				120
				121	//---------------------------------------------------------------------------
				122	//
				123	// Error Checking / Reporting macros used in all of the tests.
				124	//
				125	//---------------------------------------------------------------------------
				126
				127	static void utextToPrintable(char buf, int32_t bufLen, UText text) {
				128	int64_t oldIndex = utext_getNativeIndex(text);
				129	utext_setNativeIndex(text, 0);
				130	char *bufPtr = buf;
				131	UChar32 c = utext_next32From(text, 0);
				132	while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
				133	if (0x000020<=c && c<0x00007e) {
				134	*bufPtr = c;
				135	} else {
				136	#if 0
				137	sprintf(bufPtr,"U+%04X", c);
				138	bufPtr+= strlen(bufPtr)-1;
				139	#else
				140	*bufPtr = '%';
				141	#endif
				142	}
				143	bufPtr++;
				144	c = UTEXT_NEXT32(text);
				145	}
				146	*bufPtr = 0;
				147	#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
				148	char ebuf = (char)malloc(bufLen);
				149	uprv_eastrncpy((unsigned char)ebuf, (const unsigned char)buf, bufLen);
				150	uprv_strncpy(buf, ebuf, bufLen);
				151	free((void*)ebuf);
				152	#endif
				153	utext_setNativeIndex(text, oldIndex);
				154	}
				155
				156
				157	static char ASSERT_BUF[1024];
				158
				159	const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
				160	if(message.length()==0) {
				161	strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
				162	} else {
				163	UnicodeString buf;
				164	IntlTest::prettify(message,buf);
				165	if(buf.length()==0) {
				166	strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
				167	} else {
				168	buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
				169	if(ASSERT_BUF[0]==0) {
				170	ASSERT_BUF[0]=0;
				171	for(int32_t i=0;i<buf.length();i++) {
				172	UChar ch = buf[i];
				173	sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
				174	}
				175	}
				176	}
				177	}
				178	ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
				179	return ASSERT_BUF;
				180	}
				181
				// REGEX_VERBOSE_TEXT(text): logs a printable rendering of the UText `text`
				// (via utextToPrintable into a 200-byte local buffer) together with the
				// file, line and the spelling of the argument expression.
				182	#define REGEX_VERBOSE_TEXT(text) UPRV_BLOCK_MACRO_BEGIN { \
				183	char buf[200]; \
				184	utextToPrintable(buf,UPRV_LENGTHOF(buf),text); \
				185	logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
				186	} UPRV_BLOCK_MACRO_END
				187
				// REGEX_CHECK_STATUS: expects a variable named `status` in the calling scope.
				// On failure it reports through dataerrln and RETURNS from the enclosing
				// function, so any cleanup placed after the macro is skipped.
				188	#define REGEX_CHECK_STATUS UPRV_BLOCK_MACRO_BEGIN { \
				189	if (U_FAILURE(status)) { \
				190	dataerrln("%s:%d: RegexTest failure. status=%s", \
				191	__FILE__, __LINE__, u_errorName(status)); \
				192	return; \
				193	} \
				194	} UPRV_BLOCK_MACRO_END
				195
				// REGEX_ASSERT(expr): reports an error when expr is false. Does not return,
				// so the calling test keeps running after a failed assertion.
				196	#define REGEX_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	197	if ((expr)==false) { \
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	198	errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
				199	} \
				200	} UPRV_BLOCK_MACRO_END
				201
				// REGEX_ASSERT_FAIL(expr, errcode): declares its own local `status`
				// (initialised to U_ZERO_ERROR), evaluates expr, and reports when the
				// resulting status differs from errcode. expr is expected to refer to
				// `status`; the local declaration shadows any outer variable of that name.
				202	#define REGEX_ASSERT_FAIL(expr, errcode) UPRV_BLOCK_MACRO_BEGIN { \
				203	UErrorCode status=U_ZERO_ERROR; \
				204	(expr); \
				205	if (status!=errcode) { \
				206	dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
				207	__LINE__, u_errorName(errcode), u_errorName(status)); \
				208	} \
				209	} UPRV_BLOCK_MACRO_END
				210
				// REGEX_CHECK_STATUS_L(line): like REGEX_CHECK_STATUS but also prints the
				// caller-supplied line number, and does NOT return on failure.
				211	#define REGEX_CHECK_STATUS_L(line) UPRV_BLOCK_MACRO_BEGIN { \
				212	if (U_FAILURE(status)) { \
				213	errln("RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); \
				214	} \
				215	} UPRV_BLOCK_MACRO_END
				216
				// REGEX_ASSERT_L(expr, line): reports an error including the caller-supplied
				// line number when expr is false, then RETURNS from the enclosing function.
				217	#define REGEX_ASSERT_L(expr, line) UPRV_BLOCK_MACRO_BEGIN { \
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	218	if ((expr)==false) { \
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	219	errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
				220	return; \
				221	} \
				222	} UPRV_BLOCK_MACRO_END
				223
				// REGEX_ASSERT_UNISTR(expected, actual): compares a UnicodeString against an
				// invariant-character C string and reports an error on mismatch.
				224	// expected: const char * , restricted to invariant characters.
				225	// actual: const UnicodeString &
				226	#define REGEX_ASSERT_UNISTR(expected, actual) UPRV_BLOCK_MACRO_BEGIN { \
				227	if (UnicodeString(expected, -1, US_INV) != (actual)) { \
				228	errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
				229	__FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
				230	} \
				231	} UPRV_BLOCK_MACRO_END
				232
				233
				// Compares two UTexts code point by code point, starting from native index 0
				// in both. Returns true only when every code point matches and both texts
				// reach U_SENTINEL at the same step. The iteration position of both texts is
				// changed: each is left wherever the comparison stopped.
				234	static UBool testUTextEqual(UText uta, UText utb) {
				235	UChar32 ca = 0;
				236	UChar32 cb = 0;
				237	utext_setNativeIndex(uta, 0);
				238	utext_setNativeIndex(utb, 0);
				239	do {
				240	ca = utext_next32(uta);
				241	cb = utext_next32(utb);
				// First difference (including one text ending before the other) ends the loop.
				242	if (ca != cb) {
				243	break;
				244	}
				245	} while (ca != U_SENTINEL);
				// Equal here means the loop ran to the end of both texts without a mismatch.
				246	return ca == cb;
				247	}
				248
				249
				250	/**
				251	* @param expected expected text in UTF-8 (not platform) codepage
				252	*/
				253	void RegexTest::assertUText(const char expected, UText actual, const char *file, int line) {
				254	UErrorCode status = U_ZERO_ERROR;
				255	UText expectedText = UTEXT_INITIALIZER;
				256	utext_openUTF8(&expectedText, expected, -1, &status);
				257	if(U_FAILURE(status)) {
				258	errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
				259	return;
				260	}
				261	if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
				262	errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
				263	return;
				264	}
				265	utext_setNativeIndex(actual, 0);
				266	if (!testUTextEqual(&expectedText, actual)) {
				267	char buf[201 /21/];
				268	char expectedBuf[201];
				269	utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
				270	utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
				271	errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
				272	}
				273	utext_close(&expectedText);
				274	}
				275	/**
				276	* @param expected invariant (platform local text) input
				277	*/
				278
				279	void RegexTest::assertUTextInvariant(const char expected, UText actual, const char *file, int line) {
				280	UErrorCode status = U_ZERO_ERROR;
				281	UText expectedText = UTEXT_INITIALIZER;
				282	regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
				283	if(U_FAILURE(status)) {
				284	errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
				285	return;
				286	}
				287	utext_setNativeIndex(actual, 0);
				288	if (!testUTextEqual(&expectedText, actual)) {
				289	char buf[201 /21/];
				290	char expectedBuf[201];
				291	utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
				292	utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
				293	errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
				294	}
				295	utext_close(&expectedText);
				296	}
				297
				298	/**
				299	* Asserts that the UText `actual` equals `expected`, where `expected` is a
				* UTF-8 C string. Forwards to assertUText with the call site's file and line.
				300	*/
				301	#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
				302	/**
				303	* Asserts that the UText `actual` equals `expected`, where `expected` is a
				* C string of invariant characters. Forwards to assertUTextInvariant with
				* the call site's file and line.
				304	*/
				305	#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
				306
				307	/**
				308	* This buffer ( inv_buf ) is used to hold the UTF-8 strings
				309	* passed into utext_openUTF8. An error will be given if
				310	* INV_BUFSIZ is too small. It's only used on EBCDIC systems.
				* (More precisely: inv_buf is only compiled in when U_CHARSET_FAMILY is
				* not U_ASCII_FAMILY.)
				311	*/
				312
				313	#define INV_BUFSIZ 2048 /* increase this if too small */
				314
				// Running total of the bytes passed through regextst_openUTF8FromInvariant.
				// It is advanced on every charset family; on non-ASCII builds it is also the
				// write offset of the next string inside inv_buf.
				315	static int64_t inv_next=0;
				316
				317	#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
				318	static char inv_buf[INV_BUFSIZ];
				319	#endif
				320
				321	static UText* regextst_openUTF8FromInvariant(UText ut, const char inv, int64_t length, UErrorCode *status) {
				322	if(length==-1) length=strlen(inv);
				323	#if U_CHARSET_FAMILY==U_ASCII_FAMILY
				324	inv_next+=length;
				325	return utext_openUTF8(ut, inv, length, status);
				326	#else
				327	if(inv_next+length+1>INV_BUFSIZ) {
				328	fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
				329	__FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
				330	*status = U_MEMORY_ALLOCATION_ERROR;
				331	return NULL;
				332	}
				333
				334	unsigned char buf = (unsigned char)inv_buf+inv_next;
				335	uprv_aestrncpy(buf, (const uint8_t*)inv, length);
				336	inv_next+=length;
				337
				338	#if 0
				339	fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
				340	#endif
				341
				342	return utext_openUTF8(ut, (const char*)buf, length, status);
				343	#endif
				344	}
				345
				346
				347	//---------------------------------------------------------------------------
				348	//
				349	// REGEX_TESTLM Macro + invocation function to simplify writing quick tests
				350	// for the LookingAt() and Match() functions.
				351	//
				352	// usage:
				353	// REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
				354	//
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	355	// The expected results are UBool - true or false.
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	356	// The input text is unescaped. The pattern is not.
				357	//
				358	//
				359	//---------------------------------------------------------------------------
				360
				// Runs one lookingAt()/matches() expectation twice: once through
				// doRegexLMTest and once through doRegexLMTestUTF8. The return values of
				// both helpers are discarded; failures are reported by the helpers themselves.
				361	#define REGEX_TESTLM(pat, text, looking, match) UPRV_BLOCK_MACRO_BEGIN { \
				362	doRegexLMTest(pat, text, looking, match, __LINE__); \
				363	doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
				364	} UPRV_BLOCK_MACRO_END
				365
				366	UBool RegexTest::doRegexLMTest(const char pat, const char text, UBool looking, UBool match, int32_t line) {
				367	const UnicodeString pattern(pat, -1, US_INV);
				368	const UnicodeString inputText(text, -1, US_INV);
				369	UErrorCode status = U_ZERO_ERROR;
				370	UParseError pe;
				371	RegexPattern *REPattern = NULL;
				372	RegexMatcher *REMatcher = NULL;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	373	UBool retVal = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	374
				375	UnicodeString patString(pat, -1, US_INV);
				376	REPattern = RegexPattern::compile(patString, 0, pe, status);
				377	if (U_FAILURE(status)) {
				378	dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
				379	line, u_errorName(status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	380	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	381	}
				382	if (line==376) { REPattern->dumpPattern();}
				383
				384	UnicodeString inputString(inputText);
				385	UnicodeString unEscapedInput = inputString.unescape();
				386	REMatcher = REPattern->matcher(unEscapedInput, status);
				387	if (U_FAILURE(status)) {
				388	errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
				389	line, u_errorName(status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	390	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	391	}
				392
				393	UBool actualmatch;
				394	actualmatch = REMatcher->lookingAt(status);
				395	if (U_FAILURE(status)) {
				396	errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
				397	line, u_errorName(status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	398	retVal = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	399	}
				400	if (actualmatch != looking) {
				401	errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	402	retVal = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	403	}
				404
				405	status = U_ZERO_ERROR;
				406	actualmatch = REMatcher->matches(status);
				407	if (U_FAILURE(status)) {
				408	errln("RegexTest failure in matches() at line %d. Status = %s\n",
				409	line, u_errorName(status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	410	retVal = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	411	}
				412	if (actualmatch != match) {
				413	errln("RegexTest: wrong return from matches() at line %d.\n", line);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	414	retVal = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	415	}
				416
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	417	if (retVal == false) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	418	REPattern->dumpPattern();
				419	}
				420
				421	delete REPattern;
				422	delete REMatcher;
				423	return retVal;
				424	}
				425
				426
				427	UBool RegexTest::doRegexLMTestUTF8(const char pat, const char text, UBool looking, UBool match, int32_t line) {
				428	UText pattern = UTEXT_INITIALIZER;
				429	int32_t inputUTF8Length;
				430	char *textChars = NULL;
				431	UText inputText = UTEXT_INITIALIZER;
				432	UErrorCode status = U_ZERO_ERROR;
				433	UParseError pe;
				434	RegexPattern *REPattern = NULL;
				435	RegexMatcher *REMatcher = NULL;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	436	UBool retVal = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	437
				438	regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
				439	REPattern = RegexPattern::compile(&pattern, 0, pe, status);
				440	if (U_FAILURE(status)) {
				441	dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
				442	line, u_errorName(status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	443	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	444	}
				445
				446	UnicodeString inputString(text, -1, US_INV);
				447	UnicodeString unEscapedInput = inputString.unescape();
				448	LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
				449	ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
				450
				451	inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
				452	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
				453	// UTF-8 does not allow unpaired surrogates, so this could actually happen
				454	logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	455	return true; // not a failure of the Regex engine
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	456	}
				457	status = U_ZERO_ERROR; // buffer overflow
				458	textChars = new char[inputUTF8Length+1];
				459	unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
				460	utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
				461
				462	REMatcher = &REPattern->matcher(status)->reset(&inputText);
				463	if (U_FAILURE(status)) {
				464	errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
				465	line, u_errorName(status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	466	return false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	467	}
				468
				469	UBool actualmatch;
				470	actualmatch = REMatcher->lookingAt(status);
				471	if (U_FAILURE(status)) {
				472	errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
				473	line, u_errorName(status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	474	retVal = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	475	}
				476	if (actualmatch != looking) {
				477	errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	478	retVal = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	479	}
				480
				481	status = U_ZERO_ERROR;
				482	actualmatch = REMatcher->matches(status);
				483	if (U_FAILURE(status)) {
				484	errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
				485	line, u_errorName(status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	486	retVal = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	487	}
				488	if (actualmatch != match) {
				489	errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	490	retVal = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	491	}
				492
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	493	if (retVal == false) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	494	REPattern->dumpPattern();
				495	}
				496
				497	delete REPattern;
				498	delete REMatcher;
				499	utext_close(&inputText);
				500	utext_close(&pattern);
				501	delete[] textChars;
				502	return retVal;
				503	}
				504
				505
				506
				507	//---------------------------------------------------------------------------
				508	//
				509	// REGEX_ERR Macro + invocation function to simplify writing tests
				510	// regex tests for incorrect patterns
				511	//
				512	// usage:
				513	// REGEX_ERR("pattern", expected error line, column, expected status);
				514	//
				515	//---------------------------------------------------------------------------
				516	#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__)
				517
				518	void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
				519	UErrorCode expectedStatus, int32_t line) {
				520	UnicodeString pattern(pat);
				521
				522	UErrorCode status = U_ZERO_ERROR;
				523	UParseError pe;
				524	RegexPattern *callerPattern = NULL;
				525
				526	//
				527	// Compile the caller's pattern
				528	//
				529	UnicodeString patString(pat);
				530	callerPattern = RegexPattern::compile(patString, 0, pe, status);
				531	if (status != expectedStatus) {
				532	dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
				533	} else {
				534	if (status != U_ZERO_ERROR) {
				535	if (pe.line != errLine \|\| pe.offset != errCol) {
				536	errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
				537	line, errLine, errCol, pe.line, pe.offset);
				538	}
				539	}
				540	}
				541
				542	delete callerPattern;
				543
				544	//
				545	// Compile again, using a UTF-8-based UText
				546	//
				547	UText patternText = UTEXT_INITIALIZER;
				548	regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
				549	callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
				550	if (status != expectedStatus) {
				551	dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
				552	} else {
				553	if (status != U_ZERO_ERROR) {
				554	if (pe.line != errLine \|\| pe.offset != errCol) {
				555	errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
				556	line, errLine, errCol, pe.line, pe.offset);
				557	}
				558	}
				559	}
				560
				561	delete callerPattern;
				562	utext_close(&patternText);
				563	}
				564
				565
				566
				567	//---------------------------------------------------------------------------
				568	//
				569	// Basic Check for basic functionality of regex pattern matching.
				570	// Avoid the use of REGEX_FIND test macro, which has
				571	// substantial dependencies on basic Regex functionality.
				572	//
				573	//---------------------------------------------------------------------------
				// Smoke test for core pattern matching. Every REGEX_TESTLM line has the form
				//   (pattern, input text, expected lookingAt() result, expected matches() result)
				// and is executed by both doRegexLMTest and doRegexLMTestUTF8.
				574	void RegexTest::Basic() {
				575
				576
				577	//
				578	// Debug - slide failing test cases early
				579	//
				// (Disabled scratch area: the block below is compiled out by "#if 0".)
				580	#if 0
				581	{
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	582	// REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", false, false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	583	UParseError pe;
				584	UErrorCode status = U_ZERO_ERROR;
				585	RegexPattern *pattern;
				586	pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
				587	pattern->dumpPattern();
				588	RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
				589	UBool result = m->find();
				590	printf("result = %d\n", result);
				591	// REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
				592	// REGEX_FIND("(X([abc=X]+)+X)\|(y[abc=]+)", "=XX====================");
				593	}
				594	exit(1);
				595	#endif
				596
				597
				598	//
				599	// Pattern with parentheses
				600	//
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	601	REGEX_TESTLM("st(abc)ring", "stabcring thing", true, false);
				602	REGEX_TESTLM("st(abc)ring", "stabcring", true, true);
				603	REGEX_TESTLM("st(abc)ring", "stabcrung", false, false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	604
				605	//
				606	// Patterns with *
				607	//
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	608	REGEX_TESTLM("st(abc)*ring", "string", true, true);
				609	REGEX_TESTLM("st(abc)*ring", "stabcring", true, true);
				610	REGEX_TESTLM("st(abc)*ring", "stabcabcring", true, true);
				611	REGEX_TESTLM("st(abc)*ring", "stabcabcdring", false, false);
				612	REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", true, false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	613
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	614	REGEX_TESTLM("a*", "", true, true);
				615	REGEX_TESTLM("a*", "b", true, false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	616
				617
				618	//
				619	// Patterns with "."
				620	//
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	621	REGEX_TESTLM(".", "abc", true, false);
				622	REGEX_TESTLM("...", "abc", true, true);
				623	REGEX_TESTLM("....", "abc", false, false);
				624	REGEX_TESTLM(".*", "abcxyz123", true, true);
				625	REGEX_TESTLM("ab.*xyz", "abcdefghij", false, false);
				626	REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", true, true);
				627	REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", true, true);
				628	REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", true, false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	629
				630	//
				631	// Patterns with * applied to chars at end of literal string
				632	//
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	633	REGEX_TESTLM("abc*", "ab", true, true);
				634	REGEX_TESTLM("abc*", "abccccc", true, true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	635
				636	//
				637	// Supplemental chars match as single chars, not a pair of surrogates.
				638	//
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	639	REGEX_TESTLM(".", "\\U00011000", true, true);
				640	REGEX_TESTLM("...", "\\U00011000x\\U00012002", true, true);
				641	REGEX_TESTLM("...", "\\U00011000x\\U00012002y", true, false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	642
				643
				644	//
				645	// UnicodeSets in the pattern
				646	//
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	647	REGEX_TESTLM("[1-6]", "1", true, true);
				648	REGEX_TESTLM("[1-6]", "3", true, true);
				649	REGEX_TESTLM("[1-6]", "7", false, false);
				650	REGEX_TESTLM("a[1-6]", "a3", true, true);
				// NOTE(review): the next line repeats the one above verbatim.
				651	REGEX_TESTLM("a[1-6]", "a3", true, true);
				652	REGEX_TESTLM("a[1-6]b", "a3b", true, true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	653
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	654	REGEX_TESTLM("a[0-9]*b", "a123b", true, true);
				655	REGEX_TESTLM("a[0-9]*b", "abc", true, false);
				656	REGEX_TESTLM("[\\p{Nd}]*", "123456", true, true);
				657	REGEX_TESTLM("[\\p{Nd}]", "a123456", true, false); // note that matches 0 occurrences.
				658	REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", true, true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	659
				660	//
				661	// OR operator in patterns
				662	//
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	663	REGEX_TESTLM("(a\|b)", "a", true, true);
				664	REGEX_TESTLM("(a\|b)", "b", true, true);
				665	REGEX_TESTLM("(a\|b)", "c", false, false);
				666	REGEX_TESTLM("a\|b", "b", true, true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	667
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	668	REGEX_TESTLM("(a\|b\|c)*", "aabcaaccbcabc", true, true);
				669	REGEX_TESTLM("(a\|b\|c)*", "aabcaaccbcabdc", true, false);
				670	REGEX_TESTLM("(a(b\|c\|d)(x\|y\|z)*\|123)", "ac", true, true);
				671	REGEX_TESTLM("(a(b\|c\|d)(x\|y\|z)*\|123)", "123", true, true);
				672	REGEX_TESTLM("(a\|(1\|2))(b\|c\|d)(x\|y\|z)\|123", "123", true, true);
				673	REGEX_TESTLM("(a\|(1\|2))(b\|c\|d)(x\|y\|z)\|123", "222211111czzzzw", true, false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	674
				675	//
				676	// +
				677	//
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	678	REGEX_TESTLM("ab+", "abbc", true, false);
				679	REGEX_TESTLM("ab+c", "ac", false, false);
				680	REGEX_TESTLM("b+", "", false, false);
				681	REGEX_TESTLM("(abc\|def)+", "defabc", true, true);
				682	REGEX_TESTLM(".+y", "zippity dooy dah ", true, false);
				683	REGEX_TESTLM(".+y", "zippity dooy", true, true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	684
				685	//
				686	// ?
				687	//
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	688	REGEX_TESTLM("ab?", "ab", true, true);
				689	REGEX_TESTLM("ab?", "a", true, true);
				690	REGEX_TESTLM("ab?", "ac", true, false);
				691	REGEX_TESTLM("ab?", "abb", true, false);
				692	REGEX_TESTLM("a(b\|c)?d", "abd", true, true);
				693	REGEX_TESTLM("a(b\|c)?d", "acd", true, true);
				694	REGEX_TESTLM("a(b\|c)?d", "ad", true, true);
				695	REGEX_TESTLM("a(b\|c)?d", "abcd", false, false);
				696	REGEX_TESTLM("a(b\|c)?d", "ab", false, false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	697
				698	//
				699	// Escape sequences that become single literal chars, handled internally
				700	// by ICU's Unescape.
				701	//
				702
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	703	// REGEX_TESTLM("\101\142", "Ab", true, true); // Octal TODO: not implemented yet.
				704	REGEX_TESTLM("\\a", "\\u0007", true, true); // BEL
				705	REGEX_TESTLM("\\cL", "\\u000c", true, true); // Control-L
				706	REGEX_TESTLM("\\e", "\\u001b", true, true); // Escape
				707	REGEX_TESTLM("\\f", "\\u000c", true, true); // Form Feed
				708	REGEX_TESTLM("\\n", "\\u000a", true, true); // new line
				709	REGEX_TESTLM("\\r", "\\u000d", true, true); // CR
				710	REGEX_TESTLM("\\t", "\\u0009", true, true); // Tab
				711	REGEX_TESTLM("\\u1234", "\\u1234", true, true);
				712	REGEX_TESTLM("\\U00001234", "\\u1234", true, true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	713
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	714	REGEX_TESTLM(".*\\Ax", "xyz", true, false); // \A matches only at the beginning of input
				715	REGEX_TESTLM(".*\\Ax", " xyz", false, false); // \A matches only at the beginning of input
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	716
				717	// Escape of special chars in patterns
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	718	REGEX_TESTLM("\\\\\\\|\$\$\\[\\{\\~\\$\\\\+\\?\\.", "\\\\\|()[{~$+?.", true, true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	719	}
				720
				721
				722	//---------------------------------------------------------------------------
				723	//
				724	// UTextBasic Check for quirks that are specific to the UText
				725	// implementation.
				726	//
				727	//---------------------------------------------------------------------------
				728	void RegexTest::UTextBasic() {
				729	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
				730	UErrorCode status = U_ZERO_ERROR;
				731	UText pattern = UTEXT_INITIALIZER;
				732	utext_openUTF8(&pattern, str_abc, -1, &status);
				733	RegexMatcher matcher(&pattern, 0, status);
				734	REGEX_CHECK_STATUS;
				735
				736	UText input = UTEXT_INITIALIZER;
				737	utext_openUTF8(&input, str_abc, -1, &status);
				738	REGEX_CHECK_STATUS;
				739	matcher.reset(&input);
				740	REGEX_CHECK_STATUS;
				741	REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
				742
				743	matcher.reset(matcher.inputText());
				744	REGEX_CHECK_STATUS;
				745	REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
				746
				747	utext_close(&pattern);
				748	utext_close(&input);
				749	}
				750
				751
				752	//---------------------------------------------------------------------------
				753	//
				754	// API_Match Test that the API for class RegexMatcher
				755	// is present and nominally working, but excluding functions
				756	// implementing replace operations.
				757	//
				758	//---------------------------------------------------------------------------
				759	void RegexTest::API_Match() {
				760	UParseError pe;
				761	UErrorCode status=U_ZERO_ERROR;
				762	int32_t flags = 0;
				763
				764	//
				765	// Debug - slide failing test cases early
				766	//
				767	#if 0
				768	{
				769	}
				770	return;
				771	#endif
				772
				773	//
				774	// Simple pattern compilation
				775	//
				776	{
				777	UnicodeString re("abc");
				778	RegexPattern *pat2;
				779	pat2 = RegexPattern::compile(re, flags, pe, status);
				780	REGEX_CHECK_STATUS;
				781
				782	UnicodeString inStr1 = "abcdef this is a test";
				783	UnicodeString instr2 = "not abc";
				784	UnicodeString empty = "";
				785
				786
				787	//
				788	// Matcher creation and reset.
				789	//
				790	RegexMatcher *m1 = pat2->matcher(inStr1, status);
				791	REGEX_CHECK_STATUS;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	792	REGEX_ASSERT(m1->lookingAt(status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	793	REGEX_ASSERT(m1->input() == inStr1);
				794	m1->reset(instr2);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	795	REGEX_ASSERT(m1->lookingAt(status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	796	REGEX_ASSERT(m1->input() == instr2);
				797	m1->reset(inStr1);
				798	REGEX_ASSERT(m1->input() == inStr1);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	799	REGEX_ASSERT(m1->lookingAt(status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	800	m1->reset(empty);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	801	REGEX_ASSERT(m1->lookingAt(status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	802	REGEX_ASSERT(m1->input() == empty);
				803	REGEX_ASSERT(&m1->pattern() == pat2);
				804
				805	//
				806	// reset(pos, status)
				807	//
				808	m1->reset(inStr1);
				809	m1->reset(4, status);
				810	REGEX_CHECK_STATUS;
				811	REGEX_ASSERT(m1->input() == inStr1);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	812	REGEX_ASSERT(m1->lookingAt(status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	813
				814	m1->reset(-1, status);
				815	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				816	status = U_ZERO_ERROR;
				817
				818	m1->reset(0, status);
				819	REGEX_CHECK_STATUS;
				820	status = U_ZERO_ERROR;
				821
				822	int32_t len = m1->input().length();
				823	m1->reset(len-1, status);
				824	REGEX_CHECK_STATUS;
				825	status = U_ZERO_ERROR;
				826
				827	m1->reset(len, status);
				828	REGEX_CHECK_STATUS;
				829	status = U_ZERO_ERROR;
				830
				831	m1->reset(len+1, status);
				832	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				833	status = U_ZERO_ERROR;
				834
				835	//
				836	// match(pos, status)
				837	//
				838	m1->reset(instr2);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	839	REGEX_ASSERT(m1->matches(4, status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	840	m1->reset();
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	841	REGEX_ASSERT(m1->matches(3, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	842	m1->reset();
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	843	REGEX_ASSERT(m1->matches(5, status) == false);
				844	REGEX_ASSERT(m1->matches(4, status) == true);
				845	REGEX_ASSERT(m1->matches(-1, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	846	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				847
				848	// Match() at end of string should fail, but should not
				849	// be an error.
				850	status = U_ZERO_ERROR;
				851	len = m1->input().length();
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	852	REGEX_ASSERT(m1->matches(len, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	853	REGEX_CHECK_STATUS;
				854
				855	// Match beyond end of string should fail with an error.
				856	status = U_ZERO_ERROR;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	857	REGEX_ASSERT(m1->matches(len+1, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	858	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				859
				860	// Successful match at end of string.
				861	{
				862	status = U_ZERO_ERROR;
				863	RegexMatcher m("A?", 0, status); // will match zero length string.
				864	REGEX_CHECK_STATUS;
				865	m.reset(inStr1);
				866	len = inStr1.length();
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	867	REGEX_ASSERT(m.matches(len, status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	868	REGEX_CHECK_STATUS;
				869	m.reset(empty);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	870	REGEX_ASSERT(m.matches(0, status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	871	REGEX_CHECK_STATUS;
				872	}
				873
				874
				875	//
				876	// lookingAt(pos, status)
				877	//
				878	status = U_ZERO_ERROR;
				879	m1->reset(instr2); // "not abc"
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	880	REGEX_ASSERT(m1->lookingAt(4, status) == true);
				881	REGEX_ASSERT(m1->lookingAt(5, status) == false);
				882	REGEX_ASSERT(m1->lookingAt(3, status) == false);
				883	REGEX_ASSERT(m1->lookingAt(4, status) == true);
				884	REGEX_ASSERT(m1->lookingAt(-1, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	885	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				886	status = U_ZERO_ERROR;
				887	len = m1->input().length();
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	888	REGEX_ASSERT(m1->lookingAt(len, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	889	REGEX_CHECK_STATUS;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	890	REGEX_ASSERT(m1->lookingAt(len+1, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	891	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				892
				893	delete m1;
				894	delete pat2;
				895	}
				896
				897
				898	//
				899	// Capture Group.
				900	// RegexMatcher::start();
				901	// RegexMatcher::end();
				902	// RegexMatcher::groupCount();
				903	//
				904	{
				905	int32_t flags=0;
				906	UParseError pe;
				907	UErrorCode status=U_ZERO_ERROR;
				908
				909	UnicodeString re("01(23(45)67)(.*)");
				910	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
				911	REGEX_CHECK_STATUS;
				912	UnicodeString data = "0123456789";
				913
				914	RegexMatcher *matcher = pat->matcher(data, status);
				915	REGEX_CHECK_STATUS;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	916	REGEX_ASSERT(matcher->lookingAt(status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	917	static const int32_t matchStarts[] = {0, 2, 4, 8};
				918	static const int32_t matchEnds[] = {10, 8, 6, 10};
				919	int32_t i;
				920	for (i=0; i<4; i++) {
				921	int32_t actualStart = matcher->start(i, status);
				922	REGEX_CHECK_STATUS;
				923	if (actualStart != matchStarts[i]) {
				924	errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
				925	__LINE__, i, matchStarts[i], actualStart);
				926	}
				927	int32_t actualEnd = matcher->end(i, status);
				928	REGEX_CHECK_STATUS;
				929	if (actualEnd != matchEnds[i]) {
				930	errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
				931	__LINE__, i, matchEnds[i], actualEnd);
				932	}
				933	}
				934
				935	REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
				936	REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
				937
				938	REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
				939	REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
				940	matcher->reset();
				941	REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
				942
				943	matcher->lookingAt(status);
				944	REGEX_ASSERT(matcher->group(status) == "0123456789");
				945	REGEX_ASSERT(matcher->group(0, status) == "0123456789");
				946	REGEX_ASSERT(matcher->group(1, status) == "234567" );
				947	REGEX_ASSERT(matcher->group(2, status) == "45" );
				948	REGEX_ASSERT(matcher->group(3, status) == "89" );
				949	REGEX_CHECK_STATUS;
				950	REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
				951	REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
				952	matcher->reset();
				953	REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
				954
				955	delete matcher;
				956	delete pat;
				957
				958	}
				959
				960	//
				961	// find
				962	//
				963	{
				964	int32_t flags=0;
				965	UParseError pe;
				966	UErrorCode status=U_ZERO_ERROR;
				967
				968	UnicodeString re("abc");
				969	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
				970	REGEX_CHECK_STATUS;
				971	UnicodeString data = ".abc..abc...abc..";
				972	// 012345678901234567
				973
				974	RegexMatcher *matcher = pat->matcher(data, status);
				975	REGEX_CHECK_STATUS;
				976	REGEX_ASSERT(matcher->find());
				977	REGEX_ASSERT(matcher->start(status) == 1);
				978	REGEX_ASSERT(matcher->find());
				979	REGEX_ASSERT(matcher->start(status) == 6);
				980	REGEX_ASSERT(matcher->find());
				981	REGEX_ASSERT(matcher->start(status) == 12);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	982	REGEX_ASSERT(matcher->find() == false);
				983	REGEX_ASSERT(matcher->find() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	984
				985	matcher->reset();
				986	REGEX_ASSERT(matcher->find());
				987	REGEX_ASSERT(matcher->start(status) == 1);
				988
				989	REGEX_ASSERT(matcher->find(0, status));
				990	REGEX_ASSERT(matcher->start(status) == 1);
				991	REGEX_ASSERT(matcher->find(1, status));
				992	REGEX_ASSERT(matcher->start(status) == 1);
				993	REGEX_ASSERT(matcher->find(2, status));
				994	REGEX_ASSERT(matcher->start(status) == 6);
				995	REGEX_ASSERT(matcher->find(12, status));
				996	REGEX_ASSERT(matcher->start(status) == 12);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	997	REGEX_ASSERT(matcher->find(13, status) == false);
				998	REGEX_ASSERT(matcher->find(16, status) == false);
				999	REGEX_ASSERT(matcher->find(17, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1000	REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
				1001
				1002	status = U_ZERO_ERROR;
				1003	REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
				1004	status = U_ZERO_ERROR;
				1005	REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
				1006
				1007	REGEX_ASSERT(matcher->groupCount() == 0);
				1008
				1009	delete matcher;
				1010	delete pat;
				1011	}
				1012
				1013
				1014	//
				1015	// find, with \G in pattern (true if at the end of a previous match).
				1016	//
				1017	{
				1018	int32_t flags=0;
				1019	UParseError pe;
				1020	UErrorCode status=U_ZERO_ERROR;
				1021
				1022	UnicodeString re(".*?(?:(\\Gabc)\|(abc))", -1, US_INV);
				1023	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
				1024	REGEX_CHECK_STATUS;
				1025	UnicodeString data = ".abcabc.abc..";
				1026	// 012345678901234567
				1027
				1028	RegexMatcher *matcher = pat->matcher(data, status);
				1029	REGEX_CHECK_STATUS;
				1030	REGEX_ASSERT(matcher->find());
				1031	REGEX_ASSERT(matcher->start(status) == 0);
				1032	REGEX_ASSERT(matcher->start(1, status) == -1);
				1033	REGEX_ASSERT(matcher->start(2, status) == 1);
				1034
				1035	REGEX_ASSERT(matcher->find());
				1036	REGEX_ASSERT(matcher->start(status) == 4);
				1037	REGEX_ASSERT(matcher->start(1, status) == 4);
				1038	REGEX_ASSERT(matcher->start(2, status) == -1);
				1039	REGEX_CHECK_STATUS;
				1040
				1041	delete matcher;
				1042	delete pat;
				1043	}
				1044
				1045	//
				1046	// find with zero length matches, match position should bump ahead
				1047	// to prevent loops.
				1048	//
				1049	{
				1050	int32_t i;
				1051	UErrorCode status=U_ZERO_ERROR;
				1052	RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
				1053	// using an always-true look-ahead.
				1054	REGEX_CHECK_STATUS;
				1055	UnicodeString s(" ");
				1056	m.reset(s);
				1057	for (i=0; ; i++) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1058	if (m.find() == false) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1059	break;
				1060	}
				1061	REGEX_ASSERT(m.start(status) == i);
				1062	REGEX_ASSERT(m.end(status) == i);
				1063	}
				1064	REGEX_ASSERT(i==5);
				1065
				1066	// Check that the bump goes over surrogate pairs OK
				1067	s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
				1068	s = s.unescape();
				1069	m.reset(s);
				1070	for (i=0; ; i+=2) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1071	if (m.find() == false) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1072	break;
				1073	}
				1074	REGEX_ASSERT(m.start(status) == i);
				1075	REGEX_ASSERT(m.end(status) == i);
				1076	}
				1077	REGEX_ASSERT(i==10);
				1078	}
				1079	{
				1080	// find() loop breaking test.
				1081	// with pattern of /.?/, should see a series of one char matches, then a single
				1082	// match of zero length at the end of the input string.
				1083	int32_t i;
				1084	UErrorCode status=U_ZERO_ERROR;
				1085	RegexMatcher m(".?", 0, status);
				1086	REGEX_CHECK_STATUS;
				1087	UnicodeString s(" ");
				1088	m.reset(s);
				1089	for (i=0; ; i++) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1090	if (m.find() == false) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1091	break;
				1092	}
				1093	REGEX_ASSERT(m.start(status) == i);
				1094	REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
				1095	}
				1096	REGEX_ASSERT(i==5);
				1097	}
				1098
				1099
				1100	//
				1101	// Matchers with no input string behave as if they had an empty input string.
				1102	//
				1103
				1104	{
				1105	UErrorCode status = U_ZERO_ERROR;
				1106	RegexMatcher m(".?", 0, status);
				1107	REGEX_CHECK_STATUS;
				1108	REGEX_ASSERT(m.find());
				1109	REGEX_ASSERT(m.start(status) == 0);
				1110	REGEX_ASSERT(m.input() == "");
				1111	}
				1112	{
				1113	UErrorCode status = U_ZERO_ERROR;
				1114	RegexPattern *p = RegexPattern::compile(".", 0, status);
				1115	RegexMatcher *m = p->matcher(status);
				1116	REGEX_CHECK_STATUS;
				1117
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1118	REGEX_ASSERT(m->find() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1119	REGEX_ASSERT(m->input() == "");
				1120	delete m;
				1121	delete p;
				1122	}
				1123
				1124	//
				1125	// Regions
				1126	//
				1127	{
				1128	UErrorCode status = U_ZERO_ERROR;
				1129	UnicodeString testString("This is test data");
				1130	RegexMatcher m(".*", testString, 0, status);
				1131	REGEX_CHECK_STATUS;
				1132	REGEX_ASSERT(m.regionStart() == 0);
				1133	REGEX_ASSERT(m.regionEnd() == testString.length());
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1134	REGEX_ASSERT(m.hasTransparentBounds() == false);
				1135	REGEX_ASSERT(m.hasAnchoringBounds() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1136
				1137	m.region(2,4, status);
				1138	REGEX_CHECK_STATUS;
				1139	REGEX_ASSERT(m.matches(status));
				1140	REGEX_ASSERT(m.start(status)==2);
				1141	REGEX_ASSERT(m.end(status)==4);
				1142	REGEX_CHECK_STATUS;
				1143
				1144	m.reset();
				1145	REGEX_ASSERT(m.regionStart() == 0);
				1146	REGEX_ASSERT(m.regionEnd() == testString.length());
				1147
				1148	UnicodeString shorterString("short");
				1149	m.reset(shorterString);
				1150	REGEX_ASSERT(m.regionStart() == 0);
				1151	REGEX_ASSERT(m.regionEnd() == shorterString.length());
				1152
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1153	REGEX_ASSERT(m.hasAnchoringBounds() == true);
				1154	REGEX_ASSERT(&m == &m.useAnchoringBounds(false));
				1155	REGEX_ASSERT(m.hasAnchoringBounds() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1156	REGEX_ASSERT(&m == &m.reset());
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1157	REGEX_ASSERT(m.hasAnchoringBounds() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1158
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1159	REGEX_ASSERT(&m == &m.useAnchoringBounds(true));
				1160	REGEX_ASSERT(m.hasAnchoringBounds() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1161	REGEX_ASSERT(&m == &m.reset());
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1162	REGEX_ASSERT(m.hasAnchoringBounds() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1163
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1164	REGEX_ASSERT(m.hasTransparentBounds() == false);
				1165	REGEX_ASSERT(&m == &m.useTransparentBounds(true));
				1166	REGEX_ASSERT(m.hasTransparentBounds() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1167	REGEX_ASSERT(&m == &m.reset());
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1168	REGEX_ASSERT(m.hasTransparentBounds() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1169
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1170	REGEX_ASSERT(&m == &m.useTransparentBounds(false));
				1171	REGEX_ASSERT(m.hasTransparentBounds() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1172	REGEX_ASSERT(&m == &m.reset());
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1173	REGEX_ASSERT(m.hasTransparentBounds() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1174
				1175	}
				1176
				1177	//
				1178	// hitEnd() and requireEnd()
				1179	//
				1180	{
				1181	UErrorCode status = U_ZERO_ERROR;
				1182	UnicodeString testString("aabb");
				1183	RegexMatcher m1(".*", testString, 0, status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1184	REGEX_ASSERT(m1.lookingAt(status) == true);
				1185	REGEX_ASSERT(m1.hitEnd() == true);
				1186	REGEX_ASSERT(m1.requireEnd() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1187	REGEX_CHECK_STATUS;
				1188
				1189	status = U_ZERO_ERROR;
				1190	RegexMatcher m2("a*", testString, 0, status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1191	REGEX_ASSERT(m2.lookingAt(status) == true);
				1192	REGEX_ASSERT(m2.hitEnd() == false);
				1193	REGEX_ASSERT(m2.requireEnd() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1194	REGEX_CHECK_STATUS;
				1195
				1196	status = U_ZERO_ERROR;
				1197	RegexMatcher m3(".*$", testString, 0, status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1198	REGEX_ASSERT(m3.lookingAt(status) == true);
				1199	REGEX_ASSERT(m3.hitEnd() == true);
				1200	REGEX_ASSERT(m3.requireEnd() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1201	REGEX_CHECK_STATUS;
				1202	}
				1203
				1204
				1205	//
				1206	// Compilation error on reset with UChar *
				1207	// These were a hazard that people were stumbling over with runtime errors.
				1208	// Changed them to compiler errors by adding private methods that more closely
				1209	// matched the incorrect use of the functions.
				1210	//
				1211	#if 0
				1212	{
				1213	UErrorCode status = U_ZERO_ERROR;
				1214	UChar ucharString[20];
				1215	RegexMatcher m(".", 0, status);
				1216	m.reset(ucharString); // should not compile.
				1217
				1218	RegexPattern *p = RegexPattern::compile(".", 0, status);
				1219	RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
				1220
				1221	RegexMatcher m3(".", ucharString, 0, status); // Should not compile
				1222	}
				1223	#endif
				1224
				1225	//
				1226	// Time Outs.
				1227	// Note: These tests will need to be changed when the regexp engine is
				1228	// able to detect and cut short the exponential time behavior on
				1229	// this type of match.
				1230	//
				1231	{
				1232	UErrorCode status = U_ZERO_ERROR;
				1233	// Enough 'a's in the string to cause the match to time out.
				1234	// (Each on additional 'a' doubles the time)
				1235	UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
				1236	RegexMatcher matcher("(a+)+b", testString, 0, status);
				1237	REGEX_CHECK_STATUS;
				1238	REGEX_ASSERT(matcher.getTimeLimit() == 0);
				1239	matcher.setTimeLimit(100, status);
				1240	REGEX_ASSERT(matcher.getTimeLimit() == 100);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1241	REGEX_ASSERT(matcher.lookingAt(status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1242	REGEX_ASSERT(status == U_REGEX_TIME_OUT);
				1243	}
				1244	{
				1245	UErrorCode status = U_ZERO_ERROR;
				1246	// Few enough 'a's to slip in under the time limit.
				1247	UnicodeString testString("aaaaaaaaaaaaaaaaaa");
				1248	RegexMatcher matcher("(a+)+b", testString, 0, status);
				1249	REGEX_CHECK_STATUS;
				1250	matcher.setTimeLimit(100, status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1251	REGEX_ASSERT(matcher.lookingAt(status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1252	REGEX_CHECK_STATUS;
				1253	}
				1254
				1255	//
				1256	// Stack Limits
				1257	//
				1258	{
				1259	UErrorCode status = U_ZERO_ERROR;
				1260	UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
				1261
				1262	// Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
				1263	// of the '+', and makes the stack frames larger.
				1264	RegexMatcher matcher("(A)+A$", testString, 0, status);
				1265
				1266	// With the default stack, this match should fail to run
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1267	REGEX_ASSERT(matcher.lookingAt(status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1268	REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
				1269
				1270	// With unlimited stack, it should run
				1271	status = U_ZERO_ERROR;
				1272	matcher.setStackLimit(0, status);
				1273	REGEX_CHECK_STATUS;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1274	REGEX_ASSERT(matcher.lookingAt(status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1275	REGEX_CHECK_STATUS;
				1276	REGEX_ASSERT(matcher.getStackLimit() == 0);
				1277
				1278	// With a limited stack, it the match should fail
				1279	status = U_ZERO_ERROR;
				1280	matcher.setStackLimit(10000, status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1281	REGEX_ASSERT(matcher.lookingAt(status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1282	REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
				1283	REGEX_ASSERT(matcher.getStackLimit() == 10000);
				1284	}
				1285
				1286	// A pattern that doesn't save state should work with
				1287	// a minimal sized stack
				1288	{
				1289	UErrorCode status = U_ZERO_ERROR;
				1290	UnicodeString testString = "abc";
				1291	RegexMatcher matcher("abc", testString, 0, status);
				1292	REGEX_CHECK_STATUS;
				1293	matcher.setStackLimit(30, status);
				1294	REGEX_CHECK_STATUS;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1295	REGEX_ASSERT(matcher.matches(status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1296	REGEX_CHECK_STATUS;
				1297	REGEX_ASSERT(matcher.getStackLimit() == 30);
				1298
				1299	// Negative stack sizes should fail
				1300	status = U_ZERO_ERROR;
				1301	matcher.setStackLimit(1000, status);
				1302	REGEX_CHECK_STATUS;
				1303	matcher.setStackLimit(-1, status);
				1304	REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
				1305	REGEX_ASSERT(matcher.getStackLimit() == 1000);
				1306	}
				1307
				1308
				1309	}
				1310
				1311
				1312
				1313
				1314
				1315
				1316	//---------------------------------------------------------------------------
				1317	//
				1318	// API_Replace API test for class RegexMatcher, testing the
				1319	// Replace family of functions.
				1320	//
				1321	//---------------------------------------------------------------------------
				1322	void RegexTest::API_Replace() {
				1323	//
				1324	// Replace
				1325	//
				1326	int32_t flags=0;
				1327	UParseError pe;
				1328	UErrorCode status=U_ZERO_ERROR;
				1329
				1330	UnicodeString re("abc");
				1331	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
				1332	REGEX_CHECK_STATUS;
				1333	UnicodeString data = ".abc..abc...abc..";
				1334	// 012345678901234567
				1335	RegexMatcher *matcher = pat->matcher(data, status);
				1336
				1337	//
				1338	// Plain vanilla matches.
				1339	//
				1340	UnicodeString dest;
				1341	dest = matcher->replaceFirst("yz", status);
				1342	REGEX_CHECK_STATUS;
				1343	REGEX_ASSERT(dest == ".yz..abc...abc..");
				1344
				1345	dest = matcher->replaceAll("yz", status);
				1346	REGEX_CHECK_STATUS;
				1347	REGEX_ASSERT(dest == ".yz..yz...yz..");
				1348
				1349	//
				1350	// Plain vanilla non-matches.
				1351	//
				1352	UnicodeString d2 = ".abx..abx...abx..";
				1353	matcher->reset(d2);
				1354	dest = matcher->replaceFirst("yz", status);
				1355	REGEX_CHECK_STATUS;
				1356	REGEX_ASSERT(dest == ".abx..abx...abx..");
				1357
				1358	dest = matcher->replaceAll("yz", status);
				1359	REGEX_CHECK_STATUS;
				1360	REGEX_ASSERT(dest == ".abx..abx...abx..");
				1361
				1362	//
				1363	// Empty source string
				1364	//
				1365	UnicodeString d3 = "";
				1366	matcher->reset(d3);
				1367	dest = matcher->replaceFirst("yz", status);
				1368	REGEX_CHECK_STATUS;
				1369	REGEX_ASSERT(dest == "");
				1370
				1371	dest = matcher->replaceAll("yz", status);
				1372	REGEX_CHECK_STATUS;
				1373	REGEX_ASSERT(dest == "");
				1374
				1375	//
				1376	// Empty substitution string
				1377	//
				1378	matcher->reset(data); // ".abc..abc...abc.."
				1379	dest = matcher->replaceFirst("", status);
				1380	REGEX_CHECK_STATUS;
				1381	REGEX_ASSERT(dest == "...abc...abc..");
				1382
				1383	dest = matcher->replaceAll("", status);
				1384	REGEX_CHECK_STATUS;
				1385	REGEX_ASSERT(dest == "........");
				1386
				1387	//
				1388	// match whole string
				1389	//
				1390	UnicodeString d4 = "abc";
				1391	matcher->reset(d4);
				1392	dest = matcher->replaceFirst("xyz", status);
				1393	REGEX_CHECK_STATUS;
				1394	REGEX_ASSERT(dest == "xyz");
				1395
				1396	dest = matcher->replaceAll("xyz", status);
				1397	REGEX_CHECK_STATUS;
				1398	REGEX_ASSERT(dest == "xyz");
				1399
				1400	//
				1401	// Capture Group, simple case
				1402	//
				1403	UnicodeString re2("a(..)");
				1404	RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
				1405	REGEX_CHECK_STATUS;
				1406	UnicodeString d5 = "abcdefg";
				1407	RegexMatcher *matcher2 = pat2->matcher(d5, status);
				1408	REGEX_CHECK_STATUS;
				1409	dest = matcher2->replaceFirst("$1$1", status);
				1410	REGEX_CHECK_STATUS;
				1411	REGEX_ASSERT(dest == "bcbcdefg");
				1412
				1413	dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
				1414	REGEX_CHECK_STATUS;
				1415	REGEX_ASSERT(dest == "The value of $1 is bc.defg");
				1416
				1417	dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
				1418	REGEX_ASSERT(U_FAILURE(status));
				1419	status = U_ZERO_ERROR;
				1420
				1421	UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
				1422	replacement = replacement.unescape();
				1423	dest = matcher2->replaceFirst(replacement, status);
				1424	REGEX_CHECK_STATUS;
				1425	REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
				1426
				1427	REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
				1428
				1429
				1430	//
				1431	// Replacement String with \u hex escapes
				1432	//
				1433	{
				1434	UnicodeString src = "abc 1 abc 2 abc 3";
				1435	UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
				1436	matcher->reset(src);
				1437	UnicodeString result = matcher->replaceAll(substitute, status);
				1438	REGEX_CHECK_STATUS;
				1439	REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
				1440	}
				1441	{
				1442	UnicodeString src = "abc !";
				1443	UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
				1444	matcher->reset(src);
				1445	UnicodeString result = matcher->replaceAll(substitute, status);
				1446	REGEX_CHECK_STATUS;
				1447	UnicodeString expected = UnicodeString("--");
				1448	expected.append((UChar32)0x10000);
				1449	expected.append("-- !");
				1450	REGEX_ASSERT(result == expected);
				1451	}
				1452	// TODO: need more through testing of capture substitutions.
				1453
				1454	// Bug 4057
				1455	//
				1456	{
				1457	status = U_ZERO_ERROR;
				1458	UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
				1459	RegexMatcher m("ss(.*?)ee", 0, status);
				1460	REGEX_CHECK_STATUS;
				1461	UnicodeString result;
				1462
				1463	// Multiple finds do NOT bump up the previous appendReplacement position.
				1464	m.reset(s);
				1465	m.find();
				1466	m.find();
				1467	m.appendReplacement(result, "ooh", status);
				1468	REGEX_CHECK_STATUS;
				1469	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
				1470
				1471	// After a reset into the interior of a string, appendReplacemnt still starts at beginning.
				1472	status = U_ZERO_ERROR;
				1473	result.truncate(0);
				1474	m.reset(10, status);
				1475	m.find();
				1476	m.find();
				1477	m.appendReplacement(result, "ooh", status);
				1478	REGEX_CHECK_STATUS;
				1479	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
				1480
				1481	// find() at interior of string, appendReplacemnt still starts at beginning.
				1482	status = U_ZERO_ERROR;
				1483	result.truncate(0);
				1484	m.reset();
				1485	m.find(10, status);
				1486	m.find();
				1487	m.appendReplacement(result, "ooh", status);
				1488	REGEX_CHECK_STATUS;
				1489	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
				1490
				1491	m.appendTail(result);
				1492	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
				1493
				1494	}
				1495
				1496	delete matcher2;
				1497	delete pat2;
				1498	delete matcher;
				1499	delete pat;
				1500	}
				1501
				1502
				1503	//---------------------------------------------------------------------------
				1504	//
				1505	// API_Pattern Test that the API for class RegexPattern is
				1506	// present and nominally working.
				1507	//
				1508	//---------------------------------------------------------------------------
				1509	void RegexTest::API_Pattern() {
				1510	RegexPattern pata; // Test default constructor to not crash.
				1511	RegexPattern patb;
				1512
				1513	REGEX_ASSERT(pata == patb);
				1514	REGEX_ASSERT(pata == pata);
				1515
				1516	UnicodeString re1("abc[a-l][m-z]");
				1517	UnicodeString re2("def");
				1518	UErrorCode status = U_ZERO_ERROR;
				1519	UParseError pe;
				1520
				1521	RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
				1522	RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
				1523	REGEX_CHECK_STATUS;
				1524	REGEX_ASSERT(pat1 == pat1);
				1525	REGEX_ASSERT(*pat1 != pata);
				1526
				1527	// Assign
				1528	patb = *pat1;
				1529	REGEX_ASSERT(patb == *pat1);
				1530
				1531	// Copy Construct
				1532	RegexPattern patc(*pat1);
				1533	REGEX_ASSERT(patc == *pat1);
				1534	REGEX_ASSERT(patb == patc);
				1535	REGEX_ASSERT(pat1 != pat2);
				1536	patb = *pat2;
				1537	REGEX_ASSERT(patb != patc);
				1538	REGEX_ASSERT(patb == *pat2);
				1539
				1540	// Compile with no flags.
				1541	RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
				1542	REGEX_ASSERT(pat1a == pat1);
				1543
				1544	REGEX_ASSERT(pat1a->flags() == 0);
				1545
				1546	// Compile with different flags should be not equal
				1547	RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
				1548	REGEX_CHECK_STATUS;
				1549
				1550	REGEX_ASSERT(pat1b != pat1a);
				1551	REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
				1552	REGEX_ASSERT(pat1a->flags() == 0);
				1553	delete pat1b;
				1554
				1555	// clone
				1556	RegexPattern *pat1c = pat1->clone();
				1557	REGEX_ASSERT(pat1c == pat1);
				1558	REGEX_ASSERT(pat1c != pat2);
				1559
				1560	delete pat1c;
				1561	delete pat1a;
				1562	delete pat1;
				1563	delete pat2;
				1564
				1565
				1566	//
				1567	// Verify that a matcher created from a cloned pattern works.
				1568	// (Jitterbug 3423)
				1569	//
				1570	{
				1571	UErrorCode status = U_ZERO_ERROR;
				1572	RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
				1573	RegexPattern *pClone = pSource->clone();
				1574	delete pSource;
				1575	RegexMatcher *mFromClone = pClone->matcher(status);
				1576	REGEX_CHECK_STATUS;
				1577	UnicodeString s = "Hello World";
				1578	mFromClone->reset(s);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1579	REGEX_ASSERT(mFromClone->find() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1580	REGEX_ASSERT(mFromClone->group(status) == "Hello");
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1581	REGEX_ASSERT(mFromClone->find() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1582	REGEX_ASSERT(mFromClone->group(status) == "World");
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1583	REGEX_ASSERT(mFromClone->find() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1584	delete mFromClone;
				1585	delete pClone;
				1586	}
				1587
				1588	//
				1589	// matches convenience API
				1590	//
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1591	REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1592	REGEX_CHECK_STATUS;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1593	REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1594	REGEX_CHECK_STATUS;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1595	REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1596	REGEX_CHECK_STATUS;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1597	REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1598	REGEX_CHECK_STATUS;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1599	REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1600	REGEX_CHECK_STATUS;
				1601	status = U_INDEX_OUTOFBOUNDS_ERROR;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1602	REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1603	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				1604
				1605
				1606	//
				1607	// Split()
				1608	//
				1609	status = U_ZERO_ERROR;
				1610	pat1 = RegexPattern::compile(" +", pe, status);
				1611	REGEX_CHECK_STATUS;
				1612	UnicodeString fields[10];
				1613
				1614	int32_t n;
				1615	n = pat1->split("Now is the time", fields, 10, status);
				1616	REGEX_CHECK_STATUS;
				1617	REGEX_ASSERT(n==4);
				1618	REGEX_ASSERT(fields[0]=="Now");
				1619	REGEX_ASSERT(fields[1]=="is");
				1620	REGEX_ASSERT(fields[2]=="the");
				1621	REGEX_ASSERT(fields[3]=="time");
				1622	REGEX_ASSERT(fields[4]=="");
				1623
				1624	n = pat1->split("Now is the time", fields, 2, status);
				1625	REGEX_CHECK_STATUS;
				1626	REGEX_ASSERT(n==2);
				1627	REGEX_ASSERT(fields[0]=="Now");
				1628	REGEX_ASSERT(fields[1]=="is the time");
				1629	REGEX_ASSERT(fields[2]=="the"); // left over from previous test
				1630
				1631	fields[1] = "*";
				1632	status = U_ZERO_ERROR;
				1633	n = pat1->split("Now is the time", fields, 1, status);
				1634	REGEX_CHECK_STATUS;
				1635	REGEX_ASSERT(n==1);
				1636	REGEX_ASSERT(fields[0]=="Now is the time");
				1637	REGEX_ASSERT(fields[1]=="*");
				1638	status = U_ZERO_ERROR;
				1639
				1640	n = pat1->split(" Now is the time ", fields, 10, status);
				1641	REGEX_CHECK_STATUS;
				1642	REGEX_ASSERT(n==6);
				1643	REGEX_ASSERT(fields[0]=="");
				1644	REGEX_ASSERT(fields[1]=="Now");
				1645	REGEX_ASSERT(fields[2]=="is");
				1646	REGEX_ASSERT(fields[3]=="the");
				1647	REGEX_ASSERT(fields[4]=="time");
				1648	REGEX_ASSERT(fields[5]=="");
				1649
				1650	n = pat1->split(" ", fields, 10, status);
				1651	REGEX_CHECK_STATUS;
				1652	REGEX_ASSERT(n==2);
				1653	REGEX_ASSERT(fields[0]=="");
				1654	REGEX_ASSERT(fields[1]=="");
				1655
				1656	fields[0] = "foo";
				1657	n = pat1->split("", fields, 10, status);
				1658	REGEX_CHECK_STATUS;
				1659	REGEX_ASSERT(n==0);
				1660	REGEX_ASSERT(fields[0]=="foo");
				1661
				1662	delete pat1;
				1663
				1664	// split, with a pattern with (capture)
				1665	pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
				1666	REGEX_CHECK_STATUS;
				1667
				1668	status = U_ZERO_ERROR;
				1669	n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
				1670	REGEX_CHECK_STATUS;
				1671	REGEX_ASSERT(n==7);
				1672	REGEX_ASSERT(fields[0]=="");
				1673	REGEX_ASSERT(fields[1]=="a");
				1674	REGEX_ASSERT(fields[2]=="Now is ");
				1675	REGEX_ASSERT(fields[3]=="b");
				1676	REGEX_ASSERT(fields[4]=="the time");
				1677	REGEX_ASSERT(fields[5]=="c");
				1678	REGEX_ASSERT(fields[6]=="");
				1679	REGEX_ASSERT(status==U_ZERO_ERROR);
				1680
				1681	n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
				1682	REGEX_CHECK_STATUS;
				1683	REGEX_ASSERT(n==7);
				1684	REGEX_ASSERT(fields[0]==" ");
				1685	REGEX_ASSERT(fields[1]=="a");
				1686	REGEX_ASSERT(fields[2]=="Now is ");
				1687	REGEX_ASSERT(fields[3]=="b");
				1688	REGEX_ASSERT(fields[4]=="the time");
				1689	REGEX_ASSERT(fields[5]=="c");
				1690	REGEX_ASSERT(fields[6]=="");
				1691
				1692	status = U_ZERO_ERROR;
				1693	fields[6] = "foo";
				1694	n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
				1695	REGEX_CHECK_STATUS;
				1696	REGEX_ASSERT(n==6);
				1697	REGEX_ASSERT(fields[0]==" ");
				1698	REGEX_ASSERT(fields[1]=="a");
				1699	REGEX_ASSERT(fields[2]=="Now is ");
				1700	REGEX_ASSERT(fields[3]=="b");
				1701	REGEX_ASSERT(fields[4]=="the time");
				1702	REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
				1703	REGEX_ASSERT(fields[6]=="foo");
				1704
				1705	status = U_ZERO_ERROR;
				1706	fields[5] = "foo";
				1707	n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
				1708	REGEX_CHECK_STATUS;
				1709	REGEX_ASSERT(n==5);
				1710	REGEX_ASSERT(fields[0]==" ");
				1711	REGEX_ASSERT(fields[1]=="a");
				1712	REGEX_ASSERT(fields[2]=="Now is ");
				1713	REGEX_ASSERT(fields[3]=="b");
				1714	REGEX_ASSERT(fields[4]=="the time<c>");
				1715	REGEX_ASSERT(fields[5]=="foo");
				1716
				1717	status = U_ZERO_ERROR;
				1718	fields[5] = "foo";
				1719	n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
				1720	REGEX_CHECK_STATUS;
				1721	REGEX_ASSERT(n==5);
				1722	REGEX_ASSERT(fields[0]==" ");
				1723	REGEX_ASSERT(fields[1]=="a");
				1724	REGEX_ASSERT(fields[2]=="Now is ");
				1725	REGEX_ASSERT(fields[3]=="b");
				1726	REGEX_ASSERT(fields[4]=="the time");
				1727	REGEX_ASSERT(fields[5]=="foo");
				1728
				1729	status = U_ZERO_ERROR;
				1730	n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
				1731	REGEX_CHECK_STATUS;
				1732	REGEX_ASSERT(n==4);
				1733	REGEX_ASSERT(fields[0]==" ");
				1734	REGEX_ASSERT(fields[1]=="a");
				1735	REGEX_ASSERT(fields[2]=="Now is ");
				1736	REGEX_ASSERT(fields[3]=="the time<c>");
				1737	status = U_ZERO_ERROR;
				1738	delete pat1;
				1739
				1740	pat1 = RegexPattern::compile("([-,])", pe, status);
				1741	REGEX_CHECK_STATUS;
				1742	n = pat1->split("1-10,20", fields, 10, status);
				1743	REGEX_CHECK_STATUS;
				1744	REGEX_ASSERT(n==5);
				1745	REGEX_ASSERT(fields[0]=="1");
				1746	REGEX_ASSERT(fields[1]=="-");
				1747	REGEX_ASSERT(fields[2]=="10");
				1748	REGEX_ASSERT(fields[3]==",");
				1749	REGEX_ASSERT(fields[4]=="20");
				1750	delete pat1;
				1751
				1752	// Test split of string with empty trailing fields
				1753	pat1 = RegexPattern::compile(",", pe, status);
				1754	REGEX_CHECK_STATUS;
				1755	n = pat1->split("a,b,c,", fields, 10, status);
				1756	REGEX_CHECK_STATUS;
				1757	REGEX_ASSERT(n==4);
				1758	REGEX_ASSERT(fields[0]=="a");
				1759	REGEX_ASSERT(fields[1]=="b");
				1760	REGEX_ASSERT(fields[2]=="c");
				1761	REGEX_ASSERT(fields[3]=="");
				1762
				1763	n = pat1->split("a,,,", fields, 10, status);
				1764	REGEX_CHECK_STATUS;
				1765	REGEX_ASSERT(n==4);
				1766	REGEX_ASSERT(fields[0]=="a");
				1767	REGEX_ASSERT(fields[1]=="");
				1768	REGEX_ASSERT(fields[2]=="");
				1769	REGEX_ASSERT(fields[3]=="");
				1770	delete pat1;
				1771
				1772	// Split Separator with zero length match.
				1773	pat1 = RegexPattern::compile(":?", pe, status);
				1774	REGEX_CHECK_STATUS;
				1775	n = pat1->split("abc", fields, 10, status);
				1776	REGEX_CHECK_STATUS;
				1777	REGEX_ASSERT(n==5);
				1778	REGEX_ASSERT(fields[0]=="");
				1779	REGEX_ASSERT(fields[1]=="a");
				1780	REGEX_ASSERT(fields[2]=="b");
				1781	REGEX_ASSERT(fields[3]=="c");
				1782	REGEX_ASSERT(fields[4]=="");
				1783
				1784	delete pat1;
				1785
				1786	//
				1787	// RegexPattern::pattern()
				1788	//
				1789	pat1 = new RegexPattern();
				1790	REGEX_ASSERT(pat1->pattern() == "");
				1791	delete pat1;
				1792
				1793	pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
				1794	REGEX_CHECK_STATUS;
				1795	REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
				1796	delete pat1;
				1797
				1798
				1799	//
				1800	// classID functions
				1801	//
				1802	pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
				1803	REGEX_CHECK_STATUS;
				1804	REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
				1805	REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
				1806	UnicodeString Hello("Hello, world.");
				1807	RegexMatcher *m = pat1->matcher(Hello, status);
				1808	REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
				1809	REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
				1810	REGEX_ASSERT(m->getDynamicClassID() != NULL);
				1811	delete m;
				1812	delete pat1;
				1813
				1814	}
				1815
				1816	//---------------------------------------------------------------------------
				1817	//
				1818	// API_Match_UTF8 Test that the alternate engine for class RegexMatcher
				1819	// is present and working, but excluding functions
				1820	// implementing replace operations.
				1821	//
				1822	//---------------------------------------------------------------------------
				1823	void RegexTest::API_Match_UTF8() {
				1824	UParseError pe;
				1825	UErrorCode status=U_ZERO_ERROR;
				1826	int32_t flags = 0;
				1827
				1828	//
				1829	// Debug - slide failing test cases early
				1830	//
				1831	#if 0
				1832	{
				1833	}
				1834	return;
				1835	#endif
				1836
				1837	//
				1838	// Simple pattern compilation
				1839	//
				1840	{
				1841	UText re = UTEXT_INITIALIZER;
				1842	regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
				1843	REGEX_VERBOSE_TEXT(&re);
				1844	RegexPattern *pat2;
				1845	pat2 = RegexPattern::compile(&re, flags, pe, status);
				1846	REGEX_CHECK_STATUS;
				1847
				1848	UText input1 = UTEXT_INITIALIZER;
				1849	UText input2 = UTEXT_INITIALIZER;
				1850	UText empty = UTEXT_INITIALIZER;
				1851	regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
				1852	REGEX_VERBOSE_TEXT(&input1);
				1853	regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
				1854	REGEX_VERBOSE_TEXT(&input2);
				1855	utext_openUChars(&empty, NULL, 0, &status);
				1856
				1857	int32_t input1Len = static_cast<int32_t>(strlen("abcdef this is a test")); /* TODO: why not nativelen (input1) ? */
				1858	int32_t input2Len = static_cast<int32_t>(strlen("not abc"));
				1859
				1860
				1861	//
				1862	// Matcher creation and reset.
				1863	//
				1864	RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
				1865	REGEX_CHECK_STATUS;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1866	REGEX_ASSERT(m1->lookingAt(status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1867	const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
				1868	REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
				1869	m1->reset(&input2);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1870	REGEX_ASSERT(m1->lookingAt(status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1871	const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
				1872	REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
				1873	m1->reset(&input1);
				1874	REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1875	REGEX_ASSERT(m1->lookingAt(status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1876	m1->reset(&empty);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1877	REGEX_ASSERT(m1->lookingAt(status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1878	REGEX_ASSERT(utext_nativeLength(&empty) == 0);
				1879
				1880	//
				1881	// reset(pos, status)
				1882	//
				1883	m1->reset(&input1);
				1884	m1->reset(4, status);
				1885	REGEX_CHECK_STATUS;
				1886	REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1887	REGEX_ASSERT(m1->lookingAt(status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1888
				1889	m1->reset(-1, status);
				1890	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				1891	status = U_ZERO_ERROR;
				1892
				1893	m1->reset(0, status);
				1894	REGEX_CHECK_STATUS;
				1895	status = U_ZERO_ERROR;
				1896
				1897	m1->reset(input1Len-1, status);
				1898	REGEX_CHECK_STATUS;
				1899	status = U_ZERO_ERROR;
				1900
				1901	m1->reset(input1Len, status);
				1902	REGEX_CHECK_STATUS;
				1903	status = U_ZERO_ERROR;
				1904
				1905	m1->reset(input1Len+1, status);
				1906	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				1907	status = U_ZERO_ERROR;
				1908
				1909	//
				1910	// match(pos, status)
				1911	//
				1912	m1->reset(&input2);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1913	REGEX_ASSERT(m1->matches(4, status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1914	m1->reset();
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1915	REGEX_ASSERT(m1->matches(3, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1916	m1->reset();
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1917	REGEX_ASSERT(m1->matches(5, status) == false);
				1918	REGEX_ASSERT(m1->matches(4, status) == true);
				1919	REGEX_ASSERT(m1->matches(-1, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1920	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				1921
				1922	// Match() at end of string should fail, but should not
				1923	// be an error.
				1924	status = U_ZERO_ERROR;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1925	REGEX_ASSERT(m1->matches(input2Len, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1926	REGEX_CHECK_STATUS;
				1927
				1928	// Match beyond end of string should fail with an error.
				1929	status = U_ZERO_ERROR;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1930	REGEX_ASSERT(m1->matches(input2Len+1, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1931	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				1932
				1933	// Successful match at end of string.
				1934	{
				1935	status = U_ZERO_ERROR;
				1936	RegexMatcher m("A?", 0, status); // will match zero length string.
				1937	REGEX_CHECK_STATUS;
				1938	m.reset(&input1);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1939	REGEX_ASSERT(m.matches(input1Len, status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1940	REGEX_CHECK_STATUS;
				1941	m.reset(&empty);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1942	REGEX_ASSERT(m.matches(0, status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1943	REGEX_CHECK_STATUS;
				1944	}
				1945
				1946
				1947	//
				1948	// lookingAt(pos, status)
				1949	//
				1950	status = U_ZERO_ERROR;
				1951	m1->reset(&input2); // "not abc"
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1952	REGEX_ASSERT(m1->lookingAt(4, status) == true);
				1953	REGEX_ASSERT(m1->lookingAt(5, status) == false);
				1954	REGEX_ASSERT(m1->lookingAt(3, status) == false);
				1955	REGEX_ASSERT(m1->lookingAt(4, status) == true);
				1956	REGEX_ASSERT(m1->lookingAt(-1, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1957	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				1958	status = U_ZERO_ERROR;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1959	REGEX_ASSERT(m1->lookingAt(input2Len, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1960	REGEX_CHECK_STATUS;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1961	REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1962	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				1963
				1964	delete m1;
				1965	delete pat2;
				1966
				1967	utext_close(&re);
				1968	utext_close(&input1);
				1969	utext_close(&input2);
				1970	utext_close(&empty);
				1971	}
				1972
				1973
				1974	//
				1975	// Capture Group.
				1976	// RegexMatcher::start();
				1977	// RegexMatcher::end();
				1978	// RegexMatcher::groupCount();
				1979	//
				1980	{
				1981	int32_t flags=0;
				1982	UParseError pe;
				1983	UErrorCode status=U_ZERO_ERROR;
				1984	UText re=UTEXT_INITIALIZER;
				1985	const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
				1986	utext_openUTF8(&re, str_01234567_pat, -1, &status);
				1987
				1988	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
				1989	REGEX_CHECK_STATUS;
				1990
				1991	UText input = UTEXT_INITIALIZER;
				1992	const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
				1993	utext_openUTF8(&input, str_0123456789, -1, &status);
				1994
				1995	RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
				1996	REGEX_CHECK_STATUS;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1997	REGEX_ASSERT(matcher->lookingAt(status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1998	static const int32_t matchStarts[] = {0, 2, 4, 8};
				1999	static const int32_t matchEnds[] = {10, 8, 6, 10};
				2000	int32_t i;
				2001	for (i=0; i<4; i++) {
				2002	int32_t actualStart = matcher->start(i, status);
				2003	REGEX_CHECK_STATUS;
				2004	if (actualStart != matchStarts[i]) {
				2005	errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
				2006	__FILE__, __LINE__, i, matchStarts[i], actualStart);
				2007	}
				2008	int32_t actualEnd = matcher->end(i, status);
				2009	REGEX_CHECK_STATUS;
				2010	if (actualEnd != matchEnds[i]) {
				2011	errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
				2012	__FILE__, __LINE__, i, matchEnds[i], actualEnd);
				2013	}
				2014	}
				2015
				2016	REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
				2017	REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
				2018
				2019	REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
				2020	REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
				2021	matcher->reset();
				2022	REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
				2023
				2024	matcher->lookingAt(status);
				2025
				2026	UnicodeString dest;
				2027	UText destText = UTEXT_INITIALIZER;
				2028	utext_openUnicodeString(&destText, &dest, &status);
				2029	UText *result;
				2030	//const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
				2031	// Test shallow-clone API
				2032	int64_t group_len;
				2033	result = matcher->group((UText *)NULL, group_len, status);
				2034	REGEX_CHECK_STATUS;
				2035	REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
				2036	utext_close(result);
				2037	result = matcher->group(0, &destText, group_len, status);
				2038	REGEX_CHECK_STATUS;
				2039	REGEX_ASSERT(result == &destText);
				2040	REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
				2041	// destText is now immutable, reopen it
				2042	utext_close(&destText);
				2043	utext_openUnicodeString(&destText, &dest, &status);
				2044
				2045	int64_t length;
				2046	result = matcher->group(0, NULL, length, status);
				2047	REGEX_CHECK_STATUS;
				2048	REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
				2049	utext_close(result);
				2050	result = matcher->group(0, &destText, length, status);
				2051	REGEX_CHECK_STATUS;
				2052	REGEX_ASSERT(result == &destText);
				2053	REGEX_ASSERT(utext_getNativeIndex(result) == 0);
				2054	REGEX_ASSERT(length == 10);
				2055	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
				2056
				2057	// Capture Group 1 == "234567"
				2058	result = matcher->group(1, NULL, length, status);
				2059	REGEX_CHECK_STATUS;
				2060	REGEX_ASSERT(utext_getNativeIndex(result) == 2);
				2061	REGEX_ASSERT(length == 6);
				2062	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
				2063	utext_close(result);
				2064
				2065	result = matcher->group(1, &destText, length, status);
				2066	REGEX_CHECK_STATUS;
				2067	REGEX_ASSERT(result == &destText);
				2068	REGEX_ASSERT(utext_getNativeIndex(result) == 2);
				2069	REGEX_ASSERT(length == 6);
				2070	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
				2071	utext_close(result);
				2072
				2073	// Capture Group 2 == "45"
				2074	result = matcher->group(2, NULL, length, status);
				2075	REGEX_CHECK_STATUS;
				2076	REGEX_ASSERT(utext_getNativeIndex(result) == 4);
				2077	REGEX_ASSERT(length == 2);
				2078	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
				2079	utext_close(result);
				2080
				2081	result = matcher->group(2, &destText, length, status);
				2082	REGEX_CHECK_STATUS;
				2083	REGEX_ASSERT(result == &destText);
				2084	REGEX_ASSERT(utext_getNativeIndex(result) == 4);
				2085	REGEX_ASSERT(length == 2);
				2086	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
				2087	utext_close(result);
				2088
				2089	// Capture Group 3 == "89"
				2090	result = matcher->group(3, NULL, length, status);
				2091	REGEX_CHECK_STATUS;
				2092	REGEX_ASSERT(utext_getNativeIndex(result) == 8);
				2093	REGEX_ASSERT(length == 2);
				2094	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
				2095	utext_close(result);
				2096
				2097	result = matcher->group(3, &destText, length, status);
				2098	REGEX_CHECK_STATUS;
				2099	REGEX_ASSERT(result == &destText);
				2100	REGEX_ASSERT(utext_getNativeIndex(result) == 8);
				2101	REGEX_ASSERT(length == 2);
				2102	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
				2103	utext_close(result);
				2104
				2105	// Capture Group number out of range.
				2106	status = U_ZERO_ERROR;
				2107	REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
				2108	status = U_ZERO_ERROR;
				2109	REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
				2110	status = U_ZERO_ERROR;
				2111	matcher->reset();
				2112	REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
				2113
				2114	delete matcher;
				2115	delete pat;
				2116
				2117	utext_close(&destText);
				2118	utext_close(&input);
				2119	utext_close(&re);
				2120	}
				2121
				2122	//
				2123	// find
				2124	//
				2125	{
				2126	int32_t flags=0;
				2127	UParseError pe;
				2128	UErrorCode status=U_ZERO_ERROR;
				2129	UText re=UTEXT_INITIALIZER;
				2130	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
				2131	utext_openUTF8(&re, str_abc, -1, &status);
				2132
				2133	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
				2134	REGEX_CHECK_STATUS;
				2135	UText input = UTEXT_INITIALIZER;
				2136	const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
				2137	utext_openUTF8(&input, str_abcabcabc, -1, &status);
				2138	// 012345678901234567
				2139
				2140	RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
				2141	REGEX_CHECK_STATUS;
				2142	REGEX_ASSERT(matcher->find());
				2143	REGEX_ASSERT(matcher->start(status) == 1);
				2144	REGEX_ASSERT(matcher->find());
				2145	REGEX_ASSERT(matcher->start(status) == 6);
				2146	REGEX_ASSERT(matcher->find());
				2147	REGEX_ASSERT(matcher->start(status) == 12);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2148	REGEX_ASSERT(matcher->find() == false);
				2149	REGEX_ASSERT(matcher->find() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2150
				2151	matcher->reset();
				2152	REGEX_ASSERT(matcher->find());
				2153	REGEX_ASSERT(matcher->start(status) == 1);
				2154
				2155	REGEX_ASSERT(matcher->find(0, status));
				2156	REGEX_ASSERT(matcher->start(status) == 1);
				2157	REGEX_ASSERT(matcher->find(1, status));
				2158	REGEX_ASSERT(matcher->start(status) == 1);
				2159	REGEX_ASSERT(matcher->find(2, status));
				2160	REGEX_ASSERT(matcher->start(status) == 6);
				2161	REGEX_ASSERT(matcher->find(12, status));
				2162	REGEX_ASSERT(matcher->start(status) == 12);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2163	REGEX_ASSERT(matcher->find(13, status) == false);
				2164	REGEX_ASSERT(matcher->find(16, status) == false);
				2165	REGEX_ASSERT(matcher->find(17, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2166	REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
				2167
				2168	status = U_ZERO_ERROR;
				2169	REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
				2170	status = U_ZERO_ERROR;
				2171	REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
				2172
				2173	REGEX_ASSERT(matcher->groupCount() == 0);
				2174
				2175	delete matcher;
				2176	delete pat;
				2177
				2178	utext_close(&input);
				2179	utext_close(&re);
				2180	}
				2181
				2182
				2183	//
				2184	// find, with \G in pattern (true if at the end of a previous match).
				2185	//
				2186	{
				2187	int32_t flags=0;
				2188	UParseError pe;
				2189	UErrorCode status=U_ZERO_ERROR;
				2190	UText re=UTEXT_INITIALIZER;
				2191	const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
				2192	utext_openUTF8(&re, str_Gabcabc, -1, &status);
				2193
				2194	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
				2195
				2196	REGEX_CHECK_STATUS;
				2197	UText input = UTEXT_INITIALIZER;
				2198	const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
				2199	utext_openUTF8(&input, str_abcabcabc, -1, &status);
				2200	// 012345678901234567
				2201
				2202	RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
				2203	REGEX_CHECK_STATUS;
				2204	REGEX_ASSERT(matcher->find());
				2205	REGEX_ASSERT(matcher->start(status) == 0);
				2206	REGEX_ASSERT(matcher->start(1, status) == -1);
				2207	REGEX_ASSERT(matcher->start(2, status) == 1);
				2208
				2209	REGEX_ASSERT(matcher->find());
				2210	REGEX_ASSERT(matcher->start(status) == 4);
				2211	REGEX_ASSERT(matcher->start(1, status) == 4);
				2212	REGEX_ASSERT(matcher->start(2, status) == -1);
				2213	REGEX_CHECK_STATUS;
				2214
				2215	delete matcher;
				2216	delete pat;
				2217
				2218	utext_close(&input);
				2219	utext_close(&re);
				2220	}
				2221
				2222	//
				2223	// find with zero length matches, match position should bump ahead
				2224	// to prevent loops.
				2225	//
				2226	{
				2227	int32_t i;
				2228	UErrorCode status=U_ZERO_ERROR;
				2229	RegexMatcher m("(?= ?)", 0, status); // This pattern will produce zero-length matches anywhere,
				2230	// using an always-true look-ahead.
				2231	REGEX_CHECK_STATUS;
				2232	UText s = UTEXT_INITIALIZER;
				2233	utext_openUTF8(&s, " ", -1, &status);
				2234	m.reset(&s);
				2235	for (i=0; ; i++) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2236	if (m.find() == false) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2237	break;
				2238	}
				2239	REGEX_ASSERT(m.start(status) == i);
				2240	REGEX_ASSERT(m.end(status) == i);
				2241	}
				2242	REGEX_ASSERT(i==5);
				2243
				2244	// Check that the bump goes over characters outside the BMP OK
				2245	// "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
				2246	unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
				2247	utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
				2248	m.reset(&s);
				2249	for (i=0; ; i+=4) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2250	if (m.find() == false) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2251	break;
				2252	}
				2253	REGEX_ASSERT(m.start(status) == i);
				2254	REGEX_ASSERT(m.end(status) == i);
				2255	}
				2256	REGEX_ASSERT(i==20);
				2257
				2258	utext_close(&s);
				2259	}
				2260	{
				2261	// find() loop breaking test.
				2262	// with pattern of /.?/, should see a series of one char matches, then a single
				2263	// match of zero length at the end of the input string.
				2264	int32_t i;
				2265	UErrorCode status=U_ZERO_ERROR;
				2266	RegexMatcher m(".?", 0, status);
				2267	REGEX_CHECK_STATUS;
				2268	UText s = UTEXT_INITIALIZER;
				2269	utext_openUTF8(&s, " ", -1, &status);
				2270	m.reset(&s);
				2271	for (i=0; ; i++) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2272	if (m.find() == false) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2273	break;
				2274	}
				2275	REGEX_ASSERT(m.start(status) == i);
				2276	REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
				2277	}
				2278	REGEX_ASSERT(i==5);
				2279
				2280	utext_close(&s);
				2281	}
				2282
				2283
				2284	//
				2285	// Matchers with no input string behave as if they had an empty input string.
				2286	//
				2287
				2288	{
				2289	UErrorCode status = U_ZERO_ERROR;
				2290	RegexMatcher m(".?", 0, status);
				2291	REGEX_CHECK_STATUS;
				2292	REGEX_ASSERT(m.find());
				2293	REGEX_ASSERT(m.start(status) == 0);
				2294	REGEX_ASSERT(m.input() == "");
				2295	}
				2296	{
				2297	UErrorCode status = U_ZERO_ERROR;
				2298	RegexPattern *p = RegexPattern::compile(".", 0, status);
				2299	RegexMatcher *m = p->matcher(status);
				2300	REGEX_CHECK_STATUS;
				2301
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2302	REGEX_ASSERT(m->find() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2303	REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
				2304	delete m;
				2305	delete p;
				2306	}
				2307
				2308	//
				2309	// Regions
				2310	//
				2311	{
				2312	UErrorCode status = U_ZERO_ERROR;
				2313	UText testPattern = UTEXT_INITIALIZER;
				2314	UText testText = UTEXT_INITIALIZER;
				2315	regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
				2316	REGEX_VERBOSE_TEXT(&testPattern);
				2317	regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
				2318	REGEX_VERBOSE_TEXT(&testText);
				2319
				2320	RegexMatcher m(&testPattern, &testText, 0, status);
				2321	REGEX_CHECK_STATUS;
				2322	REGEX_ASSERT(m.regionStart() == 0);
				2323	REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2324	REGEX_ASSERT(m.hasTransparentBounds() == false);
				2325	REGEX_ASSERT(m.hasAnchoringBounds() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2326
				2327	m.region(2,4, status);
				2328	REGEX_CHECK_STATUS;
				2329	REGEX_ASSERT(m.matches(status));
				2330	REGEX_ASSERT(m.start(status)==2);
				2331	REGEX_ASSERT(m.end(status)==4);
				2332	REGEX_CHECK_STATUS;
				2333
				2334	m.reset();
				2335	REGEX_ASSERT(m.regionStart() == 0);
				2336	REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
				2337
				2338	regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
				2339	REGEX_VERBOSE_TEXT(&testText);
				2340	m.reset(&testText);
				2341	REGEX_ASSERT(m.regionStart() == 0);
				2342	REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
				2343
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2344	REGEX_ASSERT(m.hasAnchoringBounds() == true);
				2345	REGEX_ASSERT(&m == &m.useAnchoringBounds(false));
				2346	REGEX_ASSERT(m.hasAnchoringBounds() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2347	REGEX_ASSERT(&m == &m.reset());
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2348	REGEX_ASSERT(m.hasAnchoringBounds() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2349
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2350	REGEX_ASSERT(&m == &m.useAnchoringBounds(true));
				2351	REGEX_ASSERT(m.hasAnchoringBounds() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2352	REGEX_ASSERT(&m == &m.reset());
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2353	REGEX_ASSERT(m.hasAnchoringBounds() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2354
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2355	REGEX_ASSERT(m.hasTransparentBounds() == false);
				2356	REGEX_ASSERT(&m == &m.useTransparentBounds(true));
				2357	REGEX_ASSERT(m.hasTransparentBounds() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2358	REGEX_ASSERT(&m == &m.reset());
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2359	REGEX_ASSERT(m.hasTransparentBounds() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2360
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2361	REGEX_ASSERT(&m == &m.useTransparentBounds(false));
				2362	REGEX_ASSERT(m.hasTransparentBounds() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2363	REGEX_ASSERT(&m == &m.reset());
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2364	REGEX_ASSERT(m.hasTransparentBounds() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2365
				2366	utext_close(&testText);
				2367	utext_close(&testPattern);
				2368	}
				2369
				2370	//
				2371	// hitEnd() and requireEnd()
				2372	//
				2373	{
				2374	UErrorCode status = U_ZERO_ERROR;
				2375	UText testPattern = UTEXT_INITIALIZER;
				2376	UText testText = UTEXT_INITIALIZER;
				2377	const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
				2378	const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
				2379	utext_openUTF8(&testPattern, str_, -1, &status);
				2380	utext_openUTF8(&testText, str_aabb, -1, &status);
				2381
				2382	RegexMatcher m1(&testPattern, &testText, 0, status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2383	REGEX_ASSERT(m1.lookingAt(status) == true);
				2384	REGEX_ASSERT(m1.hitEnd() == true);
				2385	REGEX_ASSERT(m1.requireEnd() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2386	REGEX_CHECK_STATUS;
				2387
				2388	status = U_ZERO_ERROR;
				2389	const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
				2390	utext_openUTF8(&testPattern, str_a, -1, &status);
				2391	RegexMatcher m2(&testPattern, &testText, 0, status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2392	REGEX_ASSERT(m2.lookingAt(status) == true);
				2393	REGEX_ASSERT(m2.hitEnd() == false);
				2394	REGEX_ASSERT(m2.requireEnd() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2395	REGEX_CHECK_STATUS;
				2396
				2397	status = U_ZERO_ERROR;
				2398	const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
				2399	utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
				2400	RegexMatcher m3(&testPattern, &testText, 0, status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2401	REGEX_ASSERT(m3.lookingAt(status) == true);
				2402	REGEX_ASSERT(m3.hitEnd() == true);
				2403	REGEX_ASSERT(m3.requireEnd() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2404	REGEX_CHECK_STATUS;
				2405
				2406	utext_close(&testText);
				2407	utext_close(&testPattern);
				2408	}
				2409	}
				2410
				2411
				2412	//---------------------------------------------------------------------------
				2413	//
				2414	// API_Replace_UTF8 API test for class RegexMatcher, testing the
				2415	// Replace family of functions.
				2416	//
				2417	//---------------------------------------------------------------------------
				2418	void RegexTest::API_Replace_UTF8() {
				2419	//
				2420	// Replace
				2421	//
				2422	int32_t flags=0;
				2423	UParseError pe;
				2424	UErrorCode status=U_ZERO_ERROR;
				2425
				2426	UText re=UTEXT_INITIALIZER;
				2427	regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
				2428	REGEX_VERBOSE_TEXT(&re);
				2429	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
				2430	REGEX_CHECK_STATUS;
				2431
				2432	char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
				2433	// 012345678901234567
				2434	UText dataText = UTEXT_INITIALIZER;
				2435	utext_openUTF8(&dataText, data, -1, &status);
				2436	REGEX_CHECK_STATUS;
				2437	REGEX_VERBOSE_TEXT(&dataText);
				2438	RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
				2439
				2440	//
				2441	// Plain vanilla matches.
				2442	//
				2443	UnicodeString dest;
				2444	UText destText = UTEXT_INITIALIZER;
				2445	utext_openUnicodeString(&destText, &dest, &status);
				2446	UText *result;
				2447
				2448	UText replText = UTEXT_INITIALIZER;
				2449
				2450	const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
				2451	utext_openUTF8(&replText, str_yz, -1, &status);
				2452	REGEX_VERBOSE_TEXT(&replText);
				2453	result = matcher->replaceFirst(&replText, NULL, status);
				2454	REGEX_CHECK_STATUS;
				2455	const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
				2456	REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
				2457	utext_close(result);
				2458	result = matcher->replaceFirst(&replText, &destText, status);
				2459	REGEX_CHECK_STATUS;
				2460	REGEX_ASSERT(result == &destText);
				2461	REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
				2462
				2463	result = matcher->replaceAll(&replText, NULL, status);
				2464	REGEX_CHECK_STATUS;
				2465	const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
				2466	REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
				2467	utext_close(result);
				2468
				2469	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
				2470	result = matcher->replaceAll(&replText, &destText, status);
				2471	REGEX_CHECK_STATUS;
				2472	REGEX_ASSERT(result == &destText);
				2473	REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
				2474
				2475	//
				2476	// Plain vanilla non-matches.
				2477	//
				2478	const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
				2479	utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
				2480	matcher->reset(&dataText);
				2481
				2482	result = matcher->replaceFirst(&replText, NULL, status);
				2483	REGEX_CHECK_STATUS;
				2484	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
				2485	utext_close(result);
				2486	result = matcher->replaceFirst(&replText, &destText, status);
				2487	REGEX_CHECK_STATUS;
				2488	REGEX_ASSERT(result == &destText);
				2489	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
				2490
				2491	result = matcher->replaceAll(&replText, NULL, status);
				2492	REGEX_CHECK_STATUS;
				2493	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
				2494	utext_close(result);
				2495	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
				2496	result = matcher->replaceAll(&replText, &destText, status);
				2497	REGEX_CHECK_STATUS;
				2498	REGEX_ASSERT(result == &destText);
				2499	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
				2500
				2501	//
				2502	// Empty source string
				2503	//
				2504	utext_openUTF8(&dataText, NULL, 0, &status);
				2505	matcher->reset(&dataText);
				2506
				2507	result = matcher->replaceFirst(&replText, NULL, status);
				2508	REGEX_CHECK_STATUS;
				2509	REGEX_ASSERT_UTEXT_UTF8("", result);
				2510	utext_close(result);
				2511	result = matcher->replaceFirst(&replText, &destText, status);
				2512	REGEX_CHECK_STATUS;
				2513	REGEX_ASSERT(result == &destText);
				2514	REGEX_ASSERT_UTEXT_UTF8("", result);
				2515
				2516	result = matcher->replaceAll(&replText, NULL, status);
				2517	REGEX_CHECK_STATUS;
				2518	REGEX_ASSERT_UTEXT_UTF8("", result);
				2519	utext_close(result);
				2520	result = matcher->replaceAll(&replText, &destText, status);
				2521	REGEX_CHECK_STATUS;
				2522	REGEX_ASSERT(result == &destText);
				2523	REGEX_ASSERT_UTEXT_UTF8("", result);
				2524
				2525	//
				2526	// Empty substitution string
				2527	//
				2528	utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
				2529	matcher->reset(&dataText);
				2530
				2531	utext_openUTF8(&replText, NULL, 0, &status);
				2532	result = matcher->replaceFirst(&replText, NULL, status);
				2533	REGEX_CHECK_STATUS;
				2534	const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
				2535	REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
				2536	utext_close(result);
				2537	result = matcher->replaceFirst(&replText, &destText, status);
				2538	REGEX_CHECK_STATUS;
				2539	REGEX_ASSERT(result == &destText);
				2540	REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
				2541
				2542	result = matcher->replaceAll(&replText, NULL, status);
				2543	REGEX_CHECK_STATUS;
				2544	const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
				2545	REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
				2546	utext_close(result);
				2547	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
				2548	result = matcher->replaceAll(&replText, &destText, status);
				2549	REGEX_CHECK_STATUS;
				2550	REGEX_ASSERT(result == &destText);
				2551	REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
				2552
				2553	//
				2554	// match whole string
				2555	//
				2556	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
				2557	utext_openUTF8(&dataText, str_abc, -1, &status);
				2558	matcher->reset(&dataText);
				2559
				2560	const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
				2561	utext_openUTF8(&replText, str_xyz, -1, &status);
				2562	result = matcher->replaceFirst(&replText, NULL, status);
				2563	REGEX_CHECK_STATUS;
				2564	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
				2565	utext_close(result);
				2566	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
				2567	result = matcher->replaceFirst(&replText, &destText, status);
				2568	REGEX_CHECK_STATUS;
				2569	REGEX_ASSERT(result == &destText);
				2570	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
				2571
				2572	result = matcher->replaceAll(&replText, NULL, status);
				2573	REGEX_CHECK_STATUS;
				2574	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
				2575	utext_close(result);
				2576	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
				2577	result = matcher->replaceAll(&replText, &destText, status);
				2578	REGEX_CHECK_STATUS;
				2579	REGEX_ASSERT(result == &destText);
				2580	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
				2581
				2582	//
				2583	// Capture Group, simple case
				2584	//
				2585	const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
				2586	utext_openUTF8(&re, str_add, -1, &status);
				2587	RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
				2588	REGEX_CHECK_STATUS;
				2589
				2590	const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
				2591	utext_openUTF8(&dataText, str_abcdefg, -1, &status);
				2592	RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
				2593	REGEX_CHECK_STATUS;
				2594
				2595	const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
				2596	utext_openUTF8(&replText, str_11, -1, &status);
				2597	result = matcher2->replaceFirst(&replText, NULL, status);
				2598	REGEX_CHECK_STATUS;
				2599	const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
				2600	REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
				2601	utext_close(result);
				2602	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
				2603	result = matcher2->replaceFirst(&replText, &destText, status);
				2604	REGEX_CHECK_STATUS;
				2605	REGEX_ASSERT(result == &destText);
				2606	REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
				2607
				2608	const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
				2609	utext_openUTF8(&replText, str_v, -1, &status);
				2610	REGEX_VERBOSE_TEXT(&replText);
				2611	result = matcher2->replaceFirst(&replText, NULL, status);
				2612	REGEX_CHECK_STATUS;
				2613	const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
				2614	REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
				2615	utext_close(result);
				2616	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
				2617	result = matcher2->replaceFirst(&replText, &destText, status);
				2618	REGEX_CHECK_STATUS;
				2619	REGEX_ASSERT(result == &destText);
				2620	REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
				2621
				2622	const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
				2623	0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
				2624	0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
				2625	utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
				2626	result = matcher2->replaceFirst(&replText, NULL, status);
				2627	REGEX_CHECK_STATUS;
				2628	const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
				2629	REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
				2630	utext_close(result);
				2631	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
				2632	result = matcher2->replaceFirst(&replText, &destText, status);
				2633	REGEX_CHECK_STATUS;
				2634	REGEX_ASSERT(result == &destText);
				2635	REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
				2636
				2637	unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
				2638	//unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
				2639	// 012345678901234567890123456
				2640	supplDigitChars[22] = 0xF0;
				2641	supplDigitChars[23] = 0x9D;
				2642	supplDigitChars[24] = 0x9F;
				2643	supplDigitChars[25] = 0x8F;
				2644	utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
				2645
				2646	result = matcher2->replaceFirst(&replText, NULL, status);
				2647	REGEX_CHECK_STATUS;
				2648	const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
				2649	REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
				2650	utext_close(result);
				2651	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
				2652	result = matcher2->replaceFirst(&replText, &destText, status);
				2653	REGEX_CHECK_STATUS;
				2654	REGEX_ASSERT(result == &destText);
				2655	REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
				2656	const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
				2657	utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
				2658	REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
				2659	// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
				2660	utext_close(result);
				2661	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
				2662	REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
				2663	REGEX_ASSERT(result == &destText);
				2664	// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
				2665
				2666	//
				2667	// Replacement String with \u hex escapes
				2668	//
				2669	{
				2670	const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
				2671	const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
				2672	utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
				2673	utext_openUTF8(&replText, str_u0043, -1, &status);
				2674	matcher->reset(&dataText);
				2675
				2676	result = matcher->replaceAll(&replText, NULL, status);
				2677	REGEX_CHECK_STATUS;
				2678	const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
				2679	REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
				2680	utext_close(result);
				2681	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
				2682	result = matcher->replaceAll(&replText, &destText, status);
				2683	REGEX_CHECK_STATUS;
				2684	REGEX_ASSERT(result == &destText);
				2685	REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
				2686	}
				2687	{
				2688	const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
				2689	utext_openUTF8(&dataText, str_abc, -1, &status);
				2690	const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
				2691	utext_openUTF8(&replText, str_U00010000, -1, &status);
				2692	matcher->reset(&dataText);
				2693
				2694	unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
				2695	// 0123456789
				2696	expected[2] = 0xF0;
				2697	expected[3] = 0x90;
				2698	expected[4] = 0x80;
				2699	expected[5] = 0x80;
				2700
				2701	result = matcher->replaceAll(&replText, NULL, status);
				2702	REGEX_CHECK_STATUS;
				2703	REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
				2704	utext_close(result);
				2705	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
				2706	result = matcher->replaceAll(&replText, &destText, status);
				2707	REGEX_CHECK_STATUS;
				2708	REGEX_ASSERT(result == &destText);
				2709	REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
				2710	}
				2711	// TODO: need more through testing of capture substitutions.
				2712
				2713	// Bug 4057
				2714	//
				2715	{
				2716	status = U_ZERO_ERROR;
				2717	const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.?)ee /
				2718	const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
				2719	const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
				2720	utext_openUTF8(&re, str_ssee, -1, &status);
				2721	utext_openUTF8(&dataText, str_blah, -1, &status);
				2722	utext_openUTF8(&replText, str_ooh, -1, &status);
				2723
				2724	RegexMatcher m(&re, 0, status);
				2725	REGEX_CHECK_STATUS;
				2726
				2727	UnicodeString result;
				2728	UText resultText = UTEXT_INITIALIZER;
				2729	utext_openUnicodeString(&resultText, &result, &status);
				2730
				2731	// Multiple finds do NOT bump up the previous appendReplacement position.
				2732	m.reset(&dataText);
				2733	m.find();
				2734	m.find();
				2735	m.appendReplacement(&resultText, &replText, status);
				2736	REGEX_CHECK_STATUS;
				2737	const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
				2738	REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
				2739
				2740	// After a reset into the interior of a string, appendReplacement still starts at beginning.
				2741	status = U_ZERO_ERROR;
				2742	result.truncate(0);
				2743	utext_openUnicodeString(&resultText, &result, &status);
				2744	m.reset(10, status);
				2745	m.find();
				2746	m.find();
				2747	m.appendReplacement(&resultText, &replText, status);
				2748	REGEX_CHECK_STATUS;
				2749	const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
				2750	REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
				2751
				2752	// find() at interior of string, appendReplacement still starts at beginning.
				2753	status = U_ZERO_ERROR;
				2754	result.truncate(0);
				2755	utext_openUnicodeString(&resultText, &result, &status);
				2756	m.reset();
				2757	m.find(10, status);
				2758	m.find();
				2759	m.appendReplacement(&resultText, &replText, status);
				2760	REGEX_CHECK_STATUS;
				2761	const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
				2762	REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
				2763
				2764	m.appendTail(&resultText, status);
				2765	const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
				2766	REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
				2767
				2768	utext_close(&resultText);
				2769	}
				2770
				2771	delete matcher2;
				2772	delete pat2;
				2773	delete matcher;
				2774	delete pat;
				2775
				2776	utext_close(&dataText);
				2777	utext_close(&replText);
				2778	utext_close(&destText);
				2779	utext_close(&re);
				2780	}
				2781
				2782
				2783	//---------------------------------------------------------------------------
				2784	//
				2785	// API_Pattern_UTF8 Test that the API for class RegexPattern is
				2786	// present and nominally working.
				2787	//
				2788	//---------------------------------------------------------------------------
				2789	void RegexTest::API_Pattern_UTF8() {
				2790	RegexPattern pata; // Test default constructor to not crash.
				2791	RegexPattern patb;
				2792
				2793	REGEX_ASSERT(pata == patb);
				2794	REGEX_ASSERT(pata == pata);
				2795
				2796	UText re1 = UTEXT_INITIALIZER;
				2797	UText re2 = UTEXT_INITIALIZER;
				2798	UErrorCode status = U_ZERO_ERROR;
				2799	UParseError pe;
				2800
				2801	const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
				2802	const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
				2803	utext_openUTF8(&re1, str_abcalmz, -1, &status);
				2804	utext_openUTF8(&re2, str_def, -1, &status);
				2805
				2806	RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
				2807	RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
				2808	REGEX_CHECK_STATUS;
				2809	REGEX_ASSERT(pat1 == pat1);
				2810	REGEX_ASSERT(*pat1 != pata);
				2811
				2812	// Assign
				2813	patb = *pat1;
				2814	REGEX_ASSERT(patb == *pat1);
				2815
				2816	// Copy Construct
				2817	RegexPattern patc(*pat1);
				2818	REGEX_ASSERT(patc == *pat1);
				2819	REGEX_ASSERT(patb == patc);
				2820	REGEX_ASSERT(pat1 != pat2);
				2821	patb = *pat2;
				2822	REGEX_ASSERT(patb != patc);
				2823	REGEX_ASSERT(patb == *pat2);
				2824
				2825	// Compile with no flags.
				2826	RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
				2827	REGEX_ASSERT(pat1a == pat1);
				2828
				2829	REGEX_ASSERT(pat1a->flags() == 0);
				2830
				2831	// Compile with different flags should be not equal
				2832	RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
				2833	REGEX_CHECK_STATUS;
				2834
				2835	REGEX_ASSERT(pat1b != pat1a);
				2836	REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
				2837	REGEX_ASSERT(pat1a->flags() == 0);
				2838	delete pat1b;
				2839
				2840	// clone
				2841	RegexPattern *pat1c = pat1->clone();
				2842	REGEX_ASSERT(pat1c == pat1);
				2843	REGEX_ASSERT(pat1c != pat2);
				2844
				2845	delete pat1c;
				2846	delete pat1a;
				2847	delete pat1;
				2848	delete pat2;
				2849
				2850	utext_close(&re1);
				2851	utext_close(&re2);
				2852
				2853
				2854	//
				2855	// Verify that a matcher created from a cloned pattern works.
				2856	// (Jitterbug 3423)
				2857	//
				2858	{
				2859	UErrorCode status = U_ZERO_ERROR;
				2860	UText pattern = UTEXT_INITIALIZER;
				2861	const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
				2862	utext_openUTF8(&pattern, str_pL, -1, &status);
				2863
				2864	RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
				2865	RegexPattern *pClone = pSource->clone();
				2866	delete pSource;
				2867	RegexMatcher *mFromClone = pClone->matcher(status);
				2868	REGEX_CHECK_STATUS;
				2869
				2870	UText input = UTEXT_INITIALIZER;
				2871	const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
				2872	utext_openUTF8(&input, str_HelloWorld, -1, &status);
				2873	mFromClone->reset(&input);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2874	REGEX_ASSERT(mFromClone->find() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2875	REGEX_ASSERT(mFromClone->group(status) == "Hello");
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2876	REGEX_ASSERT(mFromClone->find() == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2877	REGEX_ASSERT(mFromClone->group(status) == "World");
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2878	REGEX_ASSERT(mFromClone->find() == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2879	delete mFromClone;
				2880	delete pClone;
				2881
				2882	utext_close(&input);
				2883	utext_close(&pattern);
				2884	}
				2885
				2886	//
				2887	// matches convenience API
				2888	//
				2889	{
				2890	UErrorCode status = U_ZERO_ERROR;
				2891	UText pattern = UTEXT_INITIALIZER;
				2892	UText input = UTEXT_INITIALIZER;
				2893
				2894	const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
				2895	utext_openUTF8(&input, str_randominput, -1, &status);
				2896
				2897	const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
				2898	utext_openUTF8(&pattern, str_dotstar, -1, &status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2899	REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2900	REGEX_CHECK_STATUS;
				2901
				2902	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
				2903	utext_openUTF8(&pattern, str_abc, -1, &status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2904	REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2905	REGEX_CHECK_STATUS;
				2906
				2907	const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
				2908	utext_openUTF8(&pattern, str_nput, -1, &status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2909	REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2910	REGEX_CHECK_STATUS;
				2911
				2912	utext_openUTF8(&pattern, str_randominput, -1, &status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2913	REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2914	REGEX_CHECK_STATUS;
				2915
				2916	const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
				2917	utext_openUTF8(&pattern, str_u, -1, &status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2918	REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2919	REGEX_CHECK_STATUS;
				2920
				2921	utext_openUTF8(&input, str_abc, -1, &status);
				2922	utext_openUTF8(&pattern, str_abc, -1, &status);
				2923	status = U_INDEX_OUTOFBOUNDS_ERROR;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	2924	REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2925	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				2926
				2927	utext_close(&input);
				2928	utext_close(&pattern);
				2929	}
				2930
				2931
				2932	//
				2933	// Split()
				2934	//
				2935	status = U_ZERO_ERROR;
				2936	const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
				2937	utext_openUTF8(&re1, str_spaceplus, -1, &status);
				2938	pat1 = RegexPattern::compile(&re1, pe, status);
				2939	REGEX_CHECK_STATUS;
				2940	UnicodeString fields[10];
				2941
				2942	int32_t n;
				2943	n = pat1->split("Now is the time", fields, 10, status);
				2944	REGEX_CHECK_STATUS;
				2945	REGEX_ASSERT(n==4);
				2946	REGEX_ASSERT(fields[0]=="Now");
				2947	REGEX_ASSERT(fields[1]=="is");
				2948	REGEX_ASSERT(fields[2]=="the");
				2949	REGEX_ASSERT(fields[3]=="time");
				2950	REGEX_ASSERT(fields[4]=="");
				2951
				2952	n = pat1->split("Now is the time", fields, 2, status);
				2953	REGEX_CHECK_STATUS;
				2954	REGEX_ASSERT(n==2);
				2955	REGEX_ASSERT(fields[0]=="Now");
				2956	REGEX_ASSERT(fields[1]=="is the time");
				2957	REGEX_ASSERT(fields[2]=="the"); // left over from previous test
				2958
				2959	fields[1] = "*";
				2960	status = U_ZERO_ERROR;
				2961	n = pat1->split("Now is the time", fields, 1, status);
				2962	REGEX_CHECK_STATUS;
				2963	REGEX_ASSERT(n==1);
				2964	REGEX_ASSERT(fields[0]=="Now is the time");
				2965	REGEX_ASSERT(fields[1]=="*");
				2966	status = U_ZERO_ERROR;
				2967
				2968	n = pat1->split(" Now is the time ", fields, 10, status);
				2969	REGEX_CHECK_STATUS;
				2970	REGEX_ASSERT(n==6);
				2971	REGEX_ASSERT(fields[0]=="");
				2972	REGEX_ASSERT(fields[1]=="Now");
				2973	REGEX_ASSERT(fields[2]=="is");
				2974	REGEX_ASSERT(fields[3]=="the");
				2975	REGEX_ASSERT(fields[4]=="time");
				2976	REGEX_ASSERT(fields[5]=="");
				2977	REGEX_ASSERT(fields[6]=="");
				2978
				2979	fields[2] = "*";
				2980	n = pat1->split(" ", fields, 10, status);
				2981	REGEX_CHECK_STATUS;
				2982	REGEX_ASSERT(n==2);
				2983	REGEX_ASSERT(fields[0]=="");
				2984	REGEX_ASSERT(fields[1]=="");
				2985	REGEX_ASSERT(fields[2]=="*");
				2986
				2987	fields[0] = "foo";
				2988	n = pat1->split("", fields, 10, status);
				2989	REGEX_CHECK_STATUS;
				2990	REGEX_ASSERT(n==0);
				2991	REGEX_ASSERT(fields[0]=="foo");
				2992
				2993	delete pat1;
				2994
				2995	// split, with a pattern with (capture)
				2996	regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
				2997	pat1 = RegexPattern::compile(&re1, pe, status);
				2998	REGEX_CHECK_STATUS;
				2999
				3000	status = U_ZERO_ERROR;
				3001	fields[6] = fields[7] = "*";
				3002	n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
				3003	REGEX_CHECK_STATUS;
				3004	REGEX_ASSERT(n==7);
				3005	REGEX_ASSERT(fields[0]=="");
				3006	REGEX_ASSERT(fields[1]=="a");
				3007	REGEX_ASSERT(fields[2]=="Now is ");
				3008	REGEX_ASSERT(fields[3]=="b");
				3009	REGEX_ASSERT(fields[4]=="the time");
				3010	REGEX_ASSERT(fields[5]=="c");
				3011	REGEX_ASSERT(fields[6]=="");
				3012	REGEX_ASSERT(fields[7]=="*");
				3013	REGEX_ASSERT(status==U_ZERO_ERROR);
				3014
				3015	fields[6] = fields[7] = "*";
				3016	n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
				3017	REGEX_CHECK_STATUS;
				3018	REGEX_ASSERT(n==7);
				3019	REGEX_ASSERT(fields[0]==" ");
				3020	REGEX_ASSERT(fields[1]=="a");
				3021	REGEX_ASSERT(fields[2]=="Now is ");
				3022	REGEX_ASSERT(fields[3]=="b");
				3023	REGEX_ASSERT(fields[4]=="the time");
				3024	REGEX_ASSERT(fields[5]=="c");
				3025	REGEX_ASSERT(fields[6]=="");
				3026	REGEX_ASSERT(fields[7]=="*");
				3027
				3028	status = U_ZERO_ERROR;
				3029	fields[6] = "foo";
				3030	n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
				3031	REGEX_CHECK_STATUS;
				3032	REGEX_ASSERT(n==6);
				3033	REGEX_ASSERT(fields[0]==" ");
				3034	REGEX_ASSERT(fields[1]=="a");
				3035	REGEX_ASSERT(fields[2]=="Now is ");
				3036	REGEX_ASSERT(fields[3]=="b");
				3037	REGEX_ASSERT(fields[4]=="the time");
				3038	REGEX_ASSERT(fields[5]==" ");
				3039	REGEX_ASSERT(fields[6]=="foo");
				3040
				3041	status = U_ZERO_ERROR;
				3042	fields[5] = "foo";
				3043	n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
				3044	REGEX_CHECK_STATUS;
				3045	REGEX_ASSERT(n==5);
				3046	REGEX_ASSERT(fields[0]==" ");
				3047	REGEX_ASSERT(fields[1]=="a");
				3048	REGEX_ASSERT(fields[2]=="Now is ");
				3049	REGEX_ASSERT(fields[3]=="b");
				3050	REGEX_ASSERT(fields[4]=="the time<c>");
				3051	REGEX_ASSERT(fields[5]=="foo");
				3052
				3053	status = U_ZERO_ERROR;
				3054	fields[5] = "foo";
				3055	n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
				3056	REGEX_CHECK_STATUS;
				3057	REGEX_ASSERT(n==5);
				3058	REGEX_ASSERT(fields[0]==" ");
				3059	REGEX_ASSERT(fields[1]=="a");
				3060	REGEX_ASSERT(fields[2]=="Now is ");
				3061	REGEX_ASSERT(fields[3]=="b");
				3062	REGEX_ASSERT(fields[4]=="the time");
				3063	REGEX_ASSERT(fields[5]=="foo");
				3064
				3065	status = U_ZERO_ERROR;
				3066	n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
				3067	REGEX_CHECK_STATUS;
				3068	REGEX_ASSERT(n==4);
				3069	REGEX_ASSERT(fields[0]==" ");
				3070	REGEX_ASSERT(fields[1]=="a");
				3071	REGEX_ASSERT(fields[2]=="Now is ");
				3072	REGEX_ASSERT(fields[3]=="the time<c>");
				3073	status = U_ZERO_ERROR;
				3074	delete pat1;
				3075
				3076	regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
				3077	pat1 = RegexPattern::compile(&re1, pe, status);
				3078	REGEX_CHECK_STATUS;
				3079	n = pat1->split("1-10,20", fields, 10, status);
				3080	REGEX_CHECK_STATUS;
				3081	REGEX_ASSERT(n==5);
				3082	REGEX_ASSERT(fields[0]=="1");
				3083	REGEX_ASSERT(fields[1]=="-");
				3084	REGEX_ASSERT(fields[2]=="10");
				3085	REGEX_ASSERT(fields[3]==",");
				3086	REGEX_ASSERT(fields[4]=="20");
				3087	delete pat1;
				3088
				3089
				3090	//
				3091	// split of a UText based string, with library allocating output UTexts.
				3092	//
				3093	{
				3094	status = U_ZERO_ERROR;
				3095	RegexMatcher matcher(UnicodeString("(:)"), 0, status);
				3096	UnicodeString stringToSplit("first:second:third");
				3097	UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
				3098	REGEX_CHECK_STATUS;
				3099
				3100	UText *splits[10] = {NULL};
				3101	int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
				3102	REGEX_CHECK_STATUS;
				3103	REGEX_ASSERT(numFields == 5);
				3104	REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
				3105	REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
				3106	REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
				3107	REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
				3108	REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
				3109	REGEX_ASSERT(splits[5] == NULL);
				3110
				3111	for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
				3112	if (splits[i]) {
				3113	utext_close(splits[i]);
				3114	splits[i] = NULL;
				3115	}
				3116	}
				3117	utext_close(textToSplit);
				3118	}
				3119
				3120
				3121	//
				3122	// RegexPattern::pattern() and patternText()
				3123	//
				3124	pat1 = new RegexPattern();
				3125	REGEX_ASSERT(pat1->pattern() == "");
				3126	REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
				3127	delete pat1;
				3128	const char helloWorldInvariant = "(Hello, world)";
				3129	regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
				3130	pat1 = RegexPattern::compile(&re1, pe, status);
				3131	REGEX_CHECK_STATUS;
				3132	REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
				3133	REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
				3134	delete pat1;
				3135
				3136	utext_close(&re1);
				3137	}
				3138
				3139
				3140	//---------------------------------------------------------------------------
				3141	//
				3142	// Extended: a more thorough check for features of regex patterns.
				3143	// The test cases are in a separate data file,
				3144	// source/test/testdata/regextst.txt
				3145	// A description of the test data format is included in that file.
				3146	//
				3147	//---------------------------------------------------------------------------
				3148
				3149	const char *
				3150	RegexTest::getPath(char buffer[2048], const char *filename) {
				3151	UErrorCode status=U_ZERO_ERROR;
				3152	const char *testDataDirectory = IntlTest::getSourceTestData(status);
				3153	if (U_FAILURE(status)) {
				3154	errln("ERROR: loadTestData() failed - %s", u_errorName(status));
				3155	return NULL;
				3156	}
				3157
				3158	strcpy(buffer, testDataDirectory);
				3159	strcat(buffer, filename);
				3160	return buffer;
				3161	}
				3162
				3163	void RegexTest::Extended() {
				3164	char tdd[2048];
				3165	const char *srcPath;
				3166	UErrorCode status = U_ZERO_ERROR;
				3167	int32_t lineNum = 0;
				3168
				3169	//
				3170	// Open and read the test data file.
				3171	//
				3172	srcPath=getPath(tdd, "regextst.txt");
				3173	if(srcPath==NULL) {
				3174	return; /* something went wrong, error already output */
				3175	}
				3176
				3177	int32_t len;
				3178	UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
				3179	if (U_FAILURE(status)) {
				3180	return; /* something went wrong, error already output */
				3181	}
				3182
				3183	//
				3184	// Put the test data into a UnicodeString
				3185	//
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3186	UnicodeString testString(false, testData, len);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3187
				3188	RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s([\\'\\\"/])(.?)\\1"), 0, status);
				3189	RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s(#.)?$"), 0, status);
				3190	RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s([ixsmdteDEGLMQvabtyYzZ2-9])([:letter:]*)"), 0, status);
				3191
				3192	RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
				3193	UnicodeString testPattern; // The pattern for test from the test file.
				3194	UnicodeString testFlags; // the flags for a test.
				3195	UnicodeString matchString; // The marked up string to be used as input
				3196
				3197	if (U_FAILURE(status)){
				3198	dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
				3199	delete [] testData;
				3200	return;
				3201	}
				3202
				3203	//
				3204	// Loop over the test data file, once per line.
				3205	//
				3206	while (lineMat.find()) {
				3207	lineNum++;
				3208	if (U_FAILURE(status)) {
				3209	errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
				3210	}
				3211
				3212	status = U_ZERO_ERROR;
				3213	UnicodeString testLine = lineMat.group(1, status);
				3214	if (testLine.length() == 0) {
				3215	continue;
				3216	}
				3217
				3218	//
				3219	// Parse the test line. Skip blank and comment only lines.
				3220	// Separate out the three main fields - pattern, flags, target.
				3221	//
				3222
				3223	commentMat.reset(testLine);
				3224	if (commentMat.lookingAt(status)) {
				3225	// This line is a comment, or blank.
				3226	continue;
				3227	}
				3228
				3229	//
				3230	// Pull out the pattern field, remove it from the test file line.
				3231	//
				3232	quotedStuffMat.reset(testLine);
				3233	if (quotedStuffMat.lookingAt(status)) {
				3234	testPattern = quotedStuffMat.group(2, status);
				3235	testLine.remove(0, quotedStuffMat.end(0, status));
				3236	} else {
				3237	errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
				3238	continue;
				3239	}
				3240
				3241
				3242	//
				3243	// Pull out the flags from the test file line.
				3244	//
				3245	flagsMat.reset(testLine);
				3246	flagsMat.lookingAt(status); // Will always match, possibly an empty string.
				3247	testFlags = flagsMat.group(1, status);
				3248	if (flagsMat.group(2, status).length() > 0) {
				3249	errln("Bad Match flag at line %d. Scanning %c\n",
				3250	lineNum, flagsMat.group(2, status).charAt(0));
				3251	continue;
				3252	}
				3253	testLine.remove(0, flagsMat.end(0, status));
				3254
				3255	//
				3256	// Pull out the match string, as a whole.
				3257	// We'll process the <tags> later.
				3258	//
				3259	quotedStuffMat.reset(testLine);
				3260	if (quotedStuffMat.lookingAt(status)) {
				3261	matchString = quotedStuffMat.group(2, status);
				3262	testLine.remove(0, quotedStuffMat.end(0, status));
				3263	} else {
				3264	errln("Bad match string at test file line %d", lineNum);
				3265	continue;
				3266	}
				3267
				3268	//
				3269	// The only thing left from the input line should be an optional trailing comment.
				3270	//
				3271	commentMat.reset(testLine);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3272	if (commentMat.lookingAt(status) == false) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3273	errln("Line %d: unexpected characters at end of test line.", lineNum);
				3274	continue;
				3275	}
				3276
				3277	//
				3278	// Run the test
				3279	//
				3280	regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
				3281	}
				3282
				3283	delete [] testData;
				3284
				3285	}
				3286
				3287
				3288
				3289	//---------------------------------------------------------------------------
				3290	//
				3291	// regex_find(pattern, flags, inputString, lineNumber)
				3292	//
				3293	// Function to run a single test from the Extended (data driven) tests.
				3294	// See file test/testdata/regextst.txt for a description of the
				3295	// pattern and inputString fields, and the allowed flags.
				3296	// lineNumber is the source line in regextst.txt of the test.
				3297	//
				3298	//---------------------------------------------------------------------------
				3299
				3300
				3301	// Set a value into a UVector at position specified by a decimal number in
				3302	// a UnicodeString. This is a utility function needed by the actual test function,
				3303	// which follows.
				3304	static void set(UVector &vec, int32_t val, UnicodeString index) {
				3305	UErrorCode status=U_ZERO_ERROR;
				3306	int32_t idx = 0;
				3307	for (int32_t i=0; i<index.length(); i++) {
				3308	int32_t d=u_charDigitValue(index.charAt(i));
				3309	if (d<0) {return;}
				3310	idx = idx*10 + d;
				3311	}
				3312	while (vec.size()<idx+1) {vec.addElement(-1, status);}
				3313	vec.setElementAt(val, idx);
				3314	}
				3315
				3316	static void setInt(UVector &vec, int32_t val, int32_t idx) {
				3317	UErrorCode status=U_ZERO_ERROR;
				3318	while (vec.size()<idx+1) {vec.addElement(-1, status);}
				3319	vec.setElementAt(val, idx);
				3320	}
				3321
				3322	static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
				3323	{
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3324	UBool couldFind = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3325	UTEXT_SETNATIVEINDEX(utext, 0);
				3326	int32_t i = 0;
				3327	while (i < unistrOffset) {
				3328	UChar32 c = UTEXT_NEXT32(utext);
				3329	if (c != U_SENTINEL) {
				3330	i += U16_LENGTH(c);
				3331	} else {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3332	couldFind = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3333	break;
				3334	}
				3335	}
				3336	nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
				3337	return couldFind;
				3338	}
				3339
				3340
				3341	void RegexTest::regex_find(const UnicodeString &pattern,
				3342	const UnicodeString &flags,
				3343	const UnicodeString &inputString,
				3344	const char *srcPath,
				3345	int32_t line) {
				3346	UnicodeString unEscapedInput;
				3347	UnicodeString deTaggedInput;
				3348
				3349	int32_t patternUTF8Length, inputUTF8Length;
				3350	char patternChars = NULL, inputChars = NULL;
				3351	UText patternText = UTEXT_INITIALIZER;
				3352	UText inputText = UTEXT_INITIALIZER;
				3353	UConverter *UTF8Converter = NULL;
				3354
				3355	UErrorCode status = U_ZERO_ERROR;
				3356	UParseError pe;
				3357	RegexPattern *parsePat = NULL;
				3358	RegexMatcher *parseMatcher = NULL;
				3359	RegexPattern callerPattern = NULL, UTF8Pattern = NULL;
				3360	RegexMatcher matcher = NULL, UTF8Matcher = NULL;
				3361	UVector groupStarts(status);
				3362	UVector groupEnds(status);
				3363	UVector groupStartsUTF8(status);
				3364	UVector groupEndsUTF8(status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3365	UBool isMatch = false, isUTF8Match = false;
				3366	UBool failed = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3367	int32_t numFinds;
				3368	int32_t i;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3369	UBool useMatchesFunc = false;
				3370	UBool useLookingAtFunc = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3371	int32_t regionStart = -1;
				3372	int32_t regionEnd = -1;
				3373	int32_t regionStartUTF8 = -1;
				3374	int32_t regionEndUTF8 = -1;
				3375
				3376
				3377	//
				3378	// Compile the caller's pattern
				3379	//
				3380	uint32_t bflags = 0;
				3381	if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
				3382	bflags \|= UREGEX_CASE_INSENSITIVE;
				3383	}
				3384	if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
				3385	bflags \|= UREGEX_COMMENTS;
				3386	}
				3387	if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
				3388	bflags \|= UREGEX_DOTALL;
				3389	}
				3390	if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
				3391	bflags \|= UREGEX_MULTILINE;
				3392	}
				3393
				3394	if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
				3395	bflags \|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
				3396	}
				3397	if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
				3398	bflags \|= UREGEX_UNIX_LINES;
				3399	}
				3400	if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
				3401	bflags \|= UREGEX_LITERAL;
				3402	}
				3403
				3404
				3405	callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
				3406	if (status != U_ZERO_ERROR) {
				3407	#if UCONFIG_NO_BREAK_ITERATION==1
				3408	// 'v' test flag means that the test pattern should not compile if ICU was configured
				3409	// to not include break iteration. RBBI is needed for Unicode word boundaries.
				3410	if (flags.indexOf((UChar)0x76) >= 0 /'v'/ && status == U_UNSUPPORTED_ERROR) {
				3411	goto cleanupAndReturn;
				3412	}
				3413	#endif
				3414	if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
				3415	// Expected pattern compilation error.
				3416	if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
				3417	logln("Pattern Compile returns \"%s\"", u_errorName(status));
				3418	}
				3419	goto cleanupAndReturn;
				3420	} else {
				3421	// Unexpected pattern compilation error.
				3422	dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
				3423	goto cleanupAndReturn;
				3424	}
				3425	}
				3426
				3427	UTF8Converter = ucnv_open("UTF8", &status);
				3428	ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
				3429
				3430	patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
				3431	status = U_ZERO_ERROR; // buffer overflow
				3432	patternChars = new char[patternUTF8Length+1];
				3433	pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
				3434	utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
				3435
				3436	if (status == U_ZERO_ERROR) {
				3437	UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
				3438
				3439	if (status != U_ZERO_ERROR) {
				3440	#if UCONFIG_NO_BREAK_ITERATION==1
				3441	// 'v' test flag means that the test pattern should not compile if ICU was configured
				3442	// to not include break iteration. RBBI is needed for Unicode word boundaries.
				3443	if (flags.indexOf((UChar)0x76) >= 0 /'v'/ && status == U_UNSUPPORTED_ERROR) {
				3444	goto cleanupAndReturn;
				3445	}
				3446	#endif
				3447	if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
				3448	// Expected pattern compilation error.
				3449	if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
				3450	logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
				3451	}
				3452	goto cleanupAndReturn;
				3453	} else {
				3454	// Unexpected pattern compilation error.
				3455	errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
				3456	goto cleanupAndReturn;
				3457	}
				3458	}
				3459	}
				3460
				3461	if (UTF8Pattern == NULL) {
				3462	// UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
				3463	logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
				3464	status = U_ZERO_ERROR;
				3465	}
				3466
				3467	if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
				3468	callerPattern->dumpPattern();
				3469	}
				3470
				3471	if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
				3472	errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
				3473	goto cleanupAndReturn;
				3474	}
				3475
				3476
				3477	//
				3478	// Number of times find() should be called on the test string, default to 1
				3479	//
				3480	numFinds = 1;
				3481	for (i=2; i<=9; i++) {
				3482	if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
				3483	if (numFinds != 1) {
				3484	errln("Line %d: more than one digit flag. Scanning %d.", line, i);
				3485	goto cleanupAndReturn;
				3486	}
				3487	numFinds = i;
				3488	}
				3489	}
				3490
				3491	// 'M' flag. Use matches() instead of find()
				3492	if (flags.indexOf((UChar)0x4d) >= 0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3493	useMatchesFunc = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3494	}
				3495	if (flags.indexOf((UChar)0x4c) >= 0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3496	useLookingAtFunc = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3497	}
				3498
				3499	//
				3500	// Find the tags in the input data, remove them, and record the group boundary
				3501	// positions.
				3502	//
				3503	parsePat = RegexPattern::compile("<(/?)(r\|[0-9]+)>", 0, pe, status);
				3504	if (!assertSuccess(WHERE, status) ) {
				3505	goto cleanupAndReturn;
				3506	}
				3507
				3508	unEscapedInput = inputString.unescape();
				3509	parseMatcher = parsePat->matcher(unEscapedInput, status);
				3510	if (!assertSuccess(WHERE, status) ) {
				3511	goto cleanupAndReturn;
				3512	}
				3513	while(parseMatcher->find()) {
				3514	parseMatcher->appendReplacement(deTaggedInput, "", status);
				3515	REGEX_CHECK_STATUS;
				3516	UnicodeString groupNum = parseMatcher->group(2, status);
				3517	if (groupNum == "r") {
				3518	// <r> or </r>, a region specification within the string
				3519	if (parseMatcher->group(1, status) == "/") {
				3520	regionEnd = deTaggedInput.length();
				3521	} else {
				3522	regionStart = deTaggedInput.length();
				3523	}
				3524	} else {
				3525	// <digits> or </digits>, a group match boundary tag.
				3526	if (parseMatcher->group(1, status) == "/") {
				3527	set(groupEnds, deTaggedInput.length(), groupNum);
				3528	} else {
				3529	set(groupStarts, deTaggedInput.length(), groupNum);
				3530	}
				3531	}
				3532	}
				3533	parseMatcher->appendTail(deTaggedInput);
				3534
				3535	if (groupStarts.size() != groupEnds.size()) {
				3536	errln("Error at line %d: mismatched <n> group tags in expected results.", line);
				3537	failed = true;
				3538	goto cleanupAndReturn;
				3539	}
				3540	if ((regionStart>=0 \|\| regionEnd>=0) && (regionStart<0 \|\| regionStart>regionEnd)) {
				3541	errln("mismatched <r> tags");
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3542	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3543	goto cleanupAndReturn;
				3544	}
				3545
				3546	//
				3547	// Configure the matcher according to the flags specified with this test.
				3548	//
				3549	matcher = callerPattern->matcher(deTaggedInput, status);
				3550	REGEX_CHECK_STATUS_L(line);
				3551	if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3552	matcher->setTrace(true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3553	}
				3554
				3555	if (UTF8Pattern != NULL) {
				3556	inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
				3557	status = U_ZERO_ERROR; // buffer overflow
				3558	inputChars = new char[inputUTF8Length+1];
				3559	deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
				3560	utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
				3561
				3562	if (status == U_ZERO_ERROR) {
				3563	UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
				3564	REGEX_CHECK_STATUS_L(line);
				3565	}
				3566
				3567	if (UTF8Matcher == NULL) {
				3568	// UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
				3569	logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
				3570	status = U_ZERO_ERROR;
				3571	}
				3572	}
				3573
				3574	//
				3575	// Generate native indices for UTF8 versions of region and capture group info
				3576	//
				3577	if (UTF8Matcher != NULL) {
				3578	if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3579	UTF8Matcher->setTrace(true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3580	}
				3581	if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
				3582	if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
				3583
				3584	// Fill out the native index UVector info.
				3585	// Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
				3586	for (i=0; i<groupStarts.size(); i++) {
				3587	int32_t start = groupStarts.elementAti(i);
				3588	// -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
				3589	if (start >= 0) {
				3590	int32_t startUTF8;
				3591	if (!utextOffsetToNative(&inputText, start, startUTF8)) {
				3592	errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3593	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3594	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
				3595	}
				3596	setInt(groupStartsUTF8, startUTF8, i);
				3597	}
				3598
				3599	int32_t end = groupEnds.elementAti(i);
				3600	// -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
				3601	if (end >= 0) {
				3602	int32_t endUTF8;
				3603	if (!utextOffsetToNative(&inputText, end, endUTF8)) {
				3604	errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3605	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3606	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
				3607	}
				3608	setInt(groupEndsUTF8, endUTF8, i);
				3609	}
				3610	}
				3611	}
				3612
				3613	if (regionStart>=0) {
				3614	matcher->region(regionStart, regionEnd, status);
				3615	REGEX_CHECK_STATUS_L(line);
				3616	if (UTF8Matcher != NULL) {
				3617	UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
				3618	REGEX_CHECK_STATUS_L(line);
				3619	}
				3620	}
				3621	if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3622	matcher->useAnchoringBounds(false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3623	if (UTF8Matcher != NULL) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3624	UTF8Matcher->useAnchoringBounds(false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3625	}
				3626	}
				3627	if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3628	matcher->useTransparentBounds(true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3629	if (UTF8Matcher != NULL) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3630	UTF8Matcher->useTransparentBounds(true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3631	}
				3632	}
				3633
				3634
				3635
				3636	//
				3637	// Do a find on the de-tagged input using the caller's pattern
				3638	// TODO: error on count>1 and not find().
				3639	// error on both matches() and lookingAt().
				3640	//
				3641	for (i=0; i<numFinds; i++) {
				3642	if (useMatchesFunc) {
				3643	isMatch = matcher->matches(status);
				3644	if (UTF8Matcher != NULL) {
				3645	isUTF8Match = UTF8Matcher->matches(status);
				3646	}
				3647	} else if (useLookingAtFunc) {
				3648	isMatch = matcher->lookingAt(status);
				3649	if (UTF8Matcher != NULL) {
				3650	isUTF8Match = UTF8Matcher->lookingAt(status);
				3651	}
				3652	} else {
				3653	isMatch = matcher->find();
				3654	if (UTF8Matcher != NULL) {
				3655	isUTF8Match = UTF8Matcher->find();
				3656	}
				3657	}
				3658	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3659	matcher->setTrace(false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3660	if (UTF8Matcher) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3661	UTF8Matcher->setTrace(false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3662	}
				3663	if (U_FAILURE(status)) {
				3664	errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
				3665	}
				3666
				3667	//
				3668	// Match up the groups from the find() with the groups from the tags
				3669	//
				3670
				3671	// number of tags should match number of groups from find operation.
				3672	// matcher->groupCount does not include group 0, the entire match, hence the +1.
				3673	// G option in test means that capture group data is not available in the
				3674	// expected results, so the check needs to be suppressed.
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3675	if (isMatch == false && groupStarts.size() != 0) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3676	dataerrln("Error at line %d: Match expected, but none found.", line);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3677	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3678	goto cleanupAndReturn;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3679	} else if (UTF8Matcher != NULL && isUTF8Match == false && groupStarts.size() != 0) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3680	errln("Error at line %d: Match expected, but none found. (UTF8)", line);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3681	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3682	goto cleanupAndReturn;
				3683	}
				3684	if (isMatch && groupStarts.size() == 0) {
				3685	errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3686	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3687	}
				3688	if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
				3689	errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3690	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3691	}
				3692
				3693	if (flags.indexOf((UChar)0x47 /G/) >= 0) {
				3694	// Only check for match / no match. Don't check capture groups.
				3695	goto cleanupAndReturn;
				3696	}
				3697
				3698	REGEX_CHECK_STATUS_L(line);
				3699	for (i=0; i<=matcher->groupCount(); i++) {
				3700	int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
				3701	int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
				3702	if (matcher->start(i, status) != expectedStart) {
				3703	errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
				3704	line, i, expectedStart, matcher->start(i, status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3705	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3706	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
				3707	} else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
				3708	errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
				3709	line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3710	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3711	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
				3712	}
				3713
				3714	int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
				3715	int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
				3716	if (matcher->end(i, status) != expectedEnd) {
				3717	errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
				3718	line, i, expectedEnd, matcher->end(i, status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3719	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3720	// Error on end position; keep going; real error is probably yet to come as group
				3721	// end positions work from end of the input data towards the front.
				3722	} else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
				3723	errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
				3724	line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3725	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3726	// Error on end position; keep going; real error is probably yet to come as group
				3727	// end positions work from end of the input data towards the front.
				3728	}
				3729	}
				3730	if ( matcher->groupCount()+1 < groupStarts.size()) {
				3731	errln("Error at line %d: Expected %d capture groups, found %d.",
				3732	line, groupStarts.size()-1, matcher->groupCount());
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3733	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3734	}
				3735	else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
				3736	errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
				3737	line, groupStarts.size()-1, UTF8Matcher->groupCount());
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3738	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3739	}
				3740
				3741	if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3742	matcher->requireEnd() == true) {
				3743	errln("Error at line %d: requireEnd() returned true. Expected false", line);
				3744	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3745	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3746	UTF8Matcher->requireEnd() == true) {
				3747	errln("Error at line %d: requireEnd() returned true. Expected false (UTF8)", line);
				3748	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3749	}
				3750
				3751	if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3752	matcher->requireEnd() == false) {
				3753	errln("Error at line %d: requireEnd() returned false. Expected true", line);
				3754	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3755	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3756	UTF8Matcher->requireEnd() == false) {
				3757	errln("Error at line %d: requireEnd() returned false. Expected true (UTF8)", line);
				3758	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3759	}
				3760
				3761	if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3762	matcher->hitEnd() == true) {
				3763	errln("Error at line %d: hitEnd() returned true. Expected false", line);
				3764	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3765	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3766	UTF8Matcher->hitEnd() == true) {
				3767	errln("Error at line %d: hitEnd() returned true. Expected false (UTF8)", line);
				3768	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3769	}
				3770
				3771	if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3772	matcher->hitEnd() == false) {
				3773	errln("Error at line %d: hitEnd() returned false. Expected true", line);
				3774	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3775	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3776	UTF8Matcher->hitEnd() == false) {
				3777	errln("Error at line %d: hitEnd() returned false. Expected true (UTF8)", line);
				3778	failed = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3779	}
				3780
				3781
				3782	cleanupAndReturn:
				3783	if (failed) {
				3784	infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
				3785	+flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
				3786	// callerPattern->dump();
				3787	}
				3788	delete parseMatcher;
				3789	delete parsePat;
				3790	delete UTF8Matcher;
				3791	delete UTF8Pattern;
				3792	delete matcher;
				3793	delete callerPattern;
				3794
				3795	utext_close(&inputText);
				3796	delete[] inputChars;
				3797	utext_close(&patternText);
				3798	delete[] patternChars;
				3799	ucnv_close(UTF8Converter);
				3800	}
				3801
				3802
				3803
				3804
				3805	//---------------------------------------------------------------------------
				3806	//
				3807	// Errors Check for error handling in patterns.
				3808	//
				3809	//---------------------------------------------------------------------------
				3810	void RegexTest::Errors() {
				3811	// \escape sequences that aren't implemented yet.
				3812	//REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
				3813
				3814	// Missing close parentheses
				3815	REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
				3816	REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
				3817	REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
				3818
				3819	// Extra close paren
				3820	REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
				3821	REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
				3822	REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
				3823
				3824	// Look-ahead, Look-behind
				3825	// TODO: add tests for unbounded length look-behinds.
				3826	REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
				3827
				3828	// Attempt to use non-default flags
				3829	{
				3830	UParseError pe;
				3831	UErrorCode status = U_ZERO_ERROR;
				3832	int32_t flags = UREGEX_CANON_EQ \|
				3833	UREGEX_COMMENTS \| UREGEX_DOTALL \|
				3834	UREGEX_MULTILINE;
				3835	RegexPattern pat1= RegexPattern::compile(".", flags, pe, status);
				3836	REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
				3837	delete pat1;
				3838	}
				3839
				3840
				3841	// Quantifiers are allowed only after something that can be quantified.
				3842	REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
				3843	REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
				3844	REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
				3845
				3846	// Mal-formed {min,max} quantifiers
				3847	REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
				3848	REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
				3849	REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
				3850	REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
				3851	REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
				3852	REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
				3853	REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
				3854	REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
				3855	REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
				3856
				3857	// Ticket 5389
				3858	REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
				3859
				3860	// Invalid Back Reference \0
				3861	// For ICU 3.8 and earlier
				3862	// For ICU versions newer than 3.8, \0 introduces an octal escape.
				3863	//
				3864	REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
				3865
				3866	}
				3867
				3868	//-------------------------------------------------------------------------------
				3869	//
				3870	// PerlTests - Run Perl's regular expression tests
				3871	// The input file for this test is re_tests, the standard regular
				3872	// expression test data distributed with the Perl source code.
				3873	//
				3874	// Here is Perl's description of the test data file:
				3875	//
				3876	// # The tests are in a separate file 't/op/re_tests'.
				3877	// # Each line in that file is a separate test.
				3878	// # There are five columns, separated by tabs.
				3879	// #
				3880	// # Column 1 contains the pattern, optionally enclosed in C<''>.
				3881	// # Modifiers can be put after the closing C<'>.
				3882	// #
				3883	// # Column 2 contains the string to be matched.
				3884	// #
				3885	// # Column 3 contains the expected result:
				3886	// # y expect a match
				3887	// # n expect no match
				3888	// # c expect an error
				3889	// # B test exposes a known bug in Perl, should be skipped
				3890	// # b test exposes a known bug in Perl, should be skipped if noamp
				3891	// #
				3892	// # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
				3893	// #
				3894	// # Column 4 contains a string, usually C<$&>.
				3895	// #
				3896	// # Column 5 contains the expected result of double-quote
				3897	// # interpolating that string after the match, or start of error message.
				3898	// #
				3899	// # Column 6, if present, contains a reason why the test is skipped.
				3900	// # This is printed with "skipped", for harness to pick up.
				3901	// #
				3902	// # \n in the tests are interpolated, as are variables of the form ${\w+}.
				3903	// #
				3904	// # If you want to add a regular expression test that can't be expressed
				3905	// # in this format, don't add it here: put it in op/pat.t instead.
				3906	//
				3907	// For ICU, if field 3 contains an 'i', the test will be skipped.
				3908	// The test exposes some known incompatibility between ICU and Perl regexps.
				3909	// (The i is in addition to whatever was there before.)
				3910	//
				3911	//-------------------------------------------------------------------------------
				3912	void RegexTest::PerlTests() {
				3913	char tdd[2048];
				3914	const char *srcPath;
				3915	UErrorCode status = U_ZERO_ERROR;
				3916	UParseError pe;
				3917
				3918	//
				3919	// Open and read the test data file.
				3920	//
				3921	srcPath=getPath(tdd, "re_tests.txt");
				3922	if(srcPath==NULL) {
				3923	return; /* something went wrong, error already output */
				3924	}
				3925
				3926	int32_t len;
				3927	UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
				3928	if (U_FAILURE(status)) {
				3929	return; /* something went wrong, error already output */
				3930	}
				3931
				3932	//
				3933	// Put the test data into a UnicodeString
				3934	//
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	3935	UnicodeString testDataString(false, testData, len);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	3936
				3937	//
				3938	// Regex to break the input file into lines, and strip the new lines.
				3939	// One line per match, capture group one is the desired data.
				3940	//
				3941	RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
				3942	if (U_FAILURE(status)) {
				3943	dataerrln("RegexPattern::compile() error");
				3944	return;
				3945	}
				3946	RegexMatcher* lineMat = linePat->matcher(testDataString, status);
				3947
				3948	//
				3949	// Regex to split a test file line into fields.
				3950	// There are six fields, separated by tabs.
				3951	//
				3952	RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
				3953
				3954	//
				3955	// Regex to identify test patterns with flag settings, and to separate them.
				3956	// Test patterns with flags look like 'pattern'i
				3957	// Test patterns without flags are not quoted: pattern
				3958	// Coming out, capture group 2 is the pattern, capture group 3 is the flags.
				3959	//
				3960	RegexPattern flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.)\\1(.*)"), 0, pe, status);
				3961	RegexMatcher* flagMat = flagPat->matcher(status);
				3962
				3963	//
				3964	// The Perl tests reference several perl-isms, which are evaluated/substituted
				3965	// in the test data. Not being perl, this must be done explicitly. Here
				3966	// are string constants and REs for these constructs.
				3967	//
				3968	UnicodeString nulnulSrc("${nulnul}");
				3969	UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
				3970	nulnul = nulnul.unescape();
				3971
				3972	UnicodeString ffffSrc("${ffff}");
				3973	UnicodeString ffff("\\uffff", -1, US_INV);
				3974	ffff = ffff.unescape();
				3975
				3976	// regexp for $-[0], $+[2], etc.
				3977	RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
				3978	RegexMatcher *groupsMat = groupsPat->matcher(status);
				3979
				3980	// regexp for $0, $1, $2, etc.
				3981	RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
				3982	RegexMatcher *cgMat = cgPat->matcher(status);
				3983
				3984
				3985	//
				3986	// Main Loop for the Perl Tests, runs once per line from the
				3987	// test data file.
				3988	//
				3989	int32_t lineNum = 0;
				3990	int32_t skippedUnimplementedCount = 0;
				3991	while (lineMat->find()) {
				3992	lineNum++;
				3993
				3994	//
				3995	// Get a line, break it into its fields, do the Perl
				3996	// variable substitutions.
				3997	//
				3998	UnicodeString line = lineMat->group(1, status);
				3999	UnicodeString fields[7];
				4000	fieldPat->split(line, fields, 7, status);
				4001
				4002	flagMat->reset(fields[0]);
				4003	flagMat->matches(status);
				4004	UnicodeString pattern = flagMat->group(2, status);
				4005	pattern.findAndReplace("${bang}", "!");
				4006	pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
				4007	pattern.findAndReplace(ffffSrc, ffff);
				4008
				4009	//
				4010	// Identify patterns that include match flag settings,
				4011	// split off the flags, remove the extra quotes.
				4012	//
				4013	UnicodeString flagStr = flagMat->group(3, status);
				4014	if (U_FAILURE(status)) {
				4015	errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
				4016	return;
				4017	}
				4018	int32_t flags = 0;
				4019	const UChar UChar_c = 0x63; // Char constants for the flag letters.
				4020	const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
				4021	const UChar UChar_m = 0x6d;
				4022	const UChar UChar_x = 0x78;
				4023	const UChar UChar_y = 0x79;
				4024	if (flagStr.indexOf(UChar_i) != -1) {
				4025	flags \|= UREGEX_CASE_INSENSITIVE;
				4026	}
				4027	if (flagStr.indexOf(UChar_m) != -1) {
				4028	flags \|= UREGEX_MULTILINE;
				4029	}
				4030	if (flagStr.indexOf(UChar_x) != -1) {
				4031	flags \|= UREGEX_COMMENTS;
				4032	}
				4033
				4034	//
				4035	// Compile the test pattern.
				4036	//
				4037	status = U_ZERO_ERROR;
				4038	RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
				4039	if (status == U_REGEX_UNIMPLEMENTED) {
				4040	//
				4041	// Test of a feature that is planned for ICU, but not yet implemented.
				4042	// skip the test.
				4043	skippedUnimplementedCount++;
				4044	delete testPat;
				4045	status = U_ZERO_ERROR;
				4046	continue;
				4047	}
				4048
				4049	if (U_FAILURE(status)) {
				4050	// Some tests are supposed to generate errors.
				4051	// Only report an error for tests that are supposed to succeed.
				4052	if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
				4053	fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
				4054	{
				4055	errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
				4056	}
				4057	status = U_ZERO_ERROR;
				4058	delete testPat;
				4059	continue;
				4060	}
				4061
				4062	if (fields[2].indexOf(UChar_i) >= 0) {
				4063	// ICU should skip this test.
				4064	delete testPat;
				4065	continue;
				4066	}
				4067
				4068	if (fields[2].indexOf(UChar_c) >= 0) {
				4069	// This pattern should have caused a compilation error, but didn't/
				4070	errln("line %d: Expected a pattern compile error, got success.", lineNum);
				4071	delete testPat;
				4072	continue;
				4073	}
				4074
				4075	//
				4076	// replace the Perl variables that appear in some of the
				4077	// match data strings.
				4078	//
				4079	UnicodeString matchString = fields[1];
				4080	matchString.findAndReplace(nulnulSrc, nulnul);
				4081	matchString.findAndReplace(ffffSrc, ffff);
				4082
				4083	// Replace any \n in the match string with an actual new-line char.
				4084	// Don't do full unescape, as this unescapes more than Perl does, which
				4085	// causes other spurious failures in the tests.
				4086	matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
				4087
				4088
				4089
				4090	//
				4091	// Run the test, check for expected match/don't match result.
				4092	//
				4093	RegexMatcher *testMat = testPat->matcher(matchString, status);
				4094	UBool found = testMat->find();
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	4095	UBool expected = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	4096	if (fields[2].indexOf(UChar_y) >=0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	4097	expected = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	4098	}
				4099	if (expected != found) {
				4100	errln("line %d: Expected %smatch, got %smatch",
				4101	lineNum, expected?"":"no ", found?"":"no " );
				4102	delete testMat;
				4103	delete testPat;
				4104	continue;
				4105	}
				4106
				4107	// Don't try to check expected results if there is no match.
				4108	// (Some have stuff in the expected fields)
				4109	if (!found) {
				4110	delete testMat;
				4111	delete testPat;
				4112	continue;
				4113	}
				4114
				4115	//
				4116	// Interpret the Perl expression from the fourth field of the data file,
				4117	// building up an ICU string from the results of the ICU match.
				4118	// The Perl expression will contain references to the results of
				4119	// a regex match, including the matched string, capture group strings,
				4120	// group starting and ending indices, etc.
				4121	//
				4122	UnicodeString resultString;
				4123	UnicodeString perlExpr = fields[3];
				4124	#if SUPPORT_MUTATING_INPUT_STRING
				4125	groupsMat->reset(perlExpr);
				4126	cgMat->reset(perlExpr);
				4127	#endif
				4128
				4129	while (perlExpr.length() > 0) {
				4130	#if !SUPPORT_MUTATING_INPUT_STRING
				4131	// Preferred usage. Reset after any modification to input string.
				4132	groupsMat->reset(perlExpr);
				4133	cgMat->reset(perlExpr);
				4134	#endif
				4135
				4136	if (perlExpr.startsWith("$&")) {
				4137	resultString.append(testMat->group(status));
				4138	perlExpr.remove(0, 2);
				4139	}
				4140
				4141	else if (groupsMat->lookingAt(status)) {
				4142	// $-[0] $+[2] etc.
				4143	UnicodeString digitString = groupsMat->group(2, status);
				4144	int32_t t = 0;
				4145	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
				4146	UnicodeString plusOrMinus = groupsMat->group(1, status);
				4147	int32_t matchPosition;
				4148	if (plusOrMinus.compare("+") == 0) {
				4149	matchPosition = testMat->end(groupNum, status);
				4150	} else {
				4151	matchPosition = testMat->start(groupNum, status);
				4152	}
				4153	if (matchPosition != -1) {
				4154	ICU_Utility::appendNumber(resultString, matchPosition);
				4155	}
				4156	perlExpr.remove(0, groupsMat->end(status));
				4157	}
				4158
				4159	else if (cgMat->lookingAt(status)) {
				4160	// $1, $2, $3, etc.
				4161	UnicodeString digitString = cgMat->group(1, status);
				4162	int32_t t = 0;
				4163	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
				4164	if (U_SUCCESS(status)) {
				4165	resultString.append(testMat->group(groupNum, status));
				4166	status = U_ZERO_ERROR;
				4167	}
				4168	perlExpr.remove(0, cgMat->end(status));
				4169	}
				4170
				4171	else if (perlExpr.startsWith("@-")) {
				4172	int32_t i;
				4173	for (i=0; i<=testMat->groupCount(); i++) {
				4174	if (i>0) {
				4175	resultString.append(" ");
				4176	}
				4177	ICU_Utility::appendNumber(resultString, testMat->start(i, status));
				4178	}
				4179	perlExpr.remove(0, 2);
				4180	}
				4181
				4182	else if (perlExpr.startsWith("@+")) {
				4183	int32_t i;
				4184	for (i=0; i<=testMat->groupCount(); i++) {
				4185	if (i>0) {
				4186	resultString.append(" ");
				4187	}
				4188	ICU_Utility::appendNumber(resultString, testMat->end(i, status));
				4189	}
				4190	perlExpr.remove(0, 2);
				4191	}
				4192
				4193	else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
				4194	// or as an escaped sequence (e.g. \n)
				4195	if (perlExpr.length() > 1) {
				4196	perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
				4197	}
				4198	UChar c = perlExpr.charAt(0);
				4199	switch (c) {
				4200	case 'n': c = '\n'; break;
				4201	// add any other escape sequences that show up in the test expected results.
				4202	}
				4203	resultString.append(c);
				4204	perlExpr.remove(0, 1);
				4205	}
				4206
				4207	else {
				4208	// Any characters from the perl expression that we don't explicitly
				4209	// recognize before here are assumed to be literals and copied
				4210	// as-is to the expected results.
				4211	resultString.append(perlExpr.charAt(0));
				4212	perlExpr.remove(0, 1);
				4213	}
				4214
				4215	if (U_FAILURE(status)) {
				4216	errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
				4217	break;
				4218	}
				4219	}
				4220
				4221	//
				4222	// Expected Results Compare
				4223	//
				4224	UnicodeString expectedS(fields[4]);
				4225	expectedS.findAndReplace(nulnulSrc, nulnul);
				4226	expectedS.findAndReplace(ffffSrc, ffff);
				4227	expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
				4228
				4229
				4230	if (expectedS.compare(resultString) != 0) {
				4231	err("Line %d: Incorrect perl expression results.", lineNum);
				4232	infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
				4233	}
				4234
				4235	delete testMat;
				4236	delete testPat;
				4237	}
				4238
				4239	//
				4240	// All done. Clean up allocated stuff.
				4241	//
				4242	delete cgMat;
				4243	delete cgPat;
				4244
				4245	delete groupsMat;
				4246	delete groupsPat;
				4247
				4248	delete flagMat;
				4249	delete flagPat;
				4250
				4251	delete lineMat;
				4252	delete linePat;
				4253
				4254	delete fieldPat;
				4255	delete [] testData;
				4256
				4257
				4258	logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
				4259
				4260	}
				4261
				4262
				4263	//-------------------------------------------------------------------------------
				4264	//
				4265	// PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
				4266	// (instead of using UnicodeStrings) to test the alternate engine.
				4267	// The input file for this test is re_tests, the standard regular
				4268	// expression test data distributed with the Perl source code.
				4269	// See PerlTests() for more information.
				4270	//
				4271	//-------------------------------------------------------------------------------
				4272	void RegexTest::PerlTestsUTF8() {
				4273	char tdd[2048];
				4274	const char *srcPath;
				4275	UErrorCode status = U_ZERO_ERROR;
				4276	UParseError pe;
				4277	LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
				4278	UText patternText = UTEXT_INITIALIZER;
				4279	char *patternChars = NULL;
				4280	int32_t patternLength;
				4281	int32_t patternCapacity = 0;
				4282	UText inputText = UTEXT_INITIALIZER;
				4283	char *inputChars = NULL;
				4284	int32_t inputLength;
				4285	int32_t inputCapacity = 0;
				4286
				4287	ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
				4288
				4289	//
				4290	// Open and read the test data file.
				4291	//
				4292	srcPath=getPath(tdd, "re_tests.txt");
				4293	if(srcPath==NULL) {
				4294	return; /* something went wrong, error already output */
				4295	}
				4296
				4297	int32_t len;
				4298	UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
				4299	if (U_FAILURE(status)) {
				4300	return; /* something went wrong, error already output */
				4301	}
				4302
				4303	//
				4304	// Put the test data into a UnicodeString
				4305	//
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	4306	UnicodeString testDataString(false, testData, len);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	4307
				4308	//
				4309	// Regex to break the input file into lines, and strip the new lines.
				4310	// One line per match, capture group one is the desired data.
				4311	//
				4312	RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
				4313	if (U_FAILURE(status)) {
				4314	dataerrln("RegexPattern::compile() error");
				4315	return;
				4316	}
				4317	RegexMatcher* lineMat = linePat->matcher(testDataString, status);
				4318
				4319	//
				4320	// Regex to split a test file line into fields.
				4321	// There are six fields, separated by tabs.
				4322	//
				4323	RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
				4324
				4325	//
				4326	// Regex to identify test patterns with flag settings, and to separate them.
				4327	// Test patterns with flags look like 'pattern'i
				4328	// Test patterns without flags are not quoted: pattern
				4329	// Coming out, capture group 2 is the pattern, capture group 3 is the flags.
				4330	//
				4331	RegexPattern flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.)\\1(.*)"), 0, pe, status);
				4332	RegexMatcher* flagMat = flagPat->matcher(status);
				4333
				4334	//
				4335	// The Perl tests reference several perl-isms, which are evaluated/substituted
				4336	// in the test data. Not being perl, this must be done explicitly. Here
				4337	// are string constants and REs for these constructs.
				4338	//
				4339	UnicodeString nulnulSrc("${nulnul}");
				4340	UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
				4341	nulnul = nulnul.unescape();
				4342
				4343	UnicodeString ffffSrc("${ffff}");
				4344	UnicodeString ffff("\\uffff", -1, US_INV);
				4345	ffff = ffff.unescape();
				4346
				4347	// regexp for $-[0], $+[2], etc.
				4348	RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
				4349	RegexMatcher *groupsMat = groupsPat->matcher(status);
				4350
				4351	// regexp for $0, $1, $2, etc.
				4352	RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
				4353	RegexMatcher *cgMat = cgPat->matcher(status);
				4354
				4355
				4356	//
				4357	// Main Loop for the Perl Tests, runs once per line from the
				4358	// test data file.
				4359	//
				4360	int32_t lineNum = 0;
				4361	int32_t skippedUnimplementedCount = 0;
				4362	while (lineMat->find()) {
				4363	lineNum++;
				4364
				4365	//
				4366	// Get a line, break it into its fields, do the Perl
				4367	// variable substitutions.
				4368	//
				4369	UnicodeString line = lineMat->group(1, status);
				4370	UnicodeString fields[7];
				4371	fieldPat->split(line, fields, 7, status);
				4372
				4373	flagMat->reset(fields[0]);
				4374	flagMat->matches(status);
				4375	UnicodeString pattern = flagMat->group(2, status);
				4376	pattern.findAndReplace("${bang}", "!");
				4377	pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
				4378	pattern.findAndReplace(ffffSrc, ffff);
				4379
				4380	//
				4381	// Identify patterns that include match flag settings,
				4382	// split off the flags, remove the extra quotes.
				4383	//
				4384	UnicodeString flagStr = flagMat->group(3, status);
				4385	if (U_FAILURE(status)) {
				4386	errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
				4387	return;
				4388	}
				4389	int32_t flags = 0;
				4390	const UChar UChar_c = 0x63; // Char constants for the flag letters.
				4391	const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
				4392	const UChar UChar_m = 0x6d;
				4393	const UChar UChar_x = 0x78;
				4394	const UChar UChar_y = 0x79;
				4395	if (flagStr.indexOf(UChar_i) != -1) {
				4396	flags \|= UREGEX_CASE_INSENSITIVE;
				4397	}
				4398	if (flagStr.indexOf(UChar_m) != -1) {
				4399	flags \|= UREGEX_MULTILINE;
				4400	}
				4401	if (flagStr.indexOf(UChar_x) != -1) {
				4402	flags \|= UREGEX_COMMENTS;
				4403	}
				4404
				4405	//
				4406	// Put the pattern in a UTF-8 UText
				4407	//
				4408	status = U_ZERO_ERROR;
				4409	patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
				4410	if (status == U_BUFFER_OVERFLOW_ERROR) {
				4411	status = U_ZERO_ERROR;
				4412	delete[] patternChars;
				4413	patternCapacity = patternLength + 1;
				4414	patternChars = new char[patternCapacity];
				4415	pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
				4416	}
				4417	utext_openUTF8(&patternText, patternChars, patternLength, &status);
				4418
				4419	//
				4420	// Compile the test pattern.
				4421	//
				4422	RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
				4423	if (status == U_REGEX_UNIMPLEMENTED) {
				4424	//
				4425	// Test of a feature that is planned for ICU, but not yet implemented.
				4426	// skip the test.
				4427	skippedUnimplementedCount++;
				4428	delete testPat;
				4429	status = U_ZERO_ERROR;
				4430	continue;
				4431	}
				4432
				4433	if (U_FAILURE(status)) {
				4434	// Some tests are supposed to generate errors.
				4435	// Only report an error for tests that are supposed to succeed.
				4436	if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
				4437	fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
				4438	{
				4439	errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
				4440	}
				4441	status = U_ZERO_ERROR;
				4442	delete testPat;
				4443	continue;
				4444	}
				4445
				4446	if (fields[2].indexOf(UChar_i) >= 0) {
				4447	// ICU should skip this test.
				4448	delete testPat;
				4449	continue;
				4450	}
				4451
				4452	if (fields[2].indexOf(UChar_c) >= 0) {
				4453	// This pattern should have caused a compilation error, but didn't/
				4454	errln("line %d: Expected a pattern compile error, got success.", lineNum);
				4455	delete testPat;
				4456	continue;
				4457	}
				4458
				4459
				4460	//
				4461	// replace the Perl variables that appear in some of the
				4462	// match data strings.
				4463	//
				4464	UnicodeString matchString = fields[1];
				4465	matchString.findAndReplace(nulnulSrc, nulnul);
				4466	matchString.findAndReplace(ffffSrc, ffff);
				4467
				4468	// Replace any \n in the match string with an actual new-line char.
				4469	// Don't do full unescape, as this unescapes more than Perl does, which
				4470	// causes other spurious failures in the tests.
				4471	matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
				4472
				4473	//
				4474	// Put the input in a UTF-8 UText
				4475	//
				4476	status = U_ZERO_ERROR;
				4477	inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
				4478	if (status == U_BUFFER_OVERFLOW_ERROR) {
				4479	status = U_ZERO_ERROR;
				4480	delete[] inputChars;
				4481	inputCapacity = inputLength + 1;
				4482	inputChars = new char[inputCapacity];
				4483	matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
				4484	}
				4485	utext_openUTF8(&inputText, inputChars, inputLength, &status);
				4486
				4487	//
				4488	// Run the test, check for expected match/don't match result.
				4489	//
				4490	RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
				4491	UBool found = testMat->find();
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	4492	UBool expected = false;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	4493	if (fields[2].indexOf(UChar_y) >=0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	4494	expected = true;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	4495	}
				4496	if (expected != found) {
				4497	errln("line %d: Expected %smatch, got %smatch",
				4498	lineNum, expected?"":"no ", found?"":"no " );
				4499	delete testMat;
				4500	delete testPat;
				4501	continue;
				4502	}
				4503
				4504	// Don't try to check expected results if there is no match.
				4505	// (Some have stuff in the expected fields)
				4506	if (!found) {
				4507	delete testMat;
				4508	delete testPat;
				4509	continue;
				4510	}
				4511
				4512	//
				4513	// Interpret the Perl expression from the fourth field of the data file,
				4514	// building up an ICU string from the results of the ICU match.
				4515	// The Perl expression will contain references to the results of
				4516	// a regex match, including the matched string, capture group strings,
				4517	// group starting and ending indices, etc.
				4518	//
				4519	UnicodeString resultString;
				4520	UnicodeString perlExpr = fields[3];
				4521
				4522	while (perlExpr.length() > 0) {
				4523	groupsMat->reset(perlExpr);
				4524	cgMat->reset(perlExpr);
				4525
				4526	if (perlExpr.startsWith("$&")) {
				4527	resultString.append(testMat->group(status));
				4528	perlExpr.remove(0, 2);
				4529	}
				4530
				4531	else if (groupsMat->lookingAt(status)) {
				4532	// $-[0] $+[2] etc.
				4533	UnicodeString digitString = groupsMat->group(2, status);
				4534	int32_t t = 0;
				4535	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
				4536	UnicodeString plusOrMinus = groupsMat->group(1, status);
				4537	int32_t matchPosition;
				4538	if (plusOrMinus.compare("+") == 0) {
				4539	matchPosition = testMat->end(groupNum, status);
				4540	} else {
				4541	matchPosition = testMat->start(groupNum, status);
				4542	}
				4543	if (matchPosition != -1) {
				4544	ICU_Utility::appendNumber(resultString, matchPosition);
				4545	}
				4546	perlExpr.remove(0, groupsMat->end(status));
				4547	}
				4548
				4549	else if (cgMat->lookingAt(status)) {
				4550	// $1, $2, $3, etc.
				4551	UnicodeString digitString = cgMat->group(1, status);
				4552	int32_t t = 0;
				4553	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
				4554	if (U_SUCCESS(status)) {
				4555	resultString.append(testMat->group(groupNum, status));
				4556	status = U_ZERO_ERROR;
				4557	}
				4558	perlExpr.remove(0, cgMat->end(status));
				4559	}
				4560
				4561	else if (perlExpr.startsWith("@-")) {
				4562	int32_t i;
				4563	for (i=0; i<=testMat->groupCount(); i++) {
				4564	if (i>0) {
				4565	resultString.append(" ");
				4566	}
				4567	ICU_Utility::appendNumber(resultString, testMat->start(i, status));
				4568	}
				4569	perlExpr.remove(0, 2);
				4570	}
				4571
				4572	else if (perlExpr.startsWith("@+")) {
				4573	int32_t i;
				4574	for (i=0; i<=testMat->groupCount(); i++) {
				4575	if (i>0) {
				4576	resultString.append(" ");
				4577	}
				4578	ICU_Utility::appendNumber(resultString, testMat->end(i, status));
				4579	}
				4580	perlExpr.remove(0, 2);
				4581	}
				4582
				4583	else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
				4584	// or as an escaped sequence (e.g. \n)
				4585	if (perlExpr.length() > 1) {
				4586	perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
				4587	}
				4588	UChar c = perlExpr.charAt(0);
				4589	switch (c) {
				4590	case 'n': c = '\n'; break;
				4591	// add any other escape sequences that show up in the test expected results.
				4592	}
				4593	resultString.append(c);
				4594	perlExpr.remove(0, 1);
				4595	}
				4596
				4597	else {
				4598	// Any characters from the perl expression that we don't explicitly
				4599	// recognize before here are assumed to be literals and copied
				4600	// as-is to the expected results.
				4601	resultString.append(perlExpr.charAt(0));
				4602	perlExpr.remove(0, 1);
				4603	}
				4604
				4605	if (U_FAILURE(status)) {
				4606	errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
				4607	break;
				4608	}
				4609	}
				4610
				4611	//
				4612	// Expected Results Compare
				4613	//
				4614	UnicodeString expectedS(fields[4]);
				4615	expectedS.findAndReplace(nulnulSrc, nulnul);
				4616	expectedS.findAndReplace(ffffSrc, ffff);
				4617	expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
				4618
				4619
				4620	if (expectedS.compare(resultString) != 0) {
				4621	err("Line %d: Incorrect perl expression results.", lineNum);
				4622	infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
				4623	}
				4624
				4625	delete testMat;
				4626	delete testPat;
				4627	}
				4628
				4629	//
				4630	// All done. Clean up allocated stuff.
				4631	//
				4632	delete cgMat;
				4633	delete cgPat;
				4634
				4635	delete groupsMat;
				4636	delete groupsPat;
				4637
				4638	delete flagMat;
				4639	delete flagPat;
				4640
				4641	delete lineMat;
				4642	delete linePat;
				4643
				4644	delete fieldPat;
				4645	delete [] testData;
				4646
				4647	utext_close(&patternText);
				4648	utext_close(&inputText);
				4649
				4650	delete [] patternChars;
				4651	delete [] inputChars;
				4652
				4653
				4654	logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
				4655
				4656	}
				4657
				4658
				4659	//--------------------------------------------------------------
				4660	//
				4661	// Bug6149 Verify limits to heap expansion for backtrack stack.
				4662	// Use this pattern,
				4663	// "(a?){1,8000000}"
				4664	// Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
				4665	// This test is likely to be fragile, as further optimizations stop
				4666	// more cases of pointless looping in the match engine.
				4667	//
				4668	//---------------------------------------------------------------
				4669	void RegexTest::Bug6149() {
				4670	UnicodeString pattern("(a?){1,8000000}");
				4671	UnicodeString s("xyz");
				4672	uint32_t flags = 0;
				4673	UErrorCode status = U_ZERO_ERROR;
				4674
				4675	RegexMatcher matcher(pattern, s, flags, status);
				4676	UBool result = false;
				4677	REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	4678	REGEX_ASSERT(result == false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	4679	}
				4680
				4681
//
// Callbacks()  Test the callback function.
//     When set, callbacks occur periodically during matching operations,
//     giving the application code the ability to abort the operation
//     before its normal completion.
//
				4688
				4689	struct callBackContext {
				4690	RegexTest *test;
				4691	int32_t maxCalls;
				4692	int32_t numCalls;
				4693	int32_t lastSteps;
				4694	void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}
				4695	};
				4696
				4697	U_CDECL_BEGIN
				4698	static UBool U_CALLCONV
				4699	testCallBackFn(const void *context, int32_t steps) {
				4700	callBackContext info = (callBackContext )context;
				4701	if (info->lastSteps+1 != steps) {
				4702	info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
				4703	}
				4704	info->lastSteps = steps;
				4705	info->numCalls++;
				4706	return (info->numCalls < info->maxCalls);
				4707	}
				4708	U_CDECL_END
				4709
				4710	void RegexTest::Callbacks() {
				4711	{
				4712	// Getter returns NULLs if no callback has been set
				4713
				4714	// The variables that the getter will fill in.
				4715	// Init to non-null values so that the action of the getter can be seen.
				4716	const void *returnedContext = &returnedContext;
				4717	URegexMatchCallback *returnedFn = &testCallBackFn;
				4718
				4719	UErrorCode status = U_ZERO_ERROR;
				4720	RegexMatcher matcher("x", 0, status);
				4721	REGEX_CHECK_STATUS;
				4722	matcher.getMatchCallback(returnedFn, returnedContext, status);
				4723	REGEX_CHECK_STATUS;
				4724	REGEX_ASSERT(returnedFn == NULL);
				4725	REGEX_ASSERT(returnedContext == NULL);
				4726	}
				4727
				4728	{
				4729	// Set and Get work
				4730	callBackContext cbInfo = {this, 0, 0, 0};
				4731	const void *returnedContext;
				4732	URegexMatchCallback *returnedFn;
				4733	UErrorCode status = U_ZERO_ERROR;
				4734	RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
				4735	REGEX_CHECK_STATUS;
				4736	matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
				4737	REGEX_CHECK_STATUS;
				4738	matcher.getMatchCallback(returnedFn, returnedContext, status);
				4739	REGEX_CHECK_STATUS;
				4740	REGEX_ASSERT(returnedFn == testCallBackFn);
				4741	REGEX_ASSERT(returnedContext == &cbInfo);
				4742
				4743	// A short-running match shouldn't invoke the callback
				4744	status = U_ZERO_ERROR;
				4745	cbInfo.reset(1);
				4746	UnicodeString s = "xxx";
				4747	matcher.reset(s);
				4748	REGEX_ASSERT(matcher.matches(status));
				4749	REGEX_CHECK_STATUS;
				4750	REGEX_ASSERT(cbInfo.numCalls == 0);
				4751
				4752	// A medium-length match that runs long enough to invoke the
				4753	// callback, but not so long that the callback aborts it.
				4754	status = U_ZERO_ERROR;
				4755	cbInfo.reset(4);
				4756	s = "aaaaaaaaaaaaaaaaaaab";
				4757	matcher.reset(s);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	4758	REGEX_ASSERT(matcher.matches(status)==false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	4759	REGEX_CHECK_STATUS;
				4760	REGEX_ASSERT(cbInfo.numCalls > 0);
				4761
				4762	// A longer running match that the callback function will abort.
				4763	status = U_ZERO_ERROR;
				4764	cbInfo.reset(4);
				4765	s = "aaaaaaaaaaaaaaaaaaaaaaab";
				4766	matcher.reset(s);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	4767	REGEX_ASSERT(matcher.matches(status)==false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	4768	REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
				4769	REGEX_ASSERT(cbInfo.numCalls == 4);
				4770
				4771	// A longer running find that the callback function will abort.
				4772	status = U_ZERO_ERROR;
				4773	cbInfo.reset(4);
				4774	s = "aaaaaaaaaaaaaaaaaaaaaaab";
				4775	matcher.reset(s);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	4776	REGEX_ASSERT(matcher.find(status)==false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	4777	REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
				4778	REGEX_ASSERT(cbInfo.numCalls == 4);
				4779	}
				4780
				4781
				4782	}
				4783
				4784
//
// FindProgressCallbacks()  Test the find "progress" callback function.
//     When set, the find progress callback will be invoked during find operations
//     after each return from a match attempt, giving the application the opportunity
//     to terminate a long-running find operation before its normal completion.
//
				4791
				4792	struct progressCallBackContext {
				4793	RegexTest *test;
				4794	int64_t lastIndex;
				4795	int32_t maxCalls;
				4796	int32_t numCalls;
				4797	void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}
				4798	};
				4799
				4800	// call-back function for find().
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	4801	// Return true to continue the find().
				4802	// Return false to stop the find().
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	4803	U_CDECL_BEGIN
				4804	static UBool U_CALLCONV
				4805	testProgressCallBackFn(const void *context, int64_t matchIndex) {
				4806	progressCallBackContext info = (progressCallBackContext )context;
				4807	info->numCalls++;
				4808	info->lastIndex = matchIndex;
				4809	// info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
				4810	return (info->numCalls < info->maxCalls);
				4811	}
				4812	U_CDECL_END
				4813
				4814	void RegexTest::FindProgressCallbacks() {
				4815	{
				4816	// Getter returns NULLs if no callback has been set
				4817
				4818	// The variables that the getter will fill in.
				4819	// Init to non-null values so that the action of the getter can be seen.
				4820	const void *returnedContext = &returnedContext;
				4821	URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
				4822
				4823	UErrorCode status = U_ZERO_ERROR;
				4824	RegexMatcher matcher("x", 0, status);
				4825	REGEX_CHECK_STATUS;
				4826	matcher.getFindProgressCallback(returnedFn, returnedContext, status);
				4827	REGEX_CHECK_STATUS;
				4828	REGEX_ASSERT(returnedFn == NULL);
				4829	REGEX_ASSERT(returnedContext == NULL);
				4830	}
				4831
				4832	{
				4833	// Set and Get work
				4834	progressCallBackContext cbInfo = {this, 0, 0, 0};
				4835	const void *returnedContext;
				4836	URegexFindProgressCallback *returnedFn;
				4837	UErrorCode status = U_ZERO_ERROR;
				4838	RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
				4839	REGEX_CHECK_STATUS;
				4840	matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
				4841	REGEX_CHECK_STATUS;
				4842	matcher.getFindProgressCallback(returnedFn, returnedContext, status);
				4843	REGEX_CHECK_STATUS;
				4844	REGEX_ASSERT(returnedFn == testProgressCallBackFn);
				4845	REGEX_ASSERT(returnedContext == &cbInfo);
				4846
				4847	// A find that matches on the initial position does NOT invoke the callback.
				4848	status = U_ZERO_ERROR;
				4849	cbInfo.reset(100);
				4850	UnicodeString s = "aaxxx";
				4851	matcher.reset(s);
				4852	#if 0
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	4853	matcher.setTrace(true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	4854	#endif
				4855	REGEX_ASSERT(matcher.find(0, status));
				4856	REGEX_CHECK_STATUS;
				4857	REGEX_ASSERT(cbInfo.numCalls == 0);
				4858
				4859	// A medium running find() that causes matcher.find() to invoke our callback for each index,
				4860	// but not so many times that we interrupt the operation.
				4861	status = U_ZERO_ERROR;
				4862	s = "aaaaaaaaaaaaaaaaaaab";
				4863	cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
				4864	matcher.reset(s);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	4865	REGEX_ASSERT(matcher.find(0, status)==false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	4866	REGEX_CHECK_STATUS;
				4867	REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
				4868
				4869	// A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
				4870	status = U_ZERO_ERROR;
				4871	UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
				4872	cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
				4873	matcher.reset(s1);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	4874	REGEX_ASSERT(matcher.find(0, status)==false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	4875	REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
				4876	REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
				4877
				4878	// Now a match that will succeed, but after an interruption
				4879	status = U_ZERO_ERROR;
				4880	UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
				4881	cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
				4882	matcher.reset(s2);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	4883	REGEX_ASSERT(matcher.find(0, status)==false);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	4884	REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
				4885	// Now retry the match from where left off
				4886	cbInfo.maxCalls = 100; // No callback limit
				4887	status = U_ZERO_ERROR;
				4888	REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
				4889	REGEX_CHECK_STATUS;
				4890	}
				4891
				4892
				4893	}
				4894
				4895
				4896	//---------------------------------------------------------------------------
				4897	//
				4898	// PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
				4899	// UTexts. The pure-C implementation of UText
				4900	// has no mutable backing stores, but we can
				4901	// use UnicodeString here to test the functionality.
				4902	//
				4903	//---------------------------------------------------------------------------
				4904	void RegexTest::PreAllocatedUTextCAPI () {
				4905	UErrorCode status = U_ZERO_ERROR;
				4906	URegularExpression *re;
				4907	UText patternText = UTEXT_INITIALIZER;
				4908	UnicodeString buffer;
				4909	UText bufferText = UTEXT_INITIALIZER;
				4910
				4911	utext_openUnicodeString(&bufferText, &buffer, &status);
				4912
				4913	/*
				4914	* getText() and getUText()
				4915	*/
				4916	{
				4917	UText text1 = UTEXT_INITIALIZER;
				4918	UText text2 = UTEXT_INITIALIZER;
				4919	UChar text2Chars[20];
				4920	UText *resultText;
				4921
				4922	status = U_ZERO_ERROR;
				4923	regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
				4924	regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
				4925	u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
				4926	utext_openUChars(&text2, text2Chars, -1, &status);
				4927
				4928	regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
				4929	re = uregex_openUText(&patternText, 0, NULL, &status);
				4930
				4931	/* First set a UText */
				4932	uregex_setUText(re, &text1, &status);
				4933	resultText = uregex_getUText(re, &bufferText, &status);
				4934	REGEX_CHECK_STATUS;
				4935	REGEX_ASSERT(resultText == &bufferText);
				4936	utext_setNativeIndex(resultText, 0);
				4937	utext_setNativeIndex(&text1, 0);
				4938	REGEX_ASSERT(testUTextEqual(resultText, &text1));
				4939
				4940	resultText = uregex_getUText(re, &bufferText, &status);
				4941	REGEX_CHECK_STATUS;
				4942	REGEX_ASSERT(resultText == &bufferText);
				4943	utext_setNativeIndex(resultText, 0);
				4944	utext_setNativeIndex(&text1, 0);
				4945	REGEX_ASSERT(testUTextEqual(resultText, &text1));
				4946
				4947	/* Then set a UChar * */
				4948	uregex_setText(re, text2Chars, 7, &status);
				4949	resultText = uregex_getUText(re, &bufferText, &status);
				4950	REGEX_CHECK_STATUS;
				4951	REGEX_ASSERT(resultText == &bufferText);
				4952	utext_setNativeIndex(resultText, 0);
				4953	utext_setNativeIndex(&text2, 0);
				4954	REGEX_ASSERT(testUTextEqual(resultText, &text2));
				4955
				4956	uregex_close(re);
				4957	utext_close(&text1);
				4958	utext_close(&text2);
				4959	}
				4960
				4961	/*
				4962	* group()
				4963	*/
				4964	{
				4965	UChar text1[80];
				4966	UText *actual;
				4967	UBool result;
				4968	int64_t length = 0;
				4969
				4970	u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1));
				4971	// 012345678901234567890123456789012345678901234567
				4972	// 0 1 2 3 4
				4973
				4974	status = U_ZERO_ERROR;
				4975	re = uregex_openC("abc(.*?)def", 0, NULL, &status);
				4976	REGEX_CHECK_STATUS;
				4977
				4978	uregex_setText(re, text1, -1, &status);
				4979	result = uregex_find(re, 0, &status);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	4980	REGEX_ASSERT(result==true);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	4981
				4982	/* Capture Group 0, the full match. Should succeed. "abc interior def" */
				4983	status = U_ZERO_ERROR;
				4984	actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
				4985	REGEX_CHECK_STATUS;
				4986	REGEX_ASSERT(actual == &bufferText);
				4987	REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
				4988	REGEX_ASSERT(length == 16);
				4989	REGEX_ASSERT(utext_nativeLength(actual) == 47);
				4990
				4991	/* Capture group #1. Should succeed, matching " interior ". */
				4992	status = U_ZERO_ERROR;
				4993	actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
				4994	REGEX_CHECK_STATUS;
				4995	REGEX_ASSERT(actual == &bufferText);
				4996	REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
				4997	REGEX_ASSERT(length == 10);
				4998	REGEX_ASSERT(utext_nativeLength(actual) == 47);
				4999
				5000	/* Capture group out of range. Error. */
				5001	status = U_ZERO_ERROR;
				5002	actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
				5003	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				5004	REGEX_ASSERT(actual == &bufferText);
				5005	uregex_close(re);
				5006
				5007	}
				5008
				5009	/*
				5010	* replaceFirst()
				5011	*/
				5012	{
				5013	UChar text1[80];
				5014	UChar text2[80];
				5015	UText replText = UTEXT_INITIALIZER;
				5016	UText *result;
				5017	status = U_ZERO_ERROR;
				5018	utext_openUnicodeString(&bufferText, &buffer, &status);
				5019
				5020	status = U_ZERO_ERROR;
				5021	u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
				5022	u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
				5023	regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
				5024
				5025	re = uregex_openC("x(.*?)x", 0, NULL, &status);
				5026	REGEX_CHECK_STATUS;
				5027
				5028	/* Normal case, with match */
				5029	uregex_setText(re, text1, -1, &status);
				5030	REGEX_CHECK_STATUS;
				5031	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
				5032	REGEX_CHECK_STATUS;
				5033	result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
				5034	REGEX_CHECK_STATUS;
				5035	REGEX_ASSERT(result == &bufferText);
				5036	REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
				5037
				5038	/* No match. Text should copy to output with no changes. */
				5039	uregex_setText(re, text2, -1, &status);
				5040	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
				5041	result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
				5042	REGEX_CHECK_STATUS;
				5043	REGEX_ASSERT(result == &bufferText);
				5044	REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
				5045
				5046	/* Unicode escapes */
				5047	uregex_setText(re, text1, -1, &status);
				5048	regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
				5049	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
				5050	result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
				5051	REGEX_CHECK_STATUS;
				5052	REGEX_ASSERT(result == &bufferText);
				5053	REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
				5054
				5055	uregex_close(re);
				5056	utext_close(&replText);
				5057	}
				5058
				5059
				5060	/*
				5061	* replaceAll()
				5062	*/
				5063	{
				5064	UChar text1[80];
				5065	UChar text2[80];
				5066	UText replText = UTEXT_INITIALIZER;
				5067	UText *result;
				5068
				5069	status = U_ZERO_ERROR;
				5070	u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
				5071	u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
				5072	regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
				5073
				5074	re = uregex_openC("x(.*?)x", 0, NULL, &status);
				5075	REGEX_CHECK_STATUS;
				5076
				5077	/* Normal case, with match */
				5078	uregex_setText(re, text1, -1, &status);
				5079	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
				5080	result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
				5081	REGEX_CHECK_STATUS;
				5082	REGEX_ASSERT(result == &bufferText);
				5083	REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
				5084
				5085	/* No match. Text should copy to output with no changes. */
				5086	uregex_setText(re, text2, -1, &status);
				5087	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
				5088	result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
				5089	REGEX_CHECK_STATUS;
				5090	REGEX_ASSERT(result == &bufferText);
				5091	REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
				5092
				5093	uregex_close(re);
				5094	utext_close(&replText);
				5095	}
				5096
				5097
				5098	/*
				5099	* splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
				5100	* so we don't need to test it here.
				5101	*/
				5102
				5103	utext_close(&bufferText);
				5104	utext_close(&patternText);
				5105	}
				5106
				5107
				5108	//--------------------------------------------------------------
				5109	//
				5110	// NamedCapture Check basic named capture group functionality
				5111	//
				5112	//--------------------------------------------------------------
				5113	void RegexTest::NamedCapture() {
				5114	UErrorCode status = U_ZERO_ERROR;
				5115	RegexPattern *pat = RegexPattern::compile(UnicodeString(
				5116	"abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
				5117	REGEX_CHECK_STATUS;
				5118	int32_t group = pat->groupNumberFromName("five", -1, status);
				5119	REGEX_CHECK_STATUS;
				5120	REGEX_ASSERT(5 == group);
				5121	group = pat->groupNumberFromName("three", -1, status);
				5122	REGEX_CHECK_STATUS;
				5123	REGEX_ASSERT(3 == group);
				5124
				5125	status = U_ZERO_ERROR;
				5126	group = pat->groupNumberFromName(UnicodeString("six"), status);
				5127	REGEX_CHECK_STATUS;
				5128	REGEX_ASSERT(6 == group);
				5129
				5130	status = U_ZERO_ERROR;
				5131	group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
				5132	U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
				5133
				5134	status = U_ZERO_ERROR;
				5135
				5136	// After copying a pattern, named capture should still work in the copy.
				5137	RegexPattern copiedPat = new RegexPattern(pat);
				5138	REGEX_ASSERT(copiedPat == pat);
				5139	delete pat; pat = NULL; // Delete original, copy should have no references back to it.
				5140
				5141	group = copiedPat->groupNumberFromName("five", -1, status);
				5142	REGEX_CHECK_STATUS;
				5143	REGEX_ASSERT(5 == group);
				5144	group = copiedPat->groupNumberFromName("three", -1, status);
				5145	REGEX_CHECK_STATUS;
				5146	REGEX_ASSERT(3 == group);
				5147	delete copiedPat;
				5148
				5149	// ReplaceAll with named capture group.
				5150	status = U_ZERO_ERROR;
				5151	UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
				5152	RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
				5153	REGEX_CHECK_STATUS;
				5154	// m.pattern().dumpPattern();
				5155	UnicodeString replacedText = m->replaceAll("'${mid}'", status);
				5156	REGEX_CHECK_STATUS;
				5157	REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
				5158	delete m;
				5159
				5160	// ReplaceAll, allowed capture group numbers.
				5161	text = UnicodeString("abcmxyz");
				5162	m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
				5163	REGEX_CHECK_STATUS;
				5164
				5165	status = U_ZERO_ERROR;
				5166	replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
				5167	REGEX_CHECK_STATUS;
				5168	REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
				5169
				5170	status = U_ZERO_ERROR;
				5171	replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
				5172	REGEX_CHECK_STATUS;
				5173	REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
				5174
				5175	status = U_ZERO_ERROR;
				5176	replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
				5177	REGEX_CHECK_STATUS;
				5178	REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
				5179
				5180	status = U_ZERO_ERROR;
				5181	replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
				5182	REGEX_CHECK_STATUS;
				5183	REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
				5184
				5185	status = U_ZERO_ERROR;
				5186	replacedText = m->replaceAll(UnicodeString("<$3>"), status);
				5187	REGEX_CHECK_STATUS;
				5188	REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
				5189
				5190	status = U_ZERO_ERROR;
				5191	replacedText = m->replaceAll(UnicodeString("<$4>"), status);
				5192	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				5193
				5194	status = U_ZERO_ERROR;
				5195	replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
				5196	REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
				5197	REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
				5198
				5199	status = U_ZERO_ERROR;
				5200	replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
				5201	REGEX_CHECK_STATUS; // that push group num out of range.
				5202	REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
				5203
				5204	status = U_ZERO_ERROR;
				5205	replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
				5206	REGEX_CHECK_STATUS;
				5207	REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
				5208
				5209	status = U_ZERO_ERROR;
				5210	replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
				5211	REGEX_CHECK_STATUS;
				5212	REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
				5213
				5214	status = U_ZERO_ERROR;
				5215	replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
				5216	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
				5217
				5218	status = U_ZERO_ERROR;
				5219	replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
				5220	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
				5221
				5222	status = U_ZERO_ERROR;
				5223	replacedText = m->replaceAll(UnicodeString("<${one"), status);
				5224	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
				5225
				5226	status = U_ZERO_ERROR;
				5227	replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
				5228	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
				5229
				5230	delete m;
				5231
				5232	// Repeat the above replaceAll() tests using the plain C API, which
				5233	// has a separate implementation internally.
				5234	// TODO: factor out the test data.
				5235
				5236	status = U_ZERO_ERROR;
				5237	URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
				5238	REGEX_CHECK_STATUS;
				5239	text = UnicodeString("abcmxyz");
				5240	uregex_setText(re, text.getBuffer(), text.length(), &status);
				5241	REGEX_CHECK_STATUS;
				5242
				5243	UChar resultBuf[100];
				5244	int32_t resultLength;
				5245	UnicodeString repl;
				5246
				5247	status = U_ZERO_ERROR;
				5248	repl = UnicodeString("<$0>");
				5249	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
				5250	REGEX_CHECK_STATUS;
				5251	REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
				5252
				5253	status = U_ZERO_ERROR;
				5254	repl = UnicodeString("<$1>");
				5255	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
				5256	REGEX_CHECK_STATUS;
				5257	REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
				5258
				5259	status = U_ZERO_ERROR;
				5260	repl = UnicodeString("<${one}>");
				5261	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
				5262	REGEX_CHECK_STATUS;
				5263	REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
				5264
				5265	status = U_ZERO_ERROR;
				5266	repl = UnicodeString("<$2>");
				5267	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
				5268	REGEX_CHECK_STATUS;
				5269	REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
				5270
				5271	status = U_ZERO_ERROR;
				5272	repl = UnicodeString("<$3>");
				5273	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
				5274	REGEX_CHECK_STATUS;
				5275	REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
				5276
				5277	status = U_ZERO_ERROR;
				5278	repl = UnicodeString("<$4>");
				5279	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
				5280	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
				5281
				5282	status = U_ZERO_ERROR;
				5283	repl = UnicodeString("<$04>");
				5284	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
				5285	REGEX_CHECK_STATUS;
				5286	REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
				5287
				5288	status = U_ZERO_ERROR;
				5289	repl = UnicodeString("<$000016>");
				5290	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
				5291	REGEX_CHECK_STATUS;
				5292	REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
				5293
				5294	status = U_ZERO_ERROR;
				5295	repl = UnicodeString("<$3$2$1${one}>");
				5296	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
				5297	REGEX_CHECK_STATUS;
				5298	REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
				5299
				5300	status = U_ZERO_ERROR;
				5301	repl = UnicodeString("$3$2$1${one}");
				5302	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
				5303	REGEX_CHECK_STATUS;
				5304	REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
				5305
				5306	status = U_ZERO_ERROR;
				5307	repl = UnicodeString("<${noSuchName}>");
				5308	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
				5309	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
				5310
				5311	status = U_ZERO_ERROR;
				5312	repl = UnicodeString("<${invalid-name}>");
				5313	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
				5314	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
				5315
				5316	status = U_ZERO_ERROR;
				5317	repl = UnicodeString("<${one");
				5318	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
				5319	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
				5320
				5321	status = U_ZERO_ERROR;
				5322	repl = UnicodeString("$not a capture group");
				5323	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
				5324	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
				5325
				5326	uregex_close(re);
				5327	}
				5328
				5329	//--------------------------------------------------------------
				5330	//
				5331	// NamedCaptureLimits Patterns with huge numbers of named capture groups.
				5332	// The point is not so much what the exact limit is,
				5333	// but that a largish number doesn't hit bad non-linear performance,
				5334	// and that exceeding the limit fails cleanly.
				5335	//
				5336	//--------------------------------------------------------------
				5337	void RegexTest::NamedCaptureLimits() {
				5338	if (quick) {
				5339	logln("Skipping test. Runs in exhuastive mode only.");
				5340	return;
				5341	}
				5342	const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
				5343	const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
				5344	char nnbuf[100];
				5345	UnicodeString pattern;
				5346	int32_t nn;
				5347
				5348	for (nn=1; nn<goodLimit; nn++) {
				5349	sprintf(nnbuf, "(?<nn%d>)", nn);
				5350	pattern.append(UnicodeString(nnbuf, -1, US_INV));
				5351	}
				5352	UErrorCode status = U_ZERO_ERROR;
				5353	RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
				5354	REGEX_CHECK_STATUS;
				5355	for (nn=1; nn<goodLimit; nn++) {
				5356	sprintf(nnbuf, "nn%d", nn);
				5357	int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
				5358	REGEX_ASSERT(nn == groupNum);
				5359	if (nn != groupNum) {
				5360	break;
				5361	}
				5362	}
				5363	delete pat;
				5364
				5365	pattern.remove();
				5366	for (nn=1; nn<failLimit; nn++) {
				5367	sprintf(nnbuf, "(?<nn%d>)", nn);
				5368	pattern.append(UnicodeString(nnbuf, -1, US_INV));
				5369	}
				5370	status = U_ZERO_ERROR;
				5371	pat = RegexPattern::compile(pattern, 0, status);
				5372	REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
				5373	delete pat;
				5374	}
				5375
				5376
				5377	//--------------------------------------------------------------
				5378	//
				5379	// Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
				5380	//
				5381	//---------------------------------------------------------------
				5382	void RegexTest::Bug7651() {
				5383	UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*\|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?\|(https?\\:\\/\\/\|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])\|\\$[A-Za-z]+)");
				5384	// The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
				5385	// It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
				5386	UnicodeString pattern2("((https?\\:\\/\\/\|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])\|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?\|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*\|\\$[A-Za-z]+)");
				5387	UnicodeString s("#ff @abcd This is test");
				5388	RegexPattern *REPattern = NULL;
				5389	RegexMatcher *REMatcher = NULL;
				5390	UErrorCode status = U_ZERO_ERROR;
				5391	UParseError pe;
				5392
				5393	REPattern = RegexPattern::compile(pattern1, 0, pe, status);
				5394	REGEX_CHECK_STATUS;
				5395	REMatcher = REPattern->matcher(s, status);
				5396	REGEX_CHECK_STATUS;
				5397	REGEX_ASSERT(REMatcher->find());
				5398	REGEX_ASSERT(REMatcher->start(status) == 0);
				5399	delete REPattern;
				5400	delete REMatcher;
				5401	status = U_ZERO_ERROR;
				5402
				5403	REPattern = RegexPattern::compile(pattern2, 0, pe, status);
				5404	REGEX_CHECK_STATUS;
				5405	REMatcher = REPattern->matcher(s, status);
				5406	REGEX_CHECK_STATUS;
				5407	REGEX_ASSERT(REMatcher->find());
				5408	REGEX_ASSERT(REMatcher->start(status) == 0);
				5409	delete REPattern;
				5410	delete REMatcher;
				5411	status = U_ZERO_ERROR;
				5412	}
				5413
				5414	void RegexTest::Bug7740() {
				5415	UErrorCode status = U_ZERO_ERROR;
				5416	UnicodeString pattern = "(a)";
				5417	UnicodeString text = "abcdef";
				5418	RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
				5419	REGEX_CHECK_STATUS;
				5420	REGEX_ASSERT(m->lookingAt(status));
				5421	REGEX_CHECK_STATUS;
				5422	status = U_ILLEGAL_ARGUMENT_ERROR;
				5423	UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
				5424	REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
				5425	REGEX_ASSERT(s == "");
				5426	delete m;
				5427	}
				5428
				5429	// Bug 8479: was crashing whith a Bogus UnicodeString as input.
				5430
				5431	void RegexTest::Bug8479() {
				5432	UErrorCode status = U_ZERO_ERROR;
				5433
				5434	RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL\|UREGEX_CASE_INSENSITIVE, status);
				5435	REGEX_CHECK_STATUS;
				5436	if (U_SUCCESS(status))
				5437	{
				5438	UnicodeString str;
				5439	str.setToBogus();
				5440	pMatcher->reset(str);
				5441	status = U_ZERO_ERROR;
				5442	pMatcher->matches(status);
				5443	REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
				5444	delete pMatcher;
				5445	}
				5446	}
				5447
				5448
				5449	// Bug 7029
				5450	void RegexTest::Bug7029() {
				5451	UErrorCode status = U_ZERO_ERROR;
				5452
				5453	RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
				5454	UnicodeString text = "abc.def";
				5455	UnicodeString splits[10];
				5456	REGEX_CHECK_STATUS;
				5457	int32_t numFields = pMatcher->split(text, splits, 10, status);
				5458	REGEX_CHECK_STATUS;
				5459	REGEX_ASSERT(numFields == 8);
				5460	delete pMatcher;
				5461	}
				5462
				5463	// Bug 9283
				5464	// This test is checking for the existence of any supplemental characters that case-fold
				5465	// to a bmp character.
				5466	//
				5467	// At the time of this writing there are none. If any should appear in a subsequent release
				5468	// of Unicode, the code in regular expressions compilation that determines the longest
				5469	// possible match for a literal string will need to be enhanced.
				5470	//
				5471	// See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
				5472	// for details on what to do in case of a failure of this test.
				5473	//
				5474	void RegexTest::Bug9283() {
				5475	#if !UCONFIG_NO_NORMALIZATION
				5476	UErrorCode status = U_ZERO_ERROR;
				5477	UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
				5478	REGEX_CHECK_STATUS;
				5479	int32_t index;
				5480	UChar32 c;
				5481	for (index=0; ; index++) {
				5482	c = supplementalsWithCaseFolding.charAt(index);
				5483	if (c == -1) {
				5484	break;
				5485	}
				5486	UnicodeString cf = UnicodeString(c).foldCase();
				5487	REGEX_ASSERT(cf.length() >= 2);
				5488	}
				5489	#endif /* #if !UCONFIG_NO_NORMALIZATION */
				5490	}
				5491
				5492
				5493	void RegexTest::CheckInvBufSize() {
				5494	if(inv_next>=INV_BUFSIZ) {
				5495	errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
				5496	__FILE__, INV_BUFSIZ, inv_next);
				5497	} else {
				5498	logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
				5499	}
				5500	}
				5501
				5502
				5503	void RegexTest::Bug10459() {
				5504	UErrorCode status = U_ZERO_ERROR;
				5505	UnicodeString patternString("(txt)");
				5506	UnicodeString txtString("txt");
				5507
				5508	UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
				5509	REGEX_CHECK_STATUS;
				5510	UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
				5511	REGEX_CHECK_STATUS;
				5512
				5513	URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
				5514	REGEX_CHECK_STATUS;
				5515
				5516	uregex_setUText(icu_re, utext_txt, &status);
				5517	REGEX_CHECK_STATUS;
				5518
				5519	// The bug was that calling uregex_group() before doing a matching operation
				5520	// was causing a segfault. Only for Regular Expressions created from UText.
				5521	// It should set an U_REGEX_INVALID_STATE.
				5522
				5523	UChar buf[100];
				5524	int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
				5525	REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
				5526	REGEX_ASSERT(len == 0);
				5527
				5528	uregex_close(icu_re);
				5529	utext_close(utext_pat);
				5530	utext_close(utext_txt);
				5531	}
				5532
				5533	void RegexTest::TestCaseInsensitiveStarters() {
				5534	// Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
				5535	// become stale because of new Unicode characters.
				5536	// If it is stale, rerun the generation tool
				5537	// https://github.com/unicode-org/icu/tree/main/tools/unicode/c/genregexcasing
				5538	// and replace the embedded data in i18n/regexcmp.cpp
				5539
				5540	for (UChar32 cp=0; cp<=0x10ffff; cp++) {
				5541	if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
				5542	continue;
				5543	}
				5544	UnicodeSet s(cp, cp);
				5545	s.closeOver(USET_CASE_INSENSITIVE);
				5546	UnicodeSetIterator setIter(s);
				5547	while (setIter.next()) {
				5548	if (!setIter.isString()) {
				5549	continue;
				5550	}
				5551	const UnicodeString &str = setIter.getString();
				5552	UChar32 firstChar = str.char32At(0);
				5553	UnicodeSet starters;
				5554	RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
				5555	if (!starters.contains(cp)) {
				5556	errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
				5557	return;
				5558	}
				5559	}
				5560	}
				5561	}
				5562
				5563
				5564	void RegexTest::TestBug11049() {
				5565	// Original bug report: pattern with match start consisting of one of several individual characters,
				5566	// and the text being matched ending with a supplementary character. find() would read past the
				5567	// end of the input text when searching for potential match starting points.
				5568
				5569	// To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
				5570	// detect the bad read.
				5571
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	5572	TestCase11049("A\|B\|C", "a string \\ud800\\udc00", false, __LINE__);
				5573	TestCase11049("A\|B\|C", "string matches at end C", true, __LINE__);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	5574
				5575	// Test again with a pattern starting with a single character,
				5576	// which takes a different code path than starting with an OR expression,
				5577	// but with similar logic.
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	5578	TestCase11049("C", "a string \\ud800\\udc00", false, __LINE__);
				5579	TestCase11049("C", "string matches at end C", true, __LINE__);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	5580	}
				5581
				5582	// Run a single test case from TestBug11049(). Internal function.
				5583	void RegexTest::TestCase11049(const char pattern, const char data, UBool expectMatch, int32_t lineNumber) {
				5584	UErrorCode status = U_ZERO_ERROR;
				5585	UnicodeString patternString = UnicodeString(pattern).unescape();
				5586	LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
				5587
				5588	UnicodeString dataString = UnicodeString(data).unescape();
				5589	UChar *exactBuffer = new UChar[dataString.length()];
				5590	dataString.extract(exactBuffer, dataString.length(), status);
				5591	UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
				5592
				5593	LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
				5594	REGEX_CHECK_STATUS;
				5595	matcher->reset(ut);
				5596	UBool result = matcher->find();
				5597	if (result != expectMatch) {
				5598	errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
				5599	__FILE__, lineNumber, expectMatch, result, pattern, data);
				5600	}
				5601
				5602	// Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
				5603	// off-by-one on find() with match at the last code point.
				5604	// Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
				5605	// because string.unescape() will only shrink it.
				5606	char * utf8Buffer = new char[uprv_strlen(data)+1];
				5607	u_strToUTF8(utf8Buffer, static_cast<int32_t>(uprv_strlen(data)+1), NULL, dataString.getBuffer(), dataString.length(), &status);
				5608	REGEX_CHECK_STATUS;
				5609	ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
				5610	REGEX_CHECK_STATUS;
				5611	matcher->reset(ut);
				5612	result = matcher->find();
				5613	if (result != expectMatch) {
				5614	errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
				5615	__FILE__, lineNumber, expectMatch, result, pattern, data);
				5616	}
				5617	delete [] utf8Buffer;
				5618
				5619	utext_close(ut);
				5620	delete [] exactBuffer;
				5621	}
				5622
				5623
				5624	void RegexTest::TestBug11371() {
				5625	if (quick) {
				5626	logln("Skipping test. Runs in exhuastive mode only.");
				5627	return;
				5628	}
				5629	UErrorCode status = U_ZERO_ERROR;
				5630	UnicodeString patternString;
				5631
				5632	for (int i=0; i<8000000; i++) {
				5633	patternString.append(UnicodeString("()"));
				5634	}
				5635	LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
				5636	if (status != U_REGEX_PATTERN_TOO_BIG) {
				5637	errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
				5638	__FILE__, __LINE__, u_errorName(status));
				5639	}
				5640
				5641	status = U_ZERO_ERROR;
				5642	patternString = "(";
				5643	for (int i=0; i<20000000; i++) {
				5644	patternString.append(UnicodeString("A++"));
				5645	}
				5646	patternString.append(UnicodeString("){0}B++"));
				5647	LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
				5648	if (status != U_REGEX_PATTERN_TOO_BIG) {
				5649	errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
				5650	__FILE__, __LINE__, u_errorName(status));
				5651	}
				5652
				5653	// Pattern with too much string data, such that string indexes overflow operand data field size
				5654	// in compiled instruction.
				5655	status = U_ZERO_ERROR;
				5656	patternString = "";
				5657	while (patternString.length() < 0x00ffffff) {
				5658	patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
				5659	}
				5660	patternString.append(UnicodeString("X? trailing string"));
				5661	LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
				5662	if (status != U_REGEX_PATTERN_TOO_BIG) {
				5663	errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
				5664	__FILE__, __LINE__, u_errorName(status));
				5665	}
				5666	}
				5667
				5668	void RegexTest::TestBug11480() {
				5669	// C API, get capture group of a group that does not participate in the match.
				5670	// (Returns a zero length string, with nul termination,
				5671	// indistinguishable from a group with a zero length match.)
				5672
				5673	UErrorCode status = U_ZERO_ERROR;
				5674	URegularExpression *re = uregex_openC("(A)\|(B)", 0, NULL, &status);
				5675	REGEX_CHECK_STATUS;
				5676	UnicodeString text = UNICODE_STRING_SIMPLE("A");
				5677	uregex_setText(re, text.getBuffer(), text.length(), &status);
				5678	REGEX_CHECK_STATUS;
				5679	REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
				5680	UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
				5681	int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
				5682	REGEX_ASSERT(length == 0);
				5683	REGEX_ASSERT(buf[0] == 13);
				5684	REGEX_ASSERT(buf[1] == 0);
				5685	REGEX_ASSERT(buf[2] == 13);
				5686	uregex_close(re);
				5687
				5688	// UText C++ API, length of match is 0 for non-participating matches.
				5689	UText ut = UTEXT_INITIALIZER;
				5690	utext_openUnicodeString(&ut, &text, &status);
				5691	RegexMatcher matcher(UnicodeString("(A)\|(B)"), 0, status);
				5692	REGEX_CHECK_STATUS;
				5693	matcher.reset(&ut);
				5694	REGEX_ASSERT(matcher.lookingAt(0, status));
				5695
				5696	// UText C++ API, Capture group 1 matches "A", position 0, length 1.
				5697	int64_t groupLen = -666;
				5698	UText group = UTEXT_INITIALIZER;
				5699	matcher.group(1, &group, groupLen, status);
				5700	REGEX_CHECK_STATUS;
				5701	REGEX_ASSERT(groupLen == 1);
				5702	REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
				5703
				5704	// Capture group 2, the (B), does not participate in the match.
				5705	matcher.group(2, &group, groupLen, status);
				5706	REGEX_CHECK_STATUS;
				5707	REGEX_ASSERT(groupLen == 0);
				5708	REGEX_ASSERT(matcher.start(2, status) == -1);
				5709	REGEX_CHECK_STATUS;
				5710	}
				5711
				5712	void RegexTest::TestBug12884() {
				5713	// setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
				5714	UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
				5715	UnicodeString text(u"hello");
				5716	UErrorCode status = U_ZERO_ERROR;
				5717	RegexMatcher m(pattern, text, 0, status);
				5718	REGEX_CHECK_STATUS;
				5719	m.setTimeLimit(5, status);
				5720	m.find(status);
				5721	REGEX_ASSERT(status == U_REGEX_TIME_OUT);
				5722
				5723	// Non-greedy loops. They take a different code path during matching.
				5724	UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
				5725	status = U_ZERO_ERROR;
				5726	RegexMatcher ngM(ngPattern, text, 0, status);
				5727	REGEX_CHECK_STATUS;
				5728	ngM.setTimeLimit(5, status);
				5729	ngM.find(status);
				5730	REGEX_ASSERT(status == U_REGEX_TIME_OUT);
				5731
				5732	// UText, wrapping non-UTF-16 text, also takes a different execution path.
				5733	StringPiece text8(u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
				5734	"carácter, sin importar la plataforma, sin importar el programa,"
				5735	"sin importar el idioma.");
				5736	status = U_ZERO_ERROR;
				5737	LocalUTextPointer ut(utext_openUTF8(NULL, text8.data(), text8.length(), &status));
				5738	REGEX_CHECK_STATUS;
				5739	m.reset(ut.getAlias());
				5740	m.find(status);
				5741	REGEX_ASSERT(status == U_REGEX_TIME_OUT);
				5742
				5743	status = U_ZERO_ERROR;
				5744	ngM.reset(ut.getAlias());
				5745	ngM.find(status);
				5746	REGEX_ASSERT(status == U_REGEX_TIME_OUT);
				5747	}
				5748
				5749	// Bug 13631. A find() of a pattern with a zero length look-behind assertions
				5750	// can cause a read past the end of the input text.
				5751	// The failure is seen when running this test with Clang's Address Sanitizer.
				5752
				5753	void RegexTest::TestBug13631() {
				5754	const UChar *pats[] = { u"(?<!^)",
				5755	u"(?<=^)",
				5756	nullptr
				5757	};
				5758	for (const UChar *pat=pats; pat; ++pat) {
				5759	UErrorCode status = U_ZERO_ERROR;
				5760	UnicodeString upat(*pat);
				5761	RegexMatcher matcher(upat, 0, status);
				5762	const UChar s =u'a';
				5763	UText *ut = utext_openUChars(nullptr, &s, 1, &status);
				5764	REGEX_CHECK_STATUS;
				5765	matcher.reset(ut);
				5766	while (matcher.find()) {
				5767	}
				5768	utext_close(ut);
				5769	}
				5770	}
				5771
				5772	// Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
				5773	// where a following group specification would be expected.
				5774	// Failure shows when running the test under Clang's Address Sanitizer.
				5775
				5776	void RegexTest::TestBug13632() {
				5777	UErrorCode status = U_ZERO_ERROR;
				5778	URegularExpression *re = uregex_openC(" ", 0, nullptr, &status);
				5779	const char16_t *sourceString = u"Hello, world.";
				5780	uregex_setText(re, sourceString, u_strlen(sourceString), &status);
				5781
				5782	const int32_t destCap = 20;
				5783	char16_t dest[destCap] = {};
				5784	const char16_t replacement[] = {u'x', u'$'}; // Not nul terminated string.
				5785	uregex_replaceAll(re, replacement, 2, dest, destCap, &status);
				5786
				5787	assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
				5788	uregex_close(re);
				5789	}
				5790
				5791	void RegexTest::TestBug20359() {
				5792	// The bug was stack overflow while parsing a pattern with a huge number of adjacent \Q\E
				5793	// pairs. (Enter and exit pattern literal quote mode). Logic was correct.
				5794	// Changed implementation to loop instead of recursing.
				5795
				5796	UnicodeString pattern;
				5797	for (int i=0; i<50000; ++i) {
				5798	pattern += u"\\Q\\E";
				5799	}
				5800	pattern += u"x";
				5801
				5802	UErrorCode status = U_ZERO_ERROR;
				5803	LocalURegularExpressionPointer re(uregex_open(pattern.getBuffer(), pattern.length(),
				5804	0, nullptr, &status));
				5805	assertSuccess(WHERE, status);
				5806
				5807	// We have passed the point where the bug crashed. The following is a small sanity
				5808	// check that the pattern works, that all the \Q\E\Q\E... didn't cause other problems.
				5809
				5810	uregex_setText(re.getAlias(), u"abcxyz", -1, &status);
				5811	assertSuccess(WHERE, status);
				5812	assertTrue(WHERE, uregex_find(re.getAlias(), 0, &status));
				5813	assertEquals(WHERE, 3, uregex_start(re.getAlias(), 0, &status));
				5814	assertSuccess(WHERE, status);
				5815	}
				5816
				5817
				5818	void RegexTest::TestBug20863() {
				5819	// Test that patterns with a large number of named capture groups work correctly.
				5820	//
				5821	// The ticket was not for a bug per se, but to reduce memory usage by using lazy
				5822	// construction of the map from capture names to numbers, and decreasing the
				5823	// default size of the map.
				5824
				5825	constexpr int GROUP_COUNT = 2000;
				5826	std::vector<UnicodeString> groupNames;
				5827	for (int32_t i=0; i<GROUP_COUNT; ++i) {
				5828	UnicodeString name;
				5829	name.append(u"name");
				5830	name.append(Int64ToUnicodeString(i));
				5831	groupNames.push_back(name);
				5832	}
				5833
				5834	UnicodeString patternString;
				5835	for (UnicodeString name: groupNames) {
				5836	patternString.append(u"(?<");
				5837	patternString.append(name);
				5838	patternString.append(u">.)");
				5839	}
				5840
				5841	UErrorCode status = U_ZERO_ERROR;
				5842	UParseError pe;
				5843	LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, pe, status), status);
				5844	if (!assertSuccess(WHERE, status)) {
				5845	return;
				5846	}
				5847
				5848	for (int32_t i=0; i<GROUP_COUNT; ++i) {
				5849	int32_t group = pattern->groupNumberFromName(groupNames[i], status);
				5850	if (!assertSuccess(WHERE, status)) {
				5851	return;
				5852	}
				5853	assertEquals(WHERE, i+1, group);
				5854	// Note: group 0 is the overall match; group 1 is the first separate capture group.
				5855	}
				5856
				5857	// Verify that assignment of patterns with various combinations of named capture work.
				5858	// Lazy creation of the internal named capture map changed the implementation logic here.
				5859	{
				5860	LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
				5861	LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
				5862	assertSuccess(WHERE, status);
				5863	assertFalse(WHERE, pat1 == pat2);
				5864	pat1 = pat2;
				5865	assertTrue(WHERE, pat1 == pat2);
				5866	assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name", status));
				5867	assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name", status));
				5868	assertSuccess(WHERE, status);
				5869	}
				5870
				5871	{
				5872	LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
				5873	LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
				5874	assertSuccess(WHERE, status);
				5875	assertFalse(WHERE, pat1 == pat2);
				5876	pat2 = pat1;
				5877	assertTrue(WHERE, pat1 == pat2);
				5878	assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name", status));
				5879	assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
				5880	status = U_ZERO_ERROR;
				5881	assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name", status));
				5882	assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
				5883	status = U_ZERO_ERROR;
				5884	}
				5885
				5886	{
				5887	LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"a(?<name1>b)c", pe, status), status);
				5888	LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name2>b)c", pe, status), status);
				5889	assertSuccess(WHERE, status);
				5890	assertFalse(WHERE, pat1 == pat2);
				5891	pat2 = pat1;
				5892	assertTrue(WHERE, pat1 == pat2);
				5893	assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name1", status));
				5894	assertSuccess(WHERE, status);
				5895	assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name1", status));
				5896	assertSuccess(WHERE, status);
				5897	assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name2", status));
				5898	assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
				5899	status = U_ZERO_ERROR;
				5900	assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name2", status));
				5901	assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
				5902	status = U_ZERO_ERROR;
				5903	}
				5904
				5905	}
				5906
				5907
				5908	#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */