Blame - source/common/ustring.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 5804976ef97333ca79632652ef2df34d5ce8813e [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	3	/*
				4	******************************************************************************
				5	*
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	6	* Copyright (C) 1998-2016, International Business Machines
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	7	* Corporation and others. All Rights Reserved.
				8	*
				9	******************************************************************************
				10	*
				11	* File ustring.cpp
				12	*
				13	* Modification History:
				14	*
				15	* Date Name Description
				16	* 12/07/98 bertrand Creation.
				17	******************************************************************************
				18	*/
				19
				20	#include "unicode/utypes.h"
				21	#include "unicode/putil.h"
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	22	#include "unicode/uchar.h"
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	23	#include "unicode/ustring.h"
				24	#include "unicode/utf16.h"
				25	#include "cstring.h"
				26	#include "cwchar.h"
				27	#include "cmemory.h"
				28	#include "ustr_imp.h"
				29
				30	/* ANSI string.h - style functions ------------------------------------------ */
				31
				32	/* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */
				33	#define U_BMP_MAX 0xffff
				34
				35	/* Forward binary string search functions ----------------------------------- */
				36
				37	/*
				38	* Test if a substring match inside a string is at code point boundaries.
				39	* All pointers refer to the same buffer.
				40	* The limit pointer may be NULL, all others must be real pointers.
				41	*/
				42	static inline UBool
				43	isMatchAtCPBoundary(const UChar start, const UChar match, const UChar matchLimit, const UChar limit) {
				44	if(U16_IS_TRAIL(match) && start!=match && U16_IS_LEAD((match-1))) {
				45	/* the leading edge of the match is in the middle of a surrogate pair */
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	46	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	47	}
Frank Tang	f90543d	2020-10-30 19:02:04 -0700	[diff] [blame]	48	if(U16_IS_LEAD((matchLimit-1)) && matchLimit!=limit && U16_IS_TRAIL(matchLimit)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	49	/* the trailing edge of the match is in the middle of a surrogate pair */
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	50	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	51	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	52	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	53	}
				54
				55	U_CAPI UChar * U_EXPORT2
				56	u_strFindFirst(const UChar *s, int32_t length,
				57	const UChar *sub, int32_t subLength) {
				58	const UChar start, p, q, subLimit;
				59	UChar c, cs, cq;
				60
				61	if(sub==NULL \|\| subLength<-1) {
				62	return (UChar *)s;
				63	}
				64	if(s==NULL \|\| length<-1) {
				65	return NULL;
				66	}
				67
				68	start=s;
				69
				70	if(length<0 && subLength<0) {
				71	/* both strings are NUL-terminated */
				72	if((cs=*sub++)==0) {
				73	return (UChar *)s;
				74	}
				75	if(*sub==0 && !U16_IS_SURROGATE(cs)) {
				76	/* the substring consists of a single, non-surrogate BMP code point */
				77	return u_strchr(s, cs);
				78	}
				79
				80	while((c=*s++)!=0) {
				81	if(c==cs) {
				82	/* found first substring UChar, compare rest */
				83	p=s;
				84	q=sub;
				85	for(;;) {
				86	if((cq=*q)==0) {
				87	if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
				88	return (UChar )(s-1); / well-formed match */
				89	} else {
				90	break; /* no match because surrogate pair is split */
				91	}
				92	}
				93	if((c=*p)==0) {
				94	return NULL; /* no match, and none possible after s */
				95	}
				96	if(c!=cq) {
				97	break; /* no match */
				98	}
				99	++p;
				100	++q;
				101	}
				102	}
				103	}
				104
				105	/* not found */
				106	return NULL;
				107	}
				108
				109	if(subLength<0) {
				110	subLength=u_strlen(sub);
				111	}
				112	if(subLength==0) {
				113	return (UChar *)s;
				114	}
				115
				116	/* get sub[0] to search for it fast */
				117	cs=*sub++;
				118	--subLength;
				119	subLimit=sub+subLength;
				120
				121	if(subLength==0 && !U16_IS_SURROGATE(cs)) {
				122	/* the substring consists of a single, non-surrogate BMP code point */
				123	return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length);
				124	}
				125
				126	if(length<0) {
				127	/* s is NUL-terminated */
				128	while((c=*s++)!=0) {
				129	if(c==cs) {
				130	/* found first substring UChar, compare rest */
				131	p=s;
				132	q=sub;
				133	for(;;) {
				134	if(q==subLimit) {
				135	if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
				136	return (UChar )(s-1); / well-formed match */
				137	} else {
				138	break; /* no match because surrogate pair is split */
				139	}
				140	}
				141	if((c=*p)==0) {
				142	return NULL; /* no match, and none possible after s */
				143	}
				144	if(c!=*q) {
				145	break; /* no match */
				146	}
				147	++p;
				148	++q;
				149	}
				150	}
				151	}
				152	} else {
				153	const UChar limit, preLimit;
				154
				155	/* subLength was decremented above */
				156	if(length<=subLength) {
				157	return NULL; /* s is shorter than sub */
				158	}
				159
				160	limit=s+length;
				161
				162	/* the substring must start before preLimit */
				163	preLimit=limit-subLength;
				164
				165	while(s!=preLimit) {
				166	c=*s++;
				167	if(c==cs) {
				168	/* found first substring UChar, compare rest */
				169	p=s;
				170	q=sub;
				171	for(;;) {
				172	if(q==subLimit) {
				173	if(isMatchAtCPBoundary(start, s-1, p, limit)) {
				174	return (UChar )(s-1); / well-formed match */
				175	} else {
				176	break; /* no match because surrogate pair is split */
				177	}
				178	}
				179	if(p!=q) {
				180	break; /* no match */
				181	}
				182	++p;
				183	++q;
				184	}
				185	}
				186	}
				187	}
				188
				189	/* not found */
				190	return NULL;
				191	}
				192
				193	U_CAPI UChar * U_EXPORT2
				194	u_strstr(const UChar s, const UChar substring) {
				195	return u_strFindFirst(s, -1, substring, -1);
				196	}
				197
				198	U_CAPI UChar * U_EXPORT2
				199	u_strchr(const UChar *s, UChar c) {
				200	if(U16_IS_SURROGATE(c)) {
				201	/* make sure to not find half of a surrogate pair */
				202	return u_strFindFirst(s, -1, &c, 1);
				203	} else {
				204	UChar cs;
				205
				206	/* trivial search for a BMP code point */
				207	for(;;) {
				208	if((cs=*s)==c) {
				209	return (UChar *)s;
				210	}
				211	if(cs==0) {
				212	return NULL;
				213	}
				214	++s;
				215	}
				216	}
				217	}
				218
				219	U_CAPI UChar * U_EXPORT2
				220	u_strchr32(const UChar *s, UChar32 c) {
				221	if((uint32_t)c<=U_BMP_MAX) {
				222	/* find BMP code point */
				223	return u_strchr(s, (UChar)c);
				224	} else if((uint32_t)c<=UCHAR_MAX_VALUE) {
				225	/* find supplementary code point as surrogate pair */
				226	UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
				227
				228	while((cs=*s++)!=0) {
				229	if(cs==lead && *s==trail) {
				230	return (UChar *)(s-1);
				231	}
				232	}
				233	return NULL;
				234	} else {
				235	/* not a Unicode code point, not findable */
				236	return NULL;
				237	}
				238	}
				239
				240	U_CAPI UChar * U_EXPORT2
				241	u_memchr(const UChar *s, UChar c, int32_t count) {
				242	if(count<=0) {
				243	return NULL; /* no string */
				244	} else if(U16_IS_SURROGATE(c)) {
				245	/* make sure to not find half of a surrogate pair */
				246	return u_strFindFirst(s, count, &c, 1);
				247	} else {
				248	/* trivial search for a BMP code point */
				249	const UChar *limit=s+count;
				250	do {
				251	if(*s==c) {
				252	return (UChar *)s;
				253	}
				254	} while(++s!=limit);
				255	return NULL;
				256	}
				257	}
				258
				259	U_CAPI UChar * U_EXPORT2
				260	u_memchr32(const UChar *s, UChar32 c, int32_t count) {
				261	if((uint32_t)c<=U_BMP_MAX) {
				262	/* find BMP code point */
				263	return u_memchr(s, (UChar)c, count);
				264	} else if(count<2) {
				265	/* too short for a surrogate pair */
				266	return NULL;
				267	} else if((uint32_t)c<=UCHAR_MAX_VALUE) {
				268	/* find supplementary code point as surrogate pair */
				269	const UChar limit=s+count-1; / -1 so that we do not need a separate check for the trail unit */
				270	UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
				271
				272	do {
				273	if(s==lead && (s+1)==trail) {
				274	return (UChar *)s;
				275	}
				276	} while(++s!=limit);
				277	return NULL;
				278	} else {
				279	/* not a Unicode code point, not findable */
				280	return NULL;
				281	}
				282	}
				283
				284	/* Backward binary string search functions ---------------------------------- */
				285
				286	U_CAPI UChar * U_EXPORT2
				287	u_strFindLast(const UChar *s, int32_t length,
				288	const UChar *sub, int32_t subLength) {
				289	const UChar start, limit, p, q, *subLimit;
				290	UChar c, cs;
				291
				292	if(sub==NULL \|\| subLength<-1) {
				293	return (UChar *)s;
				294	}
				295	if(s==NULL \|\| length<-1) {
				296	return NULL;
				297	}
				298
				299	/*
				300	* This implementation is more lazy than the one for u_strFindFirst():
				301	* There is no special search code for NUL-terminated strings.
				302	* It does not seem to be worth it for searching substrings to
				303	* search forward and find all matches like in u_strrchr() and similar.
				304	* Therefore, we simply get both string lengths and search backward.
				305	*
				306	* markus 2002oct23
				307	*/
				308
				309	if(subLength<0) {
				310	subLength=u_strlen(sub);
				311	}
				312	if(subLength==0) {
				313	return (UChar *)s;
				314	}
				315
				316	/* get sub[subLength-1] to search for it fast */
				317	subLimit=sub+subLength;
				318	cs=*(--subLimit);
				319	--subLength;
				320
				321	if(subLength==0 && !U16_IS_SURROGATE(cs)) {
				322	/* the substring consists of a single, non-surrogate BMP code point */
				323	return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length);
				324	}
				325
				326	if(length<0) {
				327	length=u_strlen(s);
				328	}
				329
				330	/* subLength was decremented above */
				331	if(length<=subLength) {
				332	return NULL; /* s is shorter than sub */
				333	}
				334
				335	start=s;
				336	limit=s+length;
				337
				338	/* the substring must start no later than s+subLength */
				339	s+=subLength;
				340
				341	while(s!=limit) {
				342	c=*(--limit);
				343	if(c==cs) {
				344	/* found last substring UChar, compare rest */
				345	p=limit;
				346	q=subLimit;
				347	for(;;) {
				348	if(q==sub) {
				349	if(isMatchAtCPBoundary(start, p, limit+1, start+length)) {
				350	return (UChar )p; / well-formed match */
				351	} else {
				352	break; /* no match because surrogate pair is split */
				353	}
				354	}
				355	if((--p)!=(--q)) {
				356	break; /* no match */
				357	}
				358	}
				359	}
				360	}
				361
				362	/* not found */
				363	return NULL;
				364	}
				365
				366	U_CAPI UChar * U_EXPORT2
				367	u_strrstr(const UChar s, const UChar substring) {
				368	return u_strFindLast(s, -1, substring, -1);
				369	}
				370
				371	U_CAPI UChar * U_EXPORT2
				372	u_strrchr(const UChar *s, UChar c) {
				373	if(U16_IS_SURROGATE(c)) {
				374	/* make sure to not find half of a surrogate pair */
				375	return u_strFindLast(s, -1, &c, 1);
				376	} else {
				377	const UChar *result=NULL;
				378	UChar cs;
				379
				380	/* trivial search for a BMP code point */
				381	for(;;) {
				382	if((cs=*s)==c) {
				383	result=s;
				384	}
				385	if(cs==0) {
				386	return (UChar *)result;
				387	}
				388	++s;
				389	}
				390	}
				391	}
				392
				393	U_CAPI UChar * U_EXPORT2
				394	u_strrchr32(const UChar *s, UChar32 c) {
				395	if((uint32_t)c<=U_BMP_MAX) {
				396	/* find BMP code point */
				397	return u_strrchr(s, (UChar)c);
				398	} else if((uint32_t)c<=UCHAR_MAX_VALUE) {
				399	/* find supplementary code point as surrogate pair */
				400	const UChar *result=NULL;
				401	UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
				402
				403	while((cs=*s++)!=0) {
				404	if(cs==lead && *s==trail) {
				405	result=s-1;
				406	}
				407	}
				408	return (UChar *)result;
				409	} else {
				410	/* not a Unicode code point, not findable */
				411	return NULL;
				412	}
				413	}
				414
				415	U_CAPI UChar * U_EXPORT2
				416	u_memrchr(const UChar *s, UChar c, int32_t count) {
				417	if(count<=0) {
				418	return NULL; /* no string */
				419	} else if(U16_IS_SURROGATE(c)) {
				420	/* make sure to not find half of a surrogate pair */
				421	return u_strFindLast(s, count, &c, 1);
				422	} else {
				423	/* trivial search for a BMP code point */
				424	const UChar *limit=s+count;
				425	do {
				426	if(*(--limit)==c) {
				427	return (UChar *)limit;
				428	}
				429	} while(s!=limit);
				430	return NULL;
				431	}
				432	}
				433
				434	U_CAPI UChar * U_EXPORT2
				435	u_memrchr32(const UChar *s, UChar32 c, int32_t count) {
				436	if((uint32_t)c<=U_BMP_MAX) {
				437	/* find BMP code point */
				438	return u_memrchr(s, (UChar)c, count);
				439	} else if(count<2) {
				440	/* too short for a surrogate pair */
				441	return NULL;
				442	} else if((uint32_t)c<=UCHAR_MAX_VALUE) {
				443	/* find supplementary code point as surrogate pair */
				444	const UChar *limit=s+count-1;
				445	UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
				446
				447	do {
				448	if(limit==trail && (limit-1)==lead) {
				449	return (UChar *)(limit-1);
				450	}
				451	} while(s!=--limit);
				452	return NULL;
				453	} else {
				454	/* not a Unicode code point, not findable */
				455	return NULL;
				456	}
				457	}
				458
				459	/* Tokenization functions --------------------------------------------------- */
				460
				461	/*
				462	* Match each code point in a string against each code point in the matchSet.
				463	* Return the index of the first string code point that
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	464	* is (polarity==true) or is not (false) contained in the matchSet.
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	465	* Return -(string length)-1 if there is no such code point.
				466	*/
				467	static int32_t
				468	_matchFromSet(const UChar string, const UChar matchSet, UBool polarity) {
				469	int32_t matchLen, matchBMPLen, strItr, matchItr;
				470	UChar32 stringCh, matchCh;
				471	UChar c, c2;
				472
				473	/* first part of matchSet contains only BMP code points */
				474	matchBMPLen = 0;
				475	while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
				476	++matchBMPLen;
				477	}
				478
				479	/* second part of matchSet contains BMP and supplementary code points */
				480	matchLen = matchBMPLen;
				481	while(matchSet[matchLen] != 0) {
				482	++matchLen;
				483	}
				484
				485	for(strItr = 0; (c = string[strItr]) != 0;) {
				486	++strItr;
				487	if(U16_IS_SINGLE(c)) {
				488	if(polarity) {
				489	for(matchItr = 0; matchItr < matchLen; ++matchItr) {
				490	if(c == matchSet[matchItr]) {
				491	return strItr - 1; /* one matches */
				492	}
				493	}
				494	} else {
				495	for(matchItr = 0; matchItr < matchLen; ++matchItr) {
				496	if(c == matchSet[matchItr]) {
				497	goto endloop;
				498	}
				499	}
				500	return strItr - 1; /* none matches */
				501	}
				502	} else {
				503	/*
				504	* No need to check for string length before U16_IS_TRAIL
				505	* because c2 could at worst be the terminating NUL.
				506	*/
				507	if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) {
				508	++strItr;
				509	stringCh = U16_GET_SUPPLEMENTARY(c, c2);
				510	} else {
				511	stringCh = c; /* unpaired trail surrogate */
				512	}
				513
				514	if(polarity) {
				515	for(matchItr = matchBMPLen; matchItr < matchLen;) {
				516	U16_NEXT(matchSet, matchItr, matchLen, matchCh);
				517	if(stringCh == matchCh) {
				518	return strItr - U16_LENGTH(stringCh); /* one matches */
				519	}
				520	}
				521	} else {
				522	for(matchItr = matchBMPLen; matchItr < matchLen;) {
				523	U16_NEXT(matchSet, matchItr, matchLen, matchCh);
				524	if(stringCh == matchCh) {
				525	goto endloop;
				526	}
				527	}
				528	return strItr - U16_LENGTH(stringCh); /* none matches */
				529	}
				530	}
				531	endloop:
				532	/* wish C had continue with labels like Java... */;
				533	}
				534
				535	/* Didn't find it. */
				536	return -strItr-1;
				537	}
				538
				539	/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
				540	U_CAPI UChar * U_EXPORT2
				541	u_strpbrk(const UChar string, const UChar matchSet)
				542	{
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	543	int32_t idx = _matchFromSet(string, matchSet, true);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	544	if(idx >= 0) {
				545	return (UChar *)string + idx;
				546	} else {
				547	return NULL;
				548	}
				549	}
				550
				551	/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
				552	U_CAPI int32_t U_EXPORT2
				553	u_strcspn(const UChar string, const UChar matchSet)
				554	{
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	555	int32_t idx = _matchFromSet(string, matchSet, true);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	556	if(idx >= 0) {
				557	return idx;
				558	} else {
				559	return -idx - 1; /* == u_strlen(string) */
				560	}
				561	}
				562
				563	/* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
				564	U_CAPI int32_t U_EXPORT2
				565	u_strspn(const UChar string, const UChar matchSet)
				566	{
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	567	int32_t idx = _matchFromSet(string, matchSet, false);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	568	if(idx >= 0) {
				569	return idx;
				570	} else {
				571	return -idx - 1; /* == u_strlen(string) */
				572	}
				573	}
				574
				575	/* ----- Text manipulation functions --- */
				576
				577	U_CAPI UChar* U_EXPORT2
				578	u_strtok_r(UChar *src,
				579	const UChar *delim,
				580	UChar **saveState)
				581	{
				582	UChar *tokSource;
				583	UChar *nextToken;
				584	uint32_t nonDelimIdx;
				585
				586	/* If saveState is NULL, the user messed up. */
				587	if (src != NULL) {
				588	tokSource = src;
				589	saveState = src; / Set to "src" in case there are no delimiters */
				590	}
				591	else if (*saveState) {
				592	tokSource = *saveState;
				593	}
				594	else {
				595	/* src == NULL && saveState == NULL /
				596	/* This shouldn't happen. We already finished tokenizing. */
				597	return NULL;
				598	}
				599
				600	/* Skip initial delimiters */
				601	nonDelimIdx = u_strspn(tokSource, delim);
				602	tokSource = &tokSource[nonDelimIdx];
				603
				604	if (*tokSource) {
				605	nextToken = u_strpbrk(tokSource, delim);
				606	if (nextToken != NULL) {
				607	/* Create a token */
				608	*(nextToken++) = 0;
				609	*saveState = nextToken;
				610	return tokSource;
				611	}
				612	else if (*saveState) {
				613	/* Return the last token */
				614	*saveState = NULL;
				615	return tokSource;
				616	}
				617	}
				618	else {
				619	/* No tokens were found. Only delimiters were left. */
				620	*saveState = NULL;
				621	}
				622	return NULL;
				623	}
				624
				625	/* Miscellaneous functions -------------------------------------------------- */
				626
				627	U_CAPI UChar* U_EXPORT2
				628	u_strcat(UChar *dst,
				629	const UChar *src)
				630	{
				631	UChar anchor = dst; / save a pointer to start of dst */
				632
				633	while(dst != 0) { / To end of first string */
				634	++dst;
				635	}
				636	while(((dst++) = (src++)) != 0) { /* copy string 2 over */
				637	}
				638
				639	return anchor;
				640	}
				641
				642	U_CAPI UChar* U_EXPORT2
				643	u_strncat(UChar *dst,
				644	const UChar *src,
				645	int32_t n )
				646	{
				647	if(n > 0) {
				648	UChar anchor = dst; / save a pointer to start of dst */
				649
				650	while(dst != 0) { / To end of first string */
				651	++dst;
				652	}
				653	while((dst = src) != 0) { /* copy string 2 over */
				654	++dst;
				655	if(--n == 0) {
				656	*dst = 0;
				657	break;
				658	}
				659	++src;
				660	}
				661
				662	return anchor;
				663	} else {
				664	return dst;
				665	}
				666	}
				667
				668	/* ----- Text property functions --- */
				669
				670	U_CAPI int32_t U_EXPORT2
				671	u_strcmp(const UChar *s1,
				672	const UChar *s2)
				673	{
				674	UChar c1, c2;
				675
				676	for(;;) {
				677	c1=*s1++;
				678	c2=*s2++;
				679	if (c1 != c2 \|\| c1 == 0) {
				680	break;
				681	}
				682	}
				683	return (int32_t)c1 - (int32_t)c2;
				684	}
				685
				686	U_CFUNC int32_t U_EXPORT2
				687	uprv_strCompare(const UChar *s1, int32_t length1,
				688	const UChar *s2, int32_t length2,
				689	UBool strncmpStyle, UBool codePointOrder) {
				690	const UChar start1, start2, limit1, limit2;
				691	UChar c1, c2;
				692
				693	/* setup for fix-up */
				694	start1=s1;
				695	start2=s2;
				696
				697	/* compare identical prefixes - they do not need to be fixed up */
				698	if(length1<0 && length2<0) {
				699	/* strcmp style, both NUL-terminated */
				700	if(s1==s2) {
				701	return 0;
				702	}
				703
				704	for(;;) {
				705	c1=*s1;
				706	c2=*s2;
				707	if(c1!=c2) {
				708	break;
				709	}
				710	if(c1==0) {
				711	return 0;
				712	}
				713	++s1;
				714	++s2;
				715	}
				716
				717	/* setup for fix-up */
				718	limit1=limit2=NULL;
				719	} else if(strncmpStyle) {
				720	/* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
				721	if(s1==s2) {
				722	return 0;
				723	}
				724
				725	limit1=start1+length1;
				726
				727	for(;;) {
				728	/* both lengths are same, check only one limit */
				729	if(s1==limit1) {
				730	return 0;
				731	}
				732
				733	c1=*s1;
				734	c2=*s2;
				735	if(c1!=c2) {
				736	break;
				737	}
				738	if(c1==0) {
				739	return 0;
				740	}
				741	++s1;
				742	++s2;
				743	}
				744
				745	/* setup for fix-up */
				746	limit2=start2+length1; /* use length1 here, too, to enforce assumption */
				747	} else {
				748	/* memcmp/UnicodeString style, both length-specified */
				749	int32_t lengthResult;
				750
				751	if(length1<0) {
				752	length1=u_strlen(s1);
				753	}
				754	if(length2<0) {
				755	length2=u_strlen(s2);
				756	}
				757
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	758	/* limit1=start1+min(length1, length2) */
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	759	if(length1<length2) {
				760	lengthResult=-1;
				761	limit1=start1+length1;
				762	} else if(length1==length2) {
				763	lengthResult=0;
				764	limit1=start1+length1;
				765	} else /* length1>length2 */ {
				766	lengthResult=1;
				767	limit1=start1+length2;
				768	}
				769
				770	if(s1==s2) {
				771	return lengthResult;
				772	}
				773
				774	for(;;) {
				775	/* check pseudo-limit */
				776	if(s1==limit1) {
				777	return lengthResult;
				778	}
				779
				780	c1=*s1;
				781	c2=*s2;
				782	if(c1!=c2) {
				783	break;
				784	}
				785	++s1;
				786	++s2;
				787	}
				788
				789	/* setup for fix-up */
				790	limit1=start1+length1;
				791	limit2=start2+length2;
				792	}
				793
				794	/* if both values are in or above the surrogate range, fix them up */
				795	if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
				796	/* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
				797	if(
				798	(c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) \|\|
				799	(U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1)))
				800	) {
				801	/* part of a surrogate pair, leave >=d800 */
				802	} else {
				803	/* BMP code point - may be surrogate code point - make <d800 */
				804	c1-=0x2800;
				805	}
				806
				807	if(
				808	(c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) \|\|
				809	(U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1)))
				810	) {
				811	/* part of a surrogate pair, leave >=d800 */
				812	} else {
				813	/* BMP code point - may be surrogate code point - make <d800 */
				814	c2-=0x2800;
				815	}
				816	}
				817
				818	/* now c1 and c2 are in the requested (code unit or code point) order */
				819	return (int32_t)c1-(int32_t)c2;
				820	}
				821
				822	/*
				823	* Compare two strings as presented by UCharIterators.
				824	* Use code unit or code point order.
				825	* When the function returns, it is undefined where the iterators
				826	* have stopped.
				827	*/
				828	U_CAPI int32_t U_EXPORT2
				829	u_strCompareIter(UCharIterator iter1, UCharIterator iter2, UBool codePointOrder) {
				830	UChar32 c1, c2;
				831
				832	/* argument checking */
				833	if(iter1==NULL \|\| iter2==NULL) {
				834	return 0; /* bad arguments */
				835	}
				836	if(iter1==iter2) {
				837	return 0; /* identical iterators */
				838	}
				839
				840	/* reset iterators to start? */
				841	iter1->move(iter1, 0, UITER_START);
				842	iter2->move(iter2, 0, UITER_START);
				843
				844	/* compare identical prefixes - they do not need to be fixed up */
				845	for(;;) {
				846	c1=iter1->next(iter1);
				847	c2=iter2->next(iter2);
				848	if(c1!=c2) {
				849	break;
				850	}
				851	if(c1==-1) {
				852	return 0;
				853	}
				854	}
				855
				856	/* if both values are in or above the surrogate range, fix them up */
				857	if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
				858	/* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
				859	if(
				860	(c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) \|\|
				861	(U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1))))
				862	) {
				863	/* part of a surrogate pair, leave >=d800 */
				864	} else {
				865	/* BMP code point - may be surrogate code point - make <d800 */
				866	c1-=0x2800;
				867	}
				868
				869	if(
				870	(c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) \|\|
				871	(U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2))))
				872	) {
				873	/* part of a surrogate pair, leave >=d800 */
				874	} else {
				875	/* BMP code point - may be surrogate code point - make <d800 */
				876	c2-=0x2800;
				877	}
				878	}
				879
				880	/* now c1 and c2 are in the requested (code unit or code point) order */
				881	return (int32_t)c1-(int32_t)c2;
				882	}
				883
				884	#if 0
				885	/*
				886	* u_strCompareIter() does not leave the iterators _on_ the different units.
				887	* This is possible but would cost a few extra indirect function calls to back
				888	* up if the last unit (c1 or c2 respectively) was >=0.
				889	*
				890	* Consistently leaving them _behind_ the different units is not an option
				891	* because the current "unit" is the end of the string if that is reached,
				892	* and in such a case the iterator does not move.
				893	* For example, when comparing "ab" with "abc", both iterators rest _on_ the end
				894	* of their strings. Calling previous() on each does not move them to where
				895	* the comparison fails.
				896	*
				897	* So the simplest semantics is to not define where the iterators end up.
				898	*
				899	* The following fragment is part of what would need to be done for backing up.
				900	*/
				901	void fragment {
				902	/* iff a surrogate is part of a surrogate pair, leave >=d800 */
				903	if(c1<=0xdbff) {
				904	if(!U16_IS_TRAIL(iter1->current(iter1))) {
				905	/* lead surrogate code point - make <d800 */
				906	c1-=0x2800;
				907	}
				908	} else if(c1<=0xdfff) {
				909	int32_t idx=iter1->getIndex(iter1, UITER_CURRENT);
				910	iter1->previous(iter1); /* ==c1 */
				911	if(!U16_IS_LEAD(iter1->previous(iter1))) {
				912	/* trail surrogate code point - make <d800 */
				913	c1-=0x2800;
				914	}
				915	/* go back to behind where the difference is */
				916	iter1->move(iter1, idx, UITER_ZERO);
				917	} else /* 0xe000<=c1<=0xffff */ {
				918	/* BMP code point - make <d800 */
				919	c1-=0x2800;
				920	}
				921	}
				922	#endif
				923
				924	U_CAPI int32_t U_EXPORT2
				925	u_strCompare(const UChar *s1, int32_t length1,
				926	const UChar *s2, int32_t length2,
				927	UBool codePointOrder) {
				928	/* argument checking */
				929	if(s1==NULL \|\| length1<-1 \|\| s2==NULL \|\| length2<-1) {
				930	return 0;
				931	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	932	return uprv_strCompare(s1, length1, s2, length2, false, codePointOrder);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	933	}
				934
				935	/* String compare in code point order - u_strcmp() compares in code unit order. */
				936	U_CAPI int32_t U_EXPORT2
				937	u_strcmpCodePointOrder(const UChar s1, const UChar s2) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	938	return uprv_strCompare(s1, -1, s2, -1, false, true);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	939	}
				940
				941	U_CAPI int32_t U_EXPORT2
				942	u_strncmp(const UChar *s1,
				943	const UChar *s2,
				944	int32_t n)
				945	{
				946	if(n > 0) {
				947	int32_t rc;
				948	for(;;) {
				949	rc = (int32_t)s1 - (int32_t)s2;
				950	if(rc != 0 \|\| *s1 == 0 \|\| --n == 0) {
				951	return rc;
				952	}
				953	++s1;
				954	++s2;
				955	}
				956	} else {
				957	return 0;
				958	}
				959	}
				960
				961	U_CAPI int32_t U_EXPORT2
				962	u_strncmpCodePointOrder(const UChar s1, const UChar s2, int32_t n) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	963	return uprv_strCompare(s1, n, s2, n, true, true);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	964	}
				965
				966	U_CAPI UChar* U_EXPORT2
				967	u_strcpy(UChar *dst,
				968	const UChar *src)
				969	{
				970	UChar anchor = dst; / save a pointer to start of dst */
				971
				972	while(((dst++) = (src++)) != 0) { /* copy string 2 over */
				973	}
				974
				975	return anchor;
				976	}
				977
				978	U_CAPI UChar* U_EXPORT2
				979	u_strncpy(UChar *dst,
				980	const UChar *src,
				981	int32_t n)
				982	{
				983	UChar anchor = dst; / save a pointer to start of dst */
				984
				985	/* copy string 2 over */
				986	while(n > 0 && ((dst++) = (src++)) != 0) {
				987	--n;
				988	}
				989
				990	return anchor;
				991	}
				992
				993	U_CAPI int32_t U_EXPORT2
				994	u_strlen(const UChar *s)
				995	{
				996	#if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	997	return (int32_t)uprv_wcslen((const wchar_t *)s);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	998	#else
				999	const UChar *t = s;
				1000	while(*t != 0) {
				1001	++t;
				1002	}
				1003	return t - s;
				1004	#endif
				1005	}
				1006
				1007	U_CAPI int32_t U_EXPORT2
				1008	u_countChar32(const UChar *s, int32_t length) {
				1009	int32_t count;
				1010
				1011	if(s==NULL \|\| length<-1) {
				1012	return 0;
				1013	}
				1014
				1015	count=0;
				1016	if(length>=0) {
				1017	while(length>0) {
				1018	++count;
				1019	if(U16_IS_LEAD(s) && length>=2 && U16_IS_TRAIL((s+1))) {
				1020	s+=2;
				1021	length-=2;
				1022	} else {
				1023	++s;
				1024	--length;
				1025	}
				1026	}
				1027	} else /* length==-1 */ {
				1028	UChar c;
				1029
				1030	for(;;) {
				1031	if((c=*s++)==0) {
				1032	break;
				1033	}
				1034	++count;
				1035
				1036	/*
				1037	* sufficient to look ahead one because of UTF-16;
				1038	* safe to look ahead one because at worst that would be the terminating NUL
				1039	*/
				1040	if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
				1041	++s;
				1042	}
				1043	}
				1044	}
				1045	return count;
				1046	}
				1047
				1048	U_CAPI UBool U_EXPORT2
				1049	u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) {
				1050
				1051	if(number<0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1052	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1053	}
				1054	if(s==NULL \|\| length<-1) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1055	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1056	}
				1057
				1058	if(length==-1) {
				1059	/* s is NUL-terminated */
				1060	UChar c;
				1061
				1062	/* count code points until they exceed */
				1063	for(;;) {
				1064	if((c=*s++)==0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1065	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1066	}
				1067	if(number==0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1068	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1069	}
				1070	if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
				1071	++s;
				1072	}
				1073	--number;
				1074	}
				1075	} else {
				1076	/* length>=0 known */
				1077	const UChar *limit;
				1078	int32_t maxSupplementary;
				1079
				1080	/* s contains at least (length+1)/2 code points: <=2 UChars per cp */
				1081	if(((length+1)/2)>number) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1082	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1083	}
				1084
				1085	/* check if s does not even contain enough UChars */
				1086	maxSupplementary=length-number;
				1087	if(maxSupplementary<=0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1088	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1089	}
				1090	/* there are maxSupplementary=length-number more UChars than asked-for code points */
				1091
				1092	/*
				1093	* count code points until they exceed and also check that there are
				1094	* no more than maxSupplementary supplementary code points (UChar pairs)
				1095	*/
				1096	limit=s+length;
				1097	for(;;) {
				1098	if(s==limit) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1099	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1100	}
				1101	if(number==0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1102	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1103	}
				1104	if(U16_IS_LEAD(s++) && s!=limit && U16_IS_TRAIL(s)) {
				1105	++s;
				1106	if(--maxSupplementary<=0) {
				1107	/* too many pairs - too few code points */
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1108	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1109	}
				1110	}
				1111	--number;
				1112	}
				1113	}
				1114	}
				1115
				1116	U_CAPI UChar * U_EXPORT2
				1117	u_memcpy(UChar dest, const UChar src, int32_t count) {
				1118	if(count > 0) {
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	1119	uprv_memcpy(dest, src, (size_t)count*U_SIZEOF_UCHAR);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1120	}
				1121	return dest;
				1122	}
				1123
				1124	U_CAPI UChar * U_EXPORT2
				1125	u_memmove(UChar dest, const UChar src, int32_t count) {
				1126	if(count > 0) {
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	1127	uprv_memmove(dest, src, (size_t)count*U_SIZEOF_UCHAR);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1128	}
				1129	return dest;
				1130	}
				1131
				1132	U_CAPI UChar * U_EXPORT2
				1133	u_memset(UChar *dest, UChar c, int32_t count) {
				1134	if(count > 0) {
				1135	UChar *ptr = dest;
				1136	UChar *limit = dest + count;
				1137
				1138	while (ptr < limit) {
				1139	*(ptr++) = c;
				1140	}
				1141	}
				1142	return dest;
				1143	}
				1144
				1145	U_CAPI int32_t U_EXPORT2
				1146	u_memcmp(const UChar buf1, const UChar buf2, int32_t count) {
				1147	if(count > 0) {
				1148	const UChar *limit = buf1 + count;
				1149	int32_t result;
				1150
				1151	while (buf1 < limit) {
				1152	result = (int32_t)(uint16_t)buf1 - (int32_t)(uint16_t)buf2;
				1153	if (result != 0) {
				1154	return result;
				1155	}
				1156	buf1++;
				1157	buf2++;
				1158	}
				1159	}
				1160	return 0;
				1161	}
				1162
				1163	U_CAPI int32_t U_EXPORT2
				1164	u_memcmpCodePointOrder(const UChar s1, const UChar s2, int32_t count) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1165	return uprv_strCompare(s1, count, s2, count, false, true);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1166	}
				1167
				1168	/* u_unescape & support fns ------------------------------------------------- */
				1169
				1170	/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
				1171	static const UChar UNESCAPE_MAP[] = {
				1172	/" 0x22, 0x22 /
				1173	/' 0x27, 0x27 /
				1174	/? 0x3F, 0x3F /
				1175	/\ 0x5C, 0x5C /
				1176	/a/ 0x61, 0x07,
				1177	/b/ 0x62, 0x08,
				1178	/e/ 0x65, 0x1b,
				1179	/f/ 0x66, 0x0c,
				1180	/n/ 0x6E, 0x0a,
				1181	/r/ 0x72, 0x0d,
				1182	/t/ 0x74, 0x09,
				1183	/v/ 0x76, 0x0b
				1184	};
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	1185	enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) };
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1186
				1187	/* Convert one octal digit to a numeric value 0..7, or -1 on failure */
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1188	static int32_t _digit8(UChar c) {
				1189	if (c >= u'0' && c <= u'7') {
				1190	return c - u'0';
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1191	}
				1192	return -1;
				1193	}
				1194
				1195	/* Convert one hex digit to a numeric value 0..F, or -1 on failure */
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1196	static int32_t _digit16(UChar c) {
				1197	if (c >= u'0' && c <= u'9') {
				1198	return c - u'0';
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1199	}
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1200	if (c >= u'A' && c <= u'F') {
				1201	return c - (u'A' - 10);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1202	}
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1203	if (c >= u'a' && c <= u'f') {
				1204	return c - (u'a' - 10);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1205	}
				1206	return -1;
				1207	}
				1208
				1209	/* Parse a single escape sequence. Although this method deals in
				1210	* UChars, it does not use C++ or UnicodeString. This allows it to
				1211	* be used from C contexts. */
				1212	U_CAPI UChar32 U_EXPORT2
				1213	u_unescapeAt(UNESCAPE_CHAR_AT charAt,
				1214	int32_t *offset,
				1215	int32_t length,
				1216	void *context) {
				1217
				1218	int32_t start = *offset;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1219	UChar32 c;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1220	UChar32 result = 0;
				1221	int8_t n = 0;
				1222	int8_t minDig = 0;
				1223	int8_t maxDig = 0;
				1224	int8_t bitsPerDigit = 4;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1225	int32_t dig;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1226	UBool braces = false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1227
				1228	/* Check that offset is in range */
				1229	if (offset < 0 \|\| offset >= length) {
				1230	goto err;
				1231	}
				1232
				1233	/* Fetch first UChar after '\\' */
				1234	c = charAt((*offset)++, context);
				1235
				1236	/* Convert hexadecimal and octal escapes */
				1237	switch (c) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1238	case u'u':
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1239	minDig = maxDig = 4;
				1240	break;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1241	case u'U':
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1242	minDig = maxDig = 8;
				1243	break;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1244	case u'x':
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1245	minDig = 1;
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1246	if (offset < length && charAt(offset, context) == u'{') {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1247	++(*offset);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1248	braces = true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1249	maxDig = 8;
				1250	} else {
				1251	maxDig = 2;
				1252	}
				1253	break;
				1254	default:
				1255	dig = _digit8(c);
				1256	if (dig >= 0) {
				1257	minDig = 1;
				1258	maxDig = 3;
				1259	n = 1; /* Already have first octal digit */
				1260	bitsPerDigit = 3;
				1261	result = dig;
				1262	}
				1263	break;
				1264	}
				1265	if (minDig != 0) {
				1266	while (*offset < length && n < maxDig) {
				1267	c = charAt(*offset, context);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1268	dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1269	if (dig < 0) {
				1270	break;
				1271	}
				1272	result = (result << bitsPerDigit) \| dig;
				1273	++(*offset);
				1274	++n;
				1275	}
				1276	if (n < minDig) {
				1277	goto err;
				1278	}
				1279	if (braces) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1280	if (c != u'}') {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1281	goto err;
				1282	}
				1283	++(*offset);
				1284	}
				1285	if (result < 0 \|\| result >= 0x110000) {
				1286	goto err;
				1287	}
				1288	/* If an escape sequence specifies a lead surrogate, see if
				1289	* there is a trail surrogate after it, either as an escape or
				1290	* as a literal. If so, join them up into a supplementary.
				1291	*/
				1292	if (*offset < length && U16_IS_LEAD(result)) {
				1293	int32_t ahead = *offset + 1;
				1294	c = charAt(*offset, context);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1295	if (c == u'\\' && ahead < length) {
				1296	// Calling ourselves recursively may cause a stack overflow if
				1297	// we have repeated escaped lead surrogates.
				1298	// Limit the length to 11 ("x{0000DFFF}") after ahead.
				1299	int32_t tailLimit = ahead + 11;
Frank Tang	f90543d	2020-10-30 19:02:04 -0700	[diff] [blame]	1300	if (tailLimit > length) {
				1301	tailLimit = length;
				1302	}
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1303	c = u_unescapeAt(charAt, &ahead, tailLimit, context);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1304	}
				1305	if (U16_IS_TRAIL(c)) {
				1306	*offset = ahead;
				1307	result = U16_GET_SUPPLEMENTARY(result, c);
				1308	}
				1309	}
				1310	return result;
				1311	}
				1312
				1313	/* Convert C-style escapes in table */
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1314	for (int32_t i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1315	if (c == UNESCAPE_MAP[i]) {
				1316	return UNESCAPE_MAP[i+1];
				1317	} else if (c < UNESCAPE_MAP[i]) {
				1318	break;
				1319	}
				1320	}
				1321
				1322	/* Map \cX to control-X: X & 0x1F */
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1323	if (c == u'c' && *offset < length) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1324	c = charAt((*offset)++, context);
				1325	if (U16_IS_LEAD(c) && *offset < length) {
				1326	UChar c2 = charAt(*offset, context);
				1327	if (U16_IS_TRAIL(c2)) {
				1328	++(*offset);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1329	c = U16_GET_SUPPLEMENTARY(c, c2);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1330	}
				1331	}
				1332	return 0x1F & c;
				1333	}
				1334
				1335	/* If no special forms are recognized, then consider
				1336	* the backslash to generically escape the next character.
				1337	* Deal with surrogate pairs. */
				1338	if (U16_IS_LEAD(c) && *offset < length) {
				1339	UChar c2 = charAt(*offset, context);
				1340	if (U16_IS_TRAIL(c2)) {
				1341	++(*offset);
				1342	return U16_GET_SUPPLEMENTARY(c, c2);
				1343	}
				1344	}
				1345	return c;
				1346
				1347	err:
				1348	/* Invalid escape sequence */
				1349	offset = start; / Reset to initial value */
				1350	return (UChar32)0xFFFFFFFF;
				1351	}
				1352
				1353	/* u_unescapeAt() callback to return a UChar from a char* */
				1354	static UChar U_CALLCONV
				1355	_charPtr_charAt(int32_t offset, void *context) {
				1356	UChar c16;
				1357	/* It would be more efficient to access the invariant tables
				1358	* directly but there is no API for that. */
				1359	u_charsToUChars(((char*) context) + offset, &c16, 1);
				1360	return c16;
				1361	}
				1362
				1363	/* Append an escape-free segment of the text; used by u_unescape() */
				1364	static void _appendUChars(UChar *dest, int32_t destCapacity,
				1365	const char *src, int32_t srcLen) {
				1366	if (destCapacity < 0) {
				1367	destCapacity = 0;
				1368	}
				1369	if (srcLen > destCapacity) {
				1370	srcLen = destCapacity;
				1371	}
				1372	u_charsToUChars(src, dest, srcLen);
				1373	}
				1374
				1375	/* Do an invariant conversion of char* -> UChar, with escape parsing /
				1376	U_CAPI int32_t U_EXPORT2
				1377	u_unescape(const char src, UChar dest, int32_t destCapacity) {
				1378	const char *segment = src;
				1379	int32_t i = 0;
				1380	char c;
				1381
				1382	while ((c=*src) != 0) {
				1383	/* '\\' intentionally written as compiler-specific
				1384	* character constant to correspond to compiler-specific
				1385	* char* constants. */
				1386	if (c == '\\') {
				1387	int32_t lenParsed = 0;
				1388	UChar32 c32;
				1389	if (src != segment) {
				1390	if (dest != NULL) {
				1391	_appendUChars(dest + i, destCapacity - i,
				1392	segment, (int32_t)(src - segment));
				1393	}
				1394	i += (int32_t)(src - segment);
				1395	}
				1396	++src; /* advance past '\\' */
				1397	c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src);
				1398	if (lenParsed == 0) {
				1399	goto err;
				1400	}
				1401	src += lenParsed; /* advance past escape seq. */
				1402	if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) {
				1403	U16_APPEND_UNSAFE(dest, i, c32);
				1404	} else {
				1405	i += U16_LENGTH(c32);
				1406	}
				1407	segment = src;
				1408	} else {
				1409	++src;
				1410	}
				1411	}
				1412	if (src != segment) {
				1413	if (dest != NULL) {
				1414	_appendUChars(dest + i, destCapacity - i,
				1415	segment, (int32_t)(src - segment));
				1416	}
				1417	i += (int32_t)(src - segment);
				1418	}
				1419	if (dest != NULL && i < destCapacity) {
				1420	dest[i] = 0;
				1421	}
				1422	return i;
				1423
				1424	err:
				1425	if (dest != NULL && destCapacity > 0) {
				1426	*dest = 0;
				1427	}
				1428	return 0;
				1429	}
				1430
				1431	/* NUL-termination of strings ----------------------------------------------- */
				1432
				1433	/**
				1434	* NUL-terminate a string no matter what its type.
				1435	* Set warning and error codes accordingly.
				1436	*/
Frank Tang	b869661	2019-10-25 14:58:21 -0700	[diff] [blame]	1437	#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) UPRV_BLOCK_MACRO_BEGIN { \
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1438	if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) { \
				1439	/* not a public function, so no complete argument checking */ \
				1440	\
				1441	if(length<0) { \
				1442	/* assume that the caller handles this */ \
				1443	} else if(length<destCapacity) { \
				1444	/* NUL-terminate the string, the NUL fits */ \
				1445	dest[length]=0; \
				1446	/* unset the not-terminated warning but leave all others */ \
				1447	if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { \
				1448	*pErrorCode=U_ZERO_ERROR; \
				1449	} \
				1450	} else if(length==destCapacity) { \
				1451	/* unable to NUL-terminate, but the string itself fit - set a warning code */ \
				1452	*pErrorCode=U_STRING_NOT_TERMINATED_WARNING; \
				1453	} else /* length>destCapacity */ { \
				1454	/* even the string itself did not fit - set an error code */ \
				1455	*pErrorCode=U_BUFFER_OVERFLOW_ERROR; \
				1456	} \
Frank Tang	b869661	2019-10-25 14:58:21 -0700	[diff] [blame]	1457	} \
				1458	} UPRV_BLOCK_MACRO_END
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1459
Frank Tang	f222396	2020-04-27 18:25:29 -0700	[diff] [blame]	1460	U_CAPI UChar U_EXPORT2
				1461	u_asciiToUpper(UChar c) {
				1462	if (u'a' <= c && c <= u'z') {
				1463	c = c + u'A' - u'a';
				1464	}
				1465	return c;
				1466	}
				1467
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1468	U_CAPI int32_t U_EXPORT2
				1469	u_terminateUChars(UChar dest, int32_t destCapacity, int32_t length, UErrorCode pErrorCode) {
				1470	__TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
				1471	return length;
				1472	}
				1473
				1474	U_CAPI int32_t U_EXPORT2
				1475	u_terminateChars(char dest, int32_t destCapacity, int32_t length, UErrorCode pErrorCode) {
				1476	__TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
				1477	return length;
				1478	}
				1479
				1480	U_CAPI int32_t U_EXPORT2
				1481	u_terminateUChar32s(UChar32 dest, int32_t destCapacity, int32_t length, UErrorCode pErrorCode) {
				1482	__TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
				1483	return length;
				1484	}
				1485
				1486	U_CAPI int32_t U_EXPORT2
				1487	u_terminateWChars(wchar_t dest, int32_t destCapacity, int32_t length, UErrorCode pErrorCode) {
				1488	__TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
				1489	return length;
				1490	}
				1491
				1492	// Compute the hash code for a string -------------------------------------- ***
				1493
				1494	// Moved here from uhash.c so that UnicodeString::hashCode() does not depend
				1495	// on UHashtable code.
				1496
				1497	/*
				1498	Compute the hash by iterating sparsely over about 32 (up to 63)
				1499	characters spaced evenly through the string. For each character,
				1500	multiply the previous hash value by a prime number and add the new
				1501	character in, like a linear congruential random number generator,
				1502	producing a pseudorandom deterministic value well distributed over
				1503	the output range. [LIU]
				1504	*/
				1505
Frank Tang	b869661	2019-10-25 14:58:21 -0700	[diff] [blame]	1506	#define STRING_HASH(TYPE, STR, STRLEN, DEREF) UPRV_BLOCK_MACRO_BEGIN { \
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	1507	uint32_t hash = 0; \
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1508	const TYPE p = (const TYPE) STR; \
				1509	if (p != NULL) { \
				1510	int32_t len = (int32_t)(STRLEN); \
				1511	int32_t inc = ((len - 32) / 32) + 1; \
				1512	const TYPE *limit = p + len; \
				1513	while (p<limit) { \
				1514	hash = (hash * 37) + DEREF; \
				1515	p += inc; \
				1516	} \
				1517	} \
Frank Tang	b869661	2019-10-25 14:58:21 -0700	[diff] [blame]	1518	return static_cast<int32_t>(hash); \
				1519	} UPRV_BLOCK_MACRO_END
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1520
				1521	/* Used by UnicodeString to compute its hashcode - Not public API. */
				1522	U_CAPI int32_t U_EXPORT2
				1523	ustr_hashUCharsN(const UChar *str, int32_t length) {
				1524	STRING_HASH(UChar, str, length, *p);
				1525	}
				1526
				1527	U_CAPI int32_t U_EXPORT2
				1528	ustr_hashCharsN(const char *str, int32_t length) {
				1529	STRING_HASH(uint8_t, str, length, *p);
				1530	}
				1531
				1532	U_CAPI int32_t U_EXPORT2
				1533	ustr_hashICharsN(const char *str, int32_t length) {
				1534	STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p));
				1535	}