Blame - source/common/unisetspan.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 0a8893472f958b3fbed6f969e735508e097c829b [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	3	/*
				4	******************************************************************************
				5	*
				6	* Copyright (C) 2007-2012, International Business Machines
				7	* Corporation and others. All Rights Reserved.
				8	*
				9	******************************************************************************
				10	* file name: unisetspan.cpp
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	11	* encoding: UTF-8
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	12	* tab size: 8 (not used)
				13	* indentation:4
				14	*
				15	* created on: 2007mar01
				16	* created by: Markus W. Scherer
				17	*/
				18
				19	#include "unicode/utypes.h"
				20	#include "unicode/uniset.h"
				21	#include "unicode/ustring.h"
				22	#include "unicode/utf8.h"
				23	#include "unicode/utf16.h"
				24	#include "cmemory.h"
				25	#include "uvector.h"
				26	#include "unisetspan.h"
				27
				28	U_NAMESPACE_BEGIN
				29
				30	/*
				31	* List of offsets from the current position from where to try matching
				32	* a code point or a string.
				33	* Store offsets rather than indexes to simplify the code and use the same list
				34	* for both increments (in span()) and decrements (in spanBack()).
				35	*
				36	* Assumption: The maximum offset is limited, and the offsets that are stored
				37	* at any one time are relatively dense, that is, there are normally no gaps of
				38	* hundreds or thousands of offset values.
				39	*
				40	* The implementation uses a circular buffer of byte flags,
				41	* each indicating whether the corresponding offset is in the list.
				42	* This avoids inserting into a sorted list of offsets (or absolute indexes) and
				43	* physically moving part of the list.
				44	*
				45	* Note: In principle, the caller should setMaxLength() to the maximum of the
				46	* max string length and U16_LENGTH/U8_LENGTH to account for
				47	* "long" single code points.
				48	* However, this implementation uses at least a staticList with more than
				49	* U8_LENGTH entries anyway.
				50	*
				51	* Note: If maxLength were guaranteed to be no more than 32 or 64,
				52	* the list could be stored as bit flags in a single integer.
				53	* Rather than handling a circular buffer with a start list index,
				54	* the integer would simply be shifted when lower offsets are removed.
				55	* UnicodeSet does not have a limit on the lengths of strings.
				56	*/
				57	class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
				58	public:
				59	OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
				60
				61	~OffsetList() {
				62	if(list!=staticList) {
				63	uprv_free(list);
				64	}
				65	}
				66
				67	// Call exactly once if the list is to be used.
				68	void setMaxLength(int32_t maxLength) {
				69	if(maxLength<=(int32_t)sizeof(staticList)) {
				70	capacity=(int32_t)sizeof(staticList);
				71	} else {
				72	UBool l=(UBool )uprv_malloc(maxLength);
				73	if(l!=NULL) {
				74	list=l;
				75	capacity=maxLength;
				76	}
				77	}
				78	uprv_memset(list, 0, capacity);
				79	}
				80
				81	void clear() {
				82	uprv_memset(list, 0, capacity);
				83	start=length=0;
				84	}
				85
				86	UBool isEmpty() const {
				87	return (UBool)(length==0);
				88	}
				89
				90	// Reduce all stored offsets by delta, used when the current position
				91	// moves by delta.
				92	// There must not be any offsets lower than delta.
				93	// If there is an offset equal to delta, it is removed.
				94	// delta=[1..maxLength]
				95	void shift(int32_t delta) {
				96	int32_t i=start+delta;
				97	if(i>=capacity) {
				98	i-=capacity;
				99	}
				100	if(list[i]) {
				101	list[i]=FALSE;
				102	--length;
				103	}
				104	start=i;
				105	}
				106
				107	// Add an offset. The list must not contain it yet.
				108	// offset=[1..maxLength]
				109	void addOffset(int32_t offset) {
				110	int32_t i=start+offset;
				111	if(i>=capacity) {
				112	i-=capacity;
				113	}
				114	list[i]=TRUE;
				115	++length;
				116	}
				117
				118	// offset=[1..maxLength]
				119	UBool containsOffset(int32_t offset) const {
				120	int32_t i=start+offset;
				121	if(i>=capacity) {
				122	i-=capacity;
				123	}
				124	return list[i];
				125	}
				126
				127	// Find the lowest stored offset from a non-empty list, remove it,
				128	// and reduce all other offsets by this minimum.
				129	// Returns [1..maxLength].
				130	int32_t popMinimum() {
				131	// Look for the next offset in list[start+1..capacity-1].
				132	int32_t i=start, result;
				133	while(++i<capacity) {
				134	if(list[i]) {
				135	list[i]=FALSE;
				136	--length;
				137	result=i-start;
				138	start=i;
				139	return result;
				140	}
				141	}
				142	// i==capacity
				143
				144	// Wrap around and look for the next offset in list[0..start].
				145	// Since the list is not empty, there will be one.
				146	result=capacity-start;
				147	i=0;
				148	while(!list[i]) {
				149	++i;
				150	}
				151	list[i]=FALSE;
				152	--length;
				153	start=i;
				154	return result+=i;
				155	}
				156
				157	private:
				158	UBool *list;
				159	int32_t capacity;
				160	int32_t length;
				161	int32_t start;
				162
				163	UBool staticList[16];
				164	};
				165
				166	// Get the number of UTF-8 bytes for a UTF-16 (sub)string.
				167	static int32_t
				168	getUTF8Length(const UChar *s, int32_t length) {
				169	UErrorCode errorCode=U_ZERO_ERROR;
				170	int32_t length8=0;
				171	u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
				172	if(U_SUCCESS(errorCode) \|\| errorCode==U_BUFFER_OVERFLOW_ERROR) {
				173	return length8;
				174	} else {
				175	// The string contains an unpaired surrogate.
				176	// Ignore this string.
				177	return 0;
				178	}
				179	}
				180
				181	// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
				182	static int32_t
				183	appendUTF8(const UChar s, int32_t length, uint8_t t, int32_t capacity) {
				184	UErrorCode errorCode=U_ZERO_ERROR;
				185	int32_t length8=0;
				186	u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
				187	if(U_SUCCESS(errorCode)) {
				188	return length8;
				189	} else {
				190	// The string contains an unpaired surrogate.
				191	// Ignore this string.
				192	return 0;
				193	}
				194	}
				195
				196	static inline uint8_t
				197	makeSpanLengthByte(int32_t spanLength) {
				198	// 0xfe==UnicodeSetStringSpan::LONG_SPAN
				199	return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
				200	}
				201
				202	// Construct for all variants of span(), or only for any one variant.
				203	// Initialize as little as possible, for single use.
				204	UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
				205	const UVector &setStrings,
				206	uint32_t which)
				207	: spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
				208	utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
				209	utf8Length(0),
				210	maxLength16(0), maxLength8(0),
				211	all((UBool)(which==ALL)) {
				212	spanSet.retainAll(set);
				213	if(which&NOT_CONTAINED) {
				214	// Default to the same sets.
				215	// addToSpanNotSet() will create a separate set if necessary.
				216	pSpanNotSet=&spanSet;
				217	}
				218
				219	// Determine if the strings even need to be taken into account at all for span() etc.
				220	// If any string is relevant, then all strings need to be used for
				221	// span(longest match) but only the relevant ones for span(while contained).
				222	// TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
				223	// and do not store UTF-8 strings if !thisRelevant and CONTAINED.
				224	// (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
				225	// Also count the lengths of the UTF-8 versions of the strings for memory allocation.
				226	int32_t stringsLength=strings.size();
				227
				228	int32_t i, spanLength;
				229	UBool someRelevant=FALSE;
				230	for(i=0; i<stringsLength; ++i) {
				231	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
				232	const UChar *s16=string.getBuffer();
				233	int32_t length16=string.length();
				234	UBool thisRelevant;
				235	spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
				236	if(spanLength<length16) { // Relevant string.
				237	someRelevant=thisRelevant=TRUE;
				238	} else {
				239	thisRelevant=FALSE;
				240	}
				241	if((which&UTF16) && length16>maxLength16) {
				242	maxLength16=length16;
				243	}
				244	if((which&UTF8) && (thisRelevant \|\| (which&CONTAINED))) {
				245	int32_t length8=getUTF8Length(s16, length16);
				246	utf8Length+=length8;
				247	if(length8>maxLength8) {
				248	maxLength8=length8;
				249	}
				250	}
				251	}
				252	if(!someRelevant) {
				253	maxLength16=maxLength8=0;
				254	return;
				255	}
				256
				257	// Freeze after checking for the need to use strings at all because freezing
				258	// a set takes some time and memory which are wasted if there are no relevant strings.
				259	if(all) {
				260	spanSet.freeze();
				261	}
				262
				263	uint8_t *spanBackLengths;
				264	uint8_t *spanUTF8Lengths;
				265	uint8_t *spanBackUTF8Lengths;
				266
				267	// Allocate a block of meta data.
				268	int32_t allocSize;
				269	if(all) {
				270	// UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
				271	allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
				272	} else {
				273	allocSize=stringsLength; // One set of span lengths.
				274	if(which&UTF8) {
				275	// UTF-8 lengths and UTF-8 strings.
				276	allocSize+=stringsLength*4+utf8Length;
				277	}
				278	}
				279	if(allocSize<=(int32_t)sizeof(staticLengths)) {
				280	utf8Lengths=staticLengths;
				281	} else {
				282	utf8Lengths=(int32_t *)uprv_malloc(allocSize);
				283	if(utf8Lengths==NULL) {
				284	maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
				285	return; // Out of memory.
				286	}
				287	}
				288
				289	if(all) {
				290	// Store span lengths for all span() variants.
				291	spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
				292	spanBackLengths=spanLengths+stringsLength;
				293	spanUTF8Lengths=spanBackLengths+stringsLength;
				294	spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
				295	utf8=spanBackUTF8Lengths+stringsLength;
				296	} else {
				297	// Store span lengths for only one span() variant.
				298	if(which&UTF8) {
				299	spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
				300	utf8=spanLengths+stringsLength;
				301	} else {
				302	spanLengths=(uint8_t *)utf8Lengths;
				303	}
				304	spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
				305	}
				306
				307	// Set the meta data and pSpanNotSet and write the UTF-8 strings.
				308	int32_t utf8Count=0; // Count UTF-8 bytes written so far.
				309
				310	for(i=0; i<stringsLength; ++i) {
				311	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
				312	const UChar *s16=string.getBuffer();
				313	int32_t length16=string.length();
				314	spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
				315	if(spanLength<length16) { // Relevant string.
				316	if(which&UTF16) {
				317	if(which&CONTAINED) {
				318	if(which&FWD) {
				319	spanLengths[i]=makeSpanLengthByte(spanLength);
				320	}
				321	if(which&BACK) {
				322	spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
				323	spanBackLengths[i]=makeSpanLengthByte(spanLength);
				324	}
				325	} else /* not CONTAINED, not all, but NOT_CONTAINED */ {
				326	spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.
				327	}
				328	}
				329	if(which&UTF8) {
				330	uint8_t *s8=utf8+utf8Count;
				331	int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
				332	utf8Count+=utf8Lengths[i]=length8;
				333	if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
				334	spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
				335	} else { // Relevant for UTF-8.
				336	if(which&CONTAINED) {
				337	if(which&FWD) {
				338	spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
				339	spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
				340	}
				341	if(which&BACK) {
				342	spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
				343	spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
				344	}
				345	} else /* not CONTAINED, not all, but NOT_CONTAINED */ {
				346	spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.
				347	}
				348	}
				349	}
				350	if(which&NOT_CONTAINED) {
				351	// Add string start and end code points to the spanNotSet so that
				352	// a span(while not contained) stops before any string.
				353	UChar32 c;
				354	if(which&FWD) {
				355	int32_t len=0;
				356	U16_NEXT(s16, len, length16, c);
				357	addToSpanNotSet(c);
				358	}
				359	if(which&BACK) {
				360	int32_t len=length16;
				361	U16_PREV(s16, 0, len, c);
				362	addToSpanNotSet(c);
				363	}
				364	}
				365	} else { // Irrelevant string.
				366	if(which&UTF8) {
				367	if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
				368	uint8_t *s8=utf8+utf8Count;
				369	int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
				370	utf8Count+=utf8Lengths[i]=length8;
				371	} else {
				372	utf8Lengths[i]=0;
				373	}
				374	}
				375	if(all) {
				376	spanLengths[i]=spanBackLengths[i]=
				377	spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
				378	(uint8_t)ALL_CP_CONTAINED;
				379	} else {
				380	// All spanXYZLengths pointers contain the same address.
				381	spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
				382	}
				383	}
				384	}
				385
				386	// Finish.
				387	if(all) {
				388	pSpanNotSet->freeze();
				389	}
				390	}
				391
				392	// Copy constructor. Assumes which==ALL for a frozen set.
				393	UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
				394	const UVector &newParentSetStrings)
				395	: spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
				396	utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
				397	utf8Length(otherStringSpan.utf8Length),
				398	maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
				399	all(TRUE) {
				400	if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
				401	pSpanNotSet=&spanSet;
				402	} else {
				403	pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
				404	}
				405
				406	// Allocate a block of meta data.
				407	// UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
				408	int32_t stringsLength=strings.size();
				409	int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
				410	if(allocSize<=(int32_t)sizeof(staticLengths)) {
				411	utf8Lengths=staticLengths;
				412	} else {
				413	utf8Lengths=(int32_t *)uprv_malloc(allocSize);
				414	if(utf8Lengths==NULL) {
				415	maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
				416	return; // Out of memory.
				417	}
				418	}
				419
				420	spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
				421	utf8=spanLengths+stringsLength*4;
				422	uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
				423	}
				424
				425	UnicodeSetStringSpan::~UnicodeSetStringSpan() {
				426	if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
				427	delete pSpanNotSet;
				428	}
				429	if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
				430	uprv_free(utf8Lengths);
				431	}
				432	}
				433
				434	void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
				435	if(pSpanNotSet==NULL \|\| pSpanNotSet==&spanSet) {
				436	if(spanSet.contains(c)) {
				437	return; // Nothing to do.
				438	}
				439	UnicodeSet newSet=(UnicodeSet )spanSet.cloneAsThawed();
				440	if(newSet==NULL) {
				441	return; // Out of memory.
				442	} else {
				443	pSpanNotSet=newSet;
				444	}
				445	}
				446	pSpanNotSet->add(c);
				447	}
				448
				449	// Compare strings without any argument checks. Requires length>0.
				450	static inline UBool
				451	matches16(const UChar s, const UChar t, int32_t length) {
				452	do {
				453	if(s++!=t++) {
				454	return FALSE;
				455	}
				456	} while(--length>0);
				457	return TRUE;
				458	}
				459
				460	static inline UBool
				461	matches8(const uint8_t s, const uint8_t t, int32_t length) {
				462	do {
				463	if(s++!=t++) {
				464	return FALSE;
				465	}
				466	} while(--length>0);
				467	return TRUE;
				468	}
				469
				470	// Compare 16-bit Unicode strings (which may be malformed UTF-16)
				471	// at code point boundaries.
				472	// That is, each edge of a match must not be in the middle of a surrogate pair.
				473	static inline UBool
				474	matches16CPB(const UChar s, int32_t start, int32_t limit, const UChar t, int32_t length) {
				475	s+=start;
				476	limit-=start;
				477	return matches16(s, t, length) &&
				478	!(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
				479	!(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
				480	}
				481
				482	// Does the set contain the next code point?
				483	// If so, return its length; otherwise return its negative length.
				484	static inline int32_t
				485	spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
				486	UChar c=*s, c2;
				487	if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
				488	return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
				489	}
				490	return set.contains(c) ? 1 : -1;
				491	}
				492
				493	static inline int32_t
				494	spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
				495	UChar c=s[length-1], c2;
				496	if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
				497	return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
				498	}
				499	return set.contains(c) ? 1 : -1;
				500	}
				501
				502	static inline int32_t
				503	spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
				504	UChar32 c=*s;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	505	if(U8_IS_SINGLE(c)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	506	return set.contains(c) ? 1 : -1;
				507	}
				508	// Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
				509	int32_t i=0;
				510	U8_NEXT_OR_FFFD(s, i, length, c);
				511	return set.contains(c) ? i : -i;
				512	}
				513
				514	static inline int32_t
				515	spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
				516	UChar32 c=s[length-1];
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	517	if(U8_IS_SINGLE(c)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	518	return set.contains(c) ? 1 : -1;
				519	}
				520	int32_t i=length-1;
				521	c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
				522	length-=i;
				523	return set.contains(c) ? length : -length;
				524	}
				525
				526	/*
				527	* Note: In span() when spanLength==0 (after a string match, or at the beginning
				528	* after an empty code point span) and in spanNot() and spanNotUTF8(),
				529	* string matching could use a binary search
				530	* because all string matches are done from the same start index.
				531	*
				532	* For UTF-8, this would require a comparison function that returns UTF-16 order.
				533	*
				534	* This optimization should not be necessary for normal UnicodeSets because
				535	* most sets have no strings, and most sets with strings have
				536	* very few very short strings.
				537	* For cases with many strings, it might be better to use a different API
				538	* and implementation with a DFA (state machine).
				539	*/
				540
				541	/*
				542	* Algorithm for span(USET_SPAN_CONTAINED)
				543	*
				544	* Theoretical algorithm:
				545	* - Iterate through the string, and at each code point boundary:
				546	* + If the code point there is in the set, then remember to continue after it.
				547	* + If a set string matches at the current position, then remember to continue after it.
				548	* + Either recursively span for each code point or string match,
				549	* or recursively span for all but the shortest one and
				550	* iteratively continue the span with the shortest local match.
				551	* + Remember the longest recursive span (the farthest end point).
				552	* + If there is no match at the current position, neither for the code point there
				553	* nor for any set string, then stop and return the longest recursive span length.
				554	*
				555	* Optimized implementation:
				556	*
				557	* (We assume that most sets will have very few very short strings.
				558	* A span using a string-less set is extremely fast.)
				559	*
				560	* Create and cache a spanSet which contains all of the single code points
				561	* of the original set but none of its strings.
				562	*
				563	* - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
				564	* - Loop:
				565	* + Try to match each set string at the end of the spanLength.
				566	* ~ Set strings that start with set-contained code points must be matched
				567	* with a partial overlap because the recursive algorithm would have tried
				568	* to match them at every position.
				569	* ~ Set strings that entirely consist of set-contained code points
				570	* are irrelevant for span(USET_SPAN_CONTAINED) because the
				571	* recursive algorithm would continue after them anyway
				572	* and find the longest recursive match from their end.
				573	* ~ Rather than recursing, note each end point of a set string match.
				574	* + If no set string matched after spanSet.span(), then return
				575	* with where the spanSet.span() ended.
				576	* + If at least one set string matched after spanSet.span(), then
				577	* pop the shortest string match end point and continue
				578	* the loop, trying to match all set strings from there.
				579	* + If at least one more set string matched after a previous string match,
				580	* then test if the code point after the previous string match is also
				581	* contained in the set.
				582	* Continue the loop with the shortest end point of either this code point
				583	* or a matching set string.
				584	* + If no more set string matched after a previous string match,
				585	* then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
				586	* Stop if spanLength==0, otherwise continue the loop.
				587	*
				588	* By noting each end point of a set string match,
				589	* the function visits each string position at most once and finishes
				590	* in linear time.
				591	*
				592	* The recursive algorithm may visit the same string position many times
				593	* if multiple paths lead to it and finishes in exponential time.
				594	*/
				595
				596	/*
				597	* Algorithm for span(USET_SPAN_SIMPLE)
				598	*
				599	* Theoretical algorithm:
				600	* - Iterate through the string, and at each code point boundary:
				601	* + If the code point there is in the set, then remember to continue after it.
				602	* + If a set string matches at the current position, then remember to continue after it.
				603	* + Continue from the farthest match position and ignore all others.
				604	* + If there is no match at the current position,
				605	* then stop and return the current position.
				606	*
				607	* Optimized implementation:
				608	*
				609	* (Same assumption and spanSet as above.)
				610	*
				611	* - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
				612	* - Loop:
				613	* + Try to match each set string at the end of the spanLength.
				614	* ~ Set strings that start with set-contained code points must be matched
				615	* with a partial overlap because the standard algorithm would have tried
				616	* to match them earlier.
				617	* ~ Set strings that entirely consist of set-contained code points
				618	* must be matched with a full overlap because the longest-match algorithm
				619	* would hide set string matches that end earlier.
				620	* Such set strings need not be matched earlier inside the code point span
				621	* because the standard algorithm would then have continued after
				622	* the set string match anyway.
				623	* ~ Remember the longest set string match (farthest end point) from the earliest
				624	* starting point.
				625	* + If no set string matched after spanSet.span(), then return
				626	* with where the spanSet.span() ended.
				627	* + If at least one set string matched, then continue the loop after the
				628	* longest match from the earliest position.
				629	* + If no more set string matched after a previous string match,
				630	* then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
				631	* Stop if spanLength==0, otherwise continue the loop.
				632	*/
				633
				634	int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
				635	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
				636	return spanNot(s, length);
				637	}
				638	int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
				639	if(spanLength==length) {
				640	return length;
				641	}
				642
				643	// Consider strings; they may overlap with the span.
				644	OffsetList offsets;
				645	if(spanCondition==USET_SPAN_CONTAINED) {
				646	// Use offset list to try all possibilities.
				647	offsets.setMaxLength(maxLength16);
				648	}
				649	int32_t pos=spanLength, rest=length-pos;
				650	int32_t i, stringsLength=strings.size();
				651	for(;;) {
				652	if(spanCondition==USET_SPAN_CONTAINED) {
				653	for(i=0; i<stringsLength; ++i) {
				654	int32_t overlap=spanLengths[i];
				655	if(overlap==ALL_CP_CONTAINED) {
				656	continue; // Irrelevant string.
				657	}
				658	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
				659	const UChar *s16=string.getBuffer();
				660	int32_t length16=string.length();
				661
				662	// Try to match this string at pos-overlap..pos.
				663	if(overlap>=LONG_SPAN) {
				664	overlap=length16;
				665	// While contained: No point matching fully inside the code point span.
				666	U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.
				667	}
				668	if(overlap>spanLength) {
				669	overlap=spanLength;
				670	}
				671	int32_t inc=length16-overlap; // Keep overlap+inc==length16.
				672	for(;;) {
				673	if(inc>rest) {
				674	break;
				675	}
				676	// Try to match if the increment is not listed already.
				677	if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
				678	if(inc==rest) {
				679	return length; // Reached the end of the string.
				680	}
				681	offsets.addOffset(inc);
				682	}
				683	if(overlap==0) {
				684	break;
				685	}
				686	--overlap;
				687	++inc;
				688	}
				689	}
				690	} else /* USET_SPAN_SIMPLE */ {
				691	int32_t maxInc=0, maxOverlap=0;
				692	for(i=0; i<stringsLength; ++i) {
				693	int32_t overlap=spanLengths[i];
				694	// For longest match, we do need to try to match even an all-contained string
				695	// to find the match from the earliest start.
				696
				697	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
				698	const UChar *s16=string.getBuffer();
				699	int32_t length16=string.length();
				700
				701	// Try to match this string at pos-overlap..pos.
				702	if(overlap>=LONG_SPAN) {
				703	overlap=length16;
				704	// Longest match: Need to match fully inside the code point span
				705	// to find the match from the earliest start.
				706	}
				707	if(overlap>spanLength) {
				708	overlap=spanLength;
				709	}
				710	int32_t inc=length16-overlap; // Keep overlap+inc==length16.
				711	for(;;) {
				712	if(inc>rest \|\| overlap<maxOverlap) {
				713	break;
				714	}
				715	// Try to match if the string is longer or starts earlier.
				716	if( (overlap>maxOverlap \|\| /* redundant overlap==maxOverlap && */ inc>maxInc) &&
				717	matches16CPB(s, pos-overlap, length, s16, length16)
				718	) {
				719	maxInc=inc; // Longest match from earliest start.
				720	maxOverlap=overlap;
				721	break;
				722	}
				723	--overlap;
				724	++inc;
				725	}
				726	}
				727
				728	if(maxInc!=0 \|\| maxOverlap!=0) {
				729	// Longest-match algorithm, and there was a string match.
				730	// Simply continue after it.
				731	pos+=maxInc;
				732	rest-=maxInc;
				733	if(rest==0) {
				734	return length; // Reached the end of the string.
				735	}
				736	spanLength=0; // Match strings from after a string match.
				737	continue;
				738	}
				739	}
				740	// Finished trying to match all strings at pos.
				741
				742	if(spanLength!=0 \|\| pos==0) {
				743	// The position is after an unlimited code point span (spanLength!=0),
				744	// not after a string match.
				745	// The only position where spanLength==0 after a span is pos==0.
				746	// Otherwise, an unlimited code point span is only tried again when no
				747	// strings match, and if such a non-initial span fails we stop.
				748	if(offsets.isEmpty()) {
				749	return pos; // No strings matched after a span.
				750	}
				751	// Match strings from after the next string match.
				752	} else {
				753	// The position is after a string match (or a single code point).
				754	if(offsets.isEmpty()) {
				755	// No more strings matched after a previous string match.
				756	// Try another code point span from after the last string match.
				757	spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
				758	if( spanLength==rest \|\| // Reached the end of the string, or
				759	spanLength==0 // neither strings nor span progressed.
				760	) {
				761	return pos+spanLength;
				762	}
				763	pos+=spanLength;
				764	rest-=spanLength;
				765	continue; // spanLength>0: Match strings from after a span.
				766	} else {
				767	// Try to match only one code point from after a string match if some
				768	// string matched beyond it, so that we try all possible positions
				769	// and don't overshoot.
				770	spanLength=spanOne(spanSet, s+pos, rest);
				771	if(spanLength>0) {
				772	if(spanLength==rest) {
				773	return length; // Reached the end of the string.
				774	}
				775	// Match strings after this code point.
				776	// There cannot be any increments below it because UnicodeSet strings
				777	// contain multiple code points.
				778	pos+=spanLength;
				779	rest-=spanLength;
				780	offsets.shift(spanLength);
				781	spanLength=0;
				782	continue; // Match strings from after a single code point.
				783	}
				784	// Match strings from after the next string match.
				785	}
				786	}
				787	int32_t minOffset=offsets.popMinimum();
				788	pos+=minOffset;
				789	rest-=minOffset;
				790	spanLength=0; // Match strings from after a string match.
				791	}
				792	}
				793
				794	int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
				795	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
				796	return spanNotBack(s, length);
				797	}
				798	int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
				799	if(pos==0) {
				800	return 0;
				801	}
				802	int32_t spanLength=length-pos;
				803
				804	// Consider strings; they may overlap with the span.
				805	OffsetList offsets;
				806	if(spanCondition==USET_SPAN_CONTAINED) {
				807	// Use offset list to try all possibilities.
				808	offsets.setMaxLength(maxLength16);
				809	}
				810	int32_t i, stringsLength=strings.size();
				811	uint8_t *spanBackLengths=spanLengths;
				812	if(all) {
				813	spanBackLengths+=stringsLength;
				814	}
				815	for(;;) {
				816	if(spanCondition==USET_SPAN_CONTAINED) {
				817	for(i=0; i<stringsLength; ++i) {
				818	int32_t overlap=spanBackLengths[i];
				819	if(overlap==ALL_CP_CONTAINED) {
				820	continue; // Irrelevant string.
				821	}
				822	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
				823	const UChar *s16=string.getBuffer();
				824	int32_t length16=string.length();
				825
				826	// Try to match this string at pos-(length16-overlap)..pos-length16.
				827	if(overlap>=LONG_SPAN) {
				828	overlap=length16;
				829	// While contained: No point matching fully inside the code point span.
				830	int32_t len1=0;
				831	U16_FWD_1(s16, len1, overlap);
				832	overlap-=len1; // Length of the string minus the first code point.
				833	}
				834	if(overlap>spanLength) {
				835	overlap=spanLength;
				836	}
				837	int32_t dec=length16-overlap; // Keep dec+overlap==length16.
				838	for(;;) {
				839	if(dec>pos) {
				840	break;
				841	}
				842	// Try to match if the decrement is not listed already.
				843	if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
				844	if(dec==pos) {
				845	return 0; // Reached the start of the string.
				846	}
				847	offsets.addOffset(dec);
				848	}
				849	if(overlap==0) {
				850	break;
				851	}
				852	--overlap;
				853	++dec;
				854	}
				855	}
				856	} else /* USET_SPAN_SIMPLE */ {
				857	int32_t maxDec=0, maxOverlap=0;
				858	for(i=0; i<stringsLength; ++i) {
				859	int32_t overlap=spanBackLengths[i];
				860	// For longest match, we do need to try to match even an all-contained string
				861	// to find the match from the latest end.
				862
				863	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
				864	const UChar *s16=string.getBuffer();
				865	int32_t length16=string.length();
				866
				867	// Try to match this string at pos-(length16-overlap)..pos-length16.
				868	if(overlap>=LONG_SPAN) {
				869	overlap=length16;
				870	// Longest match: Need to match fully inside the code point span
				871	// to find the match from the latest end.
				872	}
				873	if(overlap>spanLength) {
				874	overlap=spanLength;
				875	}
				876	int32_t dec=length16-overlap; // Keep dec+overlap==length16.
				877	for(;;) {
				878	if(dec>pos \|\| overlap<maxOverlap) {
				879	break;
				880	}
				881	// Try to match if the string is longer or ends later.
				882	if( (overlap>maxOverlap \|\| /* redundant overlap==maxOverlap && */ dec>maxDec) &&
				883	matches16CPB(s, pos-dec, length, s16, length16)
				884	) {
				885	maxDec=dec; // Longest match from latest end.
				886	maxOverlap=overlap;
				887	break;
				888	}
				889	--overlap;
				890	++dec;
				891	}
				892	}
				893
				894	if(maxDec!=0 \|\| maxOverlap!=0) {
				895	// Longest-match algorithm, and there was a string match.
				896	// Simply continue before it.
				897	pos-=maxDec;
				898	if(pos==0) {
				899	return 0; // Reached the start of the string.
				900	}
				901	spanLength=0; // Match strings from before a string match.
				902	continue;
				903	}
				904	}
				905	// Finished trying to match all strings at pos.
				906
				907	if(spanLength!=0 \|\| pos==length) {
				908	// The position is before an unlimited code point span (spanLength!=0),
				909	// not before a string match.
				910	// The only position where spanLength==0 before a span is pos==length.
				911	// Otherwise, an unlimited code point span is only tried again when no
				912	// strings match, and if such a non-initial span fails we stop.
				913	if(offsets.isEmpty()) {
				914	return pos; // No strings matched before a span.
				915	}
				916	// Match strings from before the next string match.
				917	} else {
				918	// The position is before a string match (or a single code point).
				919	if(offsets.isEmpty()) {
				920	// No more strings matched before a previous string match.
				921	// Try another code point span from before the last string match.
				922	int32_t oldPos=pos;
				923	pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
				924	spanLength=oldPos-pos;
				925	if( pos==0 \|\| // Reached the start of the string, or
				926	spanLength==0 // neither strings nor span progressed.
				927	) {
				928	return pos;
				929	}
				930	continue; // spanLength>0: Match strings from before a span.
				931	} else {
				932	// Try to match only one code point from before a string match if some
				933	// string matched beyond it, so that we try all possible positions
				934	// and don't overshoot.
				935	spanLength=spanOneBack(spanSet, s, pos);
				936	if(spanLength>0) {
				937	if(spanLength==pos) {
				938	return 0; // Reached the start of the string.
				939	}
				940	// Match strings before this code point.
				941	// There cannot be any decrements below it because UnicodeSet strings
				942	// contain multiple code points.
				943	pos-=spanLength;
				944	offsets.shift(spanLength);
				945	spanLength=0;
				946	continue; // Match strings from before a single code point.
				947	}
				948	// Match strings from before the next string match.
				949	}
				950	}
				951	pos-=offsets.popMinimum();
				952	spanLength=0; // Match strings from before a string match.
				953	}
				954	}
				955
				956	int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
				957	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
				958	return spanNotUTF8(s, length);
				959	}
				960	int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
				961	if(spanLength==length) {
				962	return length;
				963	}
				964
				965	// Consider strings; they may overlap with the span.
				966	OffsetList offsets;
				967	if(spanCondition==USET_SPAN_CONTAINED) {
				968	// Use offset list to try all possibilities.
				969	offsets.setMaxLength(maxLength8);
				970	}
				971	int32_t pos=spanLength, rest=length-pos;
				972	int32_t i, stringsLength=strings.size();
				973	uint8_t *spanUTF8Lengths=spanLengths;
				974	if(all) {
				975	spanUTF8Lengths+=2*stringsLength;
				976	}
				977	for(;;) {
				978	const uint8_t *s8=utf8;
				979	int32_t length8;
				980	if(spanCondition==USET_SPAN_CONTAINED) {
				981	for(i=0; i<stringsLength; ++i) {
				982	length8=utf8Lengths[i];
				983	if(length8==0) {
				984	continue; // String not representable in UTF-8.
				985	}
				986	int32_t overlap=spanUTF8Lengths[i];
				987	if(overlap==ALL_CP_CONTAINED) {
				988	s8+=length8;
				989	continue; // Irrelevant string.
				990	}
				991
				992	// Try to match this string at pos-overlap..pos.
				993	if(overlap>=LONG_SPAN) {
				994	overlap=length8;
				995	// While contained: No point matching fully inside the code point span.
				996	U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point.
				997	}
				998	if(overlap>spanLength) {
				999	overlap=spanLength;
				1000	}
				1001	int32_t inc=length8-overlap; // Keep overlap+inc==length8.
				1002	for(;;) {
				1003	if(inc>rest) {
				1004	break;
				1005	}
				1006	// Try to match if the increment is not listed already.
				1007	// Match at code point boundaries. (The UTF-8 strings were converted
				1008	// from UTF-16 and are guaranteed to be well-formed.)
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	1009	if(!U8_IS_TRAIL(s[pos-overlap]) &&
				1010	!offsets.containsOffset(inc) &&
				1011	matches8(s+pos-overlap, s8, length8)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1012	if(inc==rest) {
				1013	return length; // Reached the end of the string.
				1014	}
				1015	offsets.addOffset(inc);
				1016	}
				1017	if(overlap==0) {
				1018	break;
				1019	}
				1020	--overlap;
				1021	++inc;
				1022	}
				1023	s8+=length8;
				1024	}
				1025	} else /* USET_SPAN_SIMPLE */ {
				1026	int32_t maxInc=0, maxOverlap=0;
				1027	for(i=0; i<stringsLength; ++i) {
				1028	length8=utf8Lengths[i];
				1029	if(length8==0) {
				1030	continue; // String not representable in UTF-8.
				1031	}
				1032	int32_t overlap=spanUTF8Lengths[i];
				1033	// For longest match, we do need to try to match even an all-contained string
				1034	// to find the match from the earliest start.
				1035
				1036	// Try to match this string at pos-overlap..pos.
				1037	if(overlap>=LONG_SPAN) {
				1038	overlap=length8;
				1039	// Longest match: Need to match fully inside the code point span
				1040	// to find the match from the earliest start.
				1041	}
				1042	if(overlap>spanLength) {
				1043	overlap=spanLength;
				1044	}
				1045	int32_t inc=length8-overlap; // Keep overlap+inc==length8.
				1046	for(;;) {
				1047	if(inc>rest \|\| overlap<maxOverlap) {
				1048	break;
				1049	}
				1050	// Try to match if the string is longer or starts earlier.
				1051	// Match at code point boundaries. (The UTF-8 strings were converted
				1052	// from UTF-16 and are guaranteed to be well-formed.)
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	1053	if(!U8_IS_TRAIL(s[pos-overlap]) &&
				1054	(overlap>maxOverlap \|\|
				1055	/* redundant overlap==maxOverlap && */ inc>maxInc) &&
				1056	matches8(s+pos-overlap, s8, length8)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1057	maxInc=inc; // Longest match from earliest start.
				1058	maxOverlap=overlap;
				1059	break;
				1060	}
				1061	--overlap;
				1062	++inc;
				1063	}
				1064	s8+=length8;
				1065	}
				1066
				1067	if(maxInc!=0 \|\| maxOverlap!=0) {
				1068	// Longest-match algorithm, and there was a string match.
				1069	// Simply continue after it.
				1070	pos+=maxInc;
				1071	rest-=maxInc;
				1072	if(rest==0) {
				1073	return length; // Reached the end of the string.
				1074	}
				1075	spanLength=0; // Match strings from after a string match.
				1076	continue;
				1077	}
				1078	}
				1079	// Finished trying to match all strings at pos.
				1080
				1081	if(spanLength!=0 \|\| pos==0) {
				1082	// The position is after an unlimited code point span (spanLength!=0),
				1083	// not after a string match.
				1084	// The only position where spanLength==0 after a span is pos==0.
				1085	// Otherwise, an unlimited code point span is only tried again when no
				1086	// strings match, and if such a non-initial span fails we stop.
				1087	if(offsets.isEmpty()) {
				1088	return pos; // No strings matched after a span.
				1089	}
				1090	// Match strings from after the next string match.
				1091	} else {
				1092	// The position is after a string match (or a single code point).
				1093	if(offsets.isEmpty()) {
				1094	// No more strings matched after a previous string match.
				1095	// Try another code point span from after the last string match.
				1096	spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
				1097	if( spanLength==rest \|\| // Reached the end of the string, or
				1098	spanLength==0 // neither strings nor span progressed.
				1099	) {
				1100	return pos+spanLength;
				1101	}
				1102	pos+=spanLength;
				1103	rest-=spanLength;
				1104	continue; // spanLength>0: Match strings from after a span.
				1105	} else {
				1106	// Try to match only one code point from after a string match if some
				1107	// string matched beyond it, so that we try all possible positions
				1108	// and don't overshoot.
				1109	spanLength=spanOneUTF8(spanSet, s+pos, rest);
				1110	if(spanLength>0) {
				1111	if(spanLength==rest) {
				1112	return length; // Reached the end of the string.
				1113	}
				1114	// Match strings after this code point.
				1115	// There cannot be any increments below it because UnicodeSet strings
				1116	// contain multiple code points.
				1117	pos+=spanLength;
				1118	rest-=spanLength;
				1119	offsets.shift(spanLength);
				1120	spanLength=0;
				1121	continue; // Match strings from after a single code point.
				1122	}
				1123	// Match strings from after the next string match.
				1124	}
				1125	}
				1126	int32_t minOffset=offsets.popMinimum();
				1127	pos+=minOffset;
				1128	rest-=minOffset;
				1129	spanLength=0; // Match strings from after a string match.
				1130	}
				1131	}
				1132
				1133	int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
				1134	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
				1135	return spanNotBackUTF8(s, length);
				1136	}
				1137	int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
				1138	if(pos==0) {
				1139	return 0;
				1140	}
				1141	int32_t spanLength=length-pos;
				1142
				1143	// Consider strings; they may overlap with the span.
				1144	OffsetList offsets;
				1145	if(spanCondition==USET_SPAN_CONTAINED) {
				1146	// Use offset list to try all possibilities.
				1147	offsets.setMaxLength(maxLength8);
				1148	}
				1149	int32_t i, stringsLength=strings.size();
				1150	uint8_t *spanBackUTF8Lengths=spanLengths;
				1151	if(all) {
				1152	spanBackUTF8Lengths+=3*stringsLength;
				1153	}
				1154	for(;;) {
				1155	const uint8_t *s8=utf8;
				1156	int32_t length8;
				1157	if(spanCondition==USET_SPAN_CONTAINED) {
				1158	for(i=0; i<stringsLength; ++i) {
				1159	length8=utf8Lengths[i];
				1160	if(length8==0) {
				1161	continue; // String not representable in UTF-8.
				1162	}
				1163	int32_t overlap=spanBackUTF8Lengths[i];
				1164	if(overlap==ALL_CP_CONTAINED) {
				1165	s8+=length8;
				1166	continue; // Irrelevant string.
				1167	}
				1168
				1169	// Try to match this string at pos-(length8-overlap)..pos-length8.
				1170	if(overlap>=LONG_SPAN) {
				1171	overlap=length8;
				1172	// While contained: No point matching fully inside the code point span.
				1173	int32_t len1=0;
				1174	U8_FWD_1(s8, len1, overlap);
				1175	overlap-=len1; // Length of the string minus the first code point.
				1176	}
				1177	if(overlap>spanLength) {
				1178	overlap=spanLength;
				1179	}
				1180	int32_t dec=length8-overlap; // Keep dec+overlap==length8.
				1181	for(;;) {
				1182	if(dec>pos) {
				1183	break;
				1184	}
				1185	// Try to match if the decrement is not listed already.
				1186	// Match at code point boundaries. (The UTF-8 strings were converted
				1187	// from UTF-16 and are guaranteed to be well-formed.)
				1188	if( !U8_IS_TRAIL(s[pos-dec]) &&
				1189	!offsets.containsOffset(dec) &&
				1190	matches8(s+pos-dec, s8, length8)
				1191	) {
				1192	if(dec==pos) {
				1193	return 0; // Reached the start of the string.
				1194	}
				1195	offsets.addOffset(dec);
				1196	}
				1197	if(overlap==0) {
				1198	break;
				1199	}
				1200	--overlap;
				1201	++dec;
				1202	}
				1203	s8+=length8;
				1204	}
				1205	} else /* USET_SPAN_SIMPLE */ {
				1206	int32_t maxDec=0, maxOverlap=0;
				1207	for(i=0; i<stringsLength; ++i) {
				1208	length8=utf8Lengths[i];
				1209	if(length8==0) {
				1210	continue; // String not representable in UTF-8.
				1211	}
				1212	int32_t overlap=spanBackUTF8Lengths[i];
				1213	// For longest match, we do need to try to match even an all-contained string
				1214	// to find the match from the latest end.
				1215
				1216	// Try to match this string at pos-(length8-overlap)..pos-length8.
				1217	if(overlap>=LONG_SPAN) {
				1218	overlap=length8;
				1219	// Longest match: Need to match fully inside the code point span
				1220	// to find the match from the latest end.
				1221	}
				1222	if(overlap>spanLength) {
				1223	overlap=spanLength;
				1224	}
				1225	int32_t dec=length8-overlap; // Keep dec+overlap==length8.
				1226	for(;;) {
				1227	if(dec>pos \|\| overlap<maxOverlap) {
				1228	break;
				1229	}
				1230	// Try to match if the string is longer or ends later.
				1231	// Match at code point boundaries. (The UTF-8 strings were converted
				1232	// from UTF-16 and are guaranteed to be well-formed.)
				1233	if( !U8_IS_TRAIL(s[pos-dec]) &&
				1234	(overlap>maxOverlap \|\| /* redundant overlap==maxOverlap && */ dec>maxDec) &&
				1235	matches8(s+pos-dec, s8, length8)
				1236	) {
				1237	maxDec=dec; // Longest match from latest end.
				1238	maxOverlap=overlap;
				1239	break;
				1240	}
				1241	--overlap;
				1242	++dec;
				1243	}
				1244	s8+=length8;
				1245	}
				1246
				1247	if(maxDec!=0 \|\| maxOverlap!=0) {
				1248	// Longest-match algorithm, and there was a string match.
				1249	// Simply continue before it.
				1250	pos-=maxDec;
				1251	if(pos==0) {
				1252	return 0; // Reached the start of the string.
				1253	}
				1254	spanLength=0; // Match strings from before a string match.
				1255	continue;
				1256	}
				1257	}
				1258	// Finished trying to match all strings at pos.
				1259
				1260	if(spanLength!=0 \|\| pos==length) {
				1261	// The position is before an unlimited code point span (spanLength!=0),
				1262	// not before a string match.
				1263	// The only position where spanLength==0 before a span is pos==length.
				1264	// Otherwise, an unlimited code point span is only tried again when no
				1265	// strings match, and if such a non-initial span fails we stop.
				1266	if(offsets.isEmpty()) {
				1267	return pos; // No strings matched before a span.
				1268	}
				1269	// Match strings from before the next string match.
				1270	} else {
				1271	// The position is before a string match (or a single code point).
				1272	if(offsets.isEmpty()) {
				1273	// No more strings matched before a previous string match.
				1274	// Try another code point span from before the last string match.
				1275	int32_t oldPos=pos;
				1276	pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
				1277	spanLength=oldPos-pos;
				1278	if( pos==0 \|\| // Reached the start of the string, or
				1279	spanLength==0 // neither strings nor span progressed.
				1280	) {
				1281	return pos;
				1282	}
				1283	continue; // spanLength>0: Match strings from before a span.
				1284	} else {
				1285	// Try to match only one code point from before a string match if some
				1286	// string matched beyond it, so that we try all possible positions
				1287	// and don't overshoot.
				1288	spanLength=spanOneBackUTF8(spanSet, s, pos);
				1289	if(spanLength>0) {
				1290	if(spanLength==pos) {
				1291	return 0; // Reached the start of the string.
				1292	}
				1293	// Match strings before this code point.
				1294	// There cannot be any decrements below it because UnicodeSet strings
				1295	// contain multiple code points.
				1296	pos-=spanLength;
				1297	offsets.shift(spanLength);
				1298	spanLength=0;
				1299	continue; // Match strings from before a single code point.
				1300	}
				1301	// Match strings from before the next string match.
				1302	}
				1303	}
				1304	pos-=offsets.popMinimum();
				1305	spanLength=0; // Match strings from before a string match.
				1306	}
				1307	}
				1308
				1309	/*
				1310	* Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
				1311	*
				1312	* Theoretical algorithm:
				1313	* - Iterate through the string, and at each code point boundary:
				1314	* + If the code point there is in the set, then return with the current position.
				1315	* + If a set string matches at the current position, then return with the current position.
				1316	*
				1317	* Optimized implementation:
				1318	*
				1319	* (Same assumption as for span() above.)
				1320	*
				1321	* Create and cache a spanNotSet which contains all of the single code points
				1322	* of the original set but none of its strings.
				1323	* For each set string add its initial code point to the spanNotSet.
				1324	* (Also add its final code point for spanNotBack().)
				1325	*
				1326	* - Loop:
				1327	* + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
				1328	* + If the current code point is in the original set, then
				1329	* return the current position.
				1330	* + If any set string matches at the current position, then
				1331	* return the current position.
				1332	* + If there is no match at the current position, neither for the code point there
				1333	* nor for any set string, then skip this code point and continue the loop.
				1334	* This happens for set-string-initial code points that were added to spanNotSet
				1335	* when there is not actually a match for such a set string.
				1336	*/
				1337
				1338	int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
				1339	int32_t pos=0, rest=length;
				1340	int32_t i, stringsLength=strings.size();
				1341	do {
				1342	// Span until we find a code point from the set,
				1343	// or a code point that starts or ends some string.
				1344	i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
				1345	if(i==rest) {
				1346	return length; // Reached the end of the string.
				1347	}
				1348	pos+=i;
				1349	rest-=i;
				1350
				1351	// Check whether the current code point is in the original set,
				1352	// without the string starts and ends.
				1353	int32_t cpLength=spanOne(spanSet, s+pos, rest);
				1354	if(cpLength>0) {
				1355	return pos; // There is a set element at pos.
				1356	}
				1357
				1358	// Try to match the strings at pos.
				1359	for(i=0; i<stringsLength; ++i) {
				1360	if(spanLengths[i]==ALL_CP_CONTAINED) {
				1361	continue; // Irrelevant string.
				1362	}
				1363	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
				1364	const UChar *s16=string.getBuffer();
				1365	int32_t length16=string.length();
				1366	if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
				1367	return pos; // There is a set element at pos.
				1368	}
				1369	}
				1370
				1371	// The span(while not contained) ended on a string start/end which is
				1372	// not in the original set. Skip this code point and continue.
				1373	// cpLength<0
				1374	pos-=cpLength;
				1375	rest+=cpLength;
				1376	} while(rest!=0);
				1377	return length; // Reached the end of the string.
				1378	}
				1379
				1380	int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
				1381	int32_t pos=length;
				1382	int32_t i, stringsLength=strings.size();
				1383	do {
				1384	// Span until we find a code point from the set,
				1385	// or a code point that starts or ends some string.
				1386	pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
				1387	if(pos==0) {
				1388	return 0; // Reached the start of the string.
				1389	}
				1390
				1391	// Check whether the current code point is in the original set,
				1392	// without the string starts and ends.
				1393	int32_t cpLength=spanOneBack(spanSet, s, pos);
				1394	if(cpLength>0) {
				1395	return pos; // There is a set element at pos.
				1396	}
				1397
				1398	// Try to match the strings at pos.
				1399	for(i=0; i<stringsLength; ++i) {
				1400	// Use spanLengths rather than a spanBackLengths pointer because
				1401	// it is easier and we only need to know whether the string is irrelevant
				1402	// which is the same in either array.
				1403	if(spanLengths[i]==ALL_CP_CONTAINED) {
				1404	continue; // Irrelevant string.
				1405	}
				1406	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
				1407	const UChar *s16=string.getBuffer();
				1408	int32_t length16=string.length();
				1409	if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
				1410	return pos; // There is a set element at pos.
				1411	}
				1412	}
				1413
				1414	// The span(while not contained) ended on a string start/end which is
				1415	// not in the original set. Skip this code point and continue.
				1416	// cpLength<0
				1417	pos+=cpLength;
				1418	} while(pos!=0);
				1419	return 0; // Reached the start of the string.
				1420	}
				1421
				1422	int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
				1423	int32_t pos=0, rest=length;
				1424	int32_t i, stringsLength=strings.size();
				1425	uint8_t *spanUTF8Lengths=spanLengths;
				1426	if(all) {
				1427	spanUTF8Lengths+=2*stringsLength;
				1428	}
				1429	do {
				1430	// Span until we find a code point from the set,
				1431	// or a code point that starts or ends some string.
				1432	i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
				1433	if(i==rest) {
				1434	return length; // Reached the end of the string.
				1435	}
				1436	pos+=i;
				1437	rest-=i;
				1438
				1439	// Check whether the current code point is in the original set,
				1440	// without the string starts and ends.
				1441	int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
				1442	if(cpLength>0) {
				1443	return pos; // There is a set element at pos.
				1444	}
				1445
				1446	// Try to match the strings at pos.
				1447	const uint8_t *s8=utf8;
				1448	int32_t length8;
				1449	for(i=0; i<stringsLength; ++i) {
				1450	length8=utf8Lengths[i];
				1451	// ALL_CP_CONTAINED: Irrelevant string.
				1452	if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
				1453	return pos; // There is a set element at pos.
				1454	}
				1455	s8+=length8;
				1456	}
				1457
				1458	// The span(while not contained) ended on a string start/end which is
				1459	// not in the original set. Skip this code point and continue.
				1460	// cpLength<0
				1461	pos-=cpLength;
				1462	rest+=cpLength;
				1463	} while(rest!=0);
				1464	return length; // Reached the end of the string.
				1465	}
				1466
				1467	int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
				1468	int32_t pos=length;
				1469	int32_t i, stringsLength=strings.size();
				1470	uint8_t *spanBackUTF8Lengths=spanLengths;
				1471	if(all) {
				1472	spanBackUTF8Lengths+=3*stringsLength;
				1473	}
				1474	do {
				1475	// Span until we find a code point from the set,
				1476	// or a code point that starts or ends some string.
				1477	pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
				1478	if(pos==0) {
				1479	return 0; // Reached the start of the string.
				1480	}
				1481
				1482	// Check whether the current code point is in the original set,
				1483	// without the string starts and ends.
				1484	int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
				1485	if(cpLength>0) {
				1486	return pos; // There is a set element at pos.
				1487	}
				1488
				1489	// Try to match the strings at pos.
				1490	const uint8_t *s8=utf8;
				1491	int32_t length8;
				1492	for(i=0; i<stringsLength; ++i) {
				1493	length8=utf8Lengths[i];
				1494	// ALL_CP_CONTAINED: Irrelevant string.
				1495	if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
				1496	return pos; // There is a set element at pos.
				1497	}
				1498	s8+=length8;
				1499	}
				1500
				1501	// The span(while not contained) ended on a string start/end which is
				1502	// not in the original set. Skip this code point and continue.
				1503	// cpLength<0
				1504	pos+=cpLength;
				1505	} while(pos!=0);
				1506	return 0; // Reached the start of the string.
				1507	}
				1508
				1509	U_NAMESPACE_END