Blame - source/common/uniset.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 202427761680d663dcd72dff5e9dc6f80ff2239d [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	3	/*
				4	**********************************************************************
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	5	* Copyright (C) 1999-2015, International Business Machines
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	6	* Corporation and others. All Rights Reserved.
				7	**********************************************************************
				8	* Date Name Description
				9	* 10/20/99 alan Creation.
				10	**********************************************************************
				11	*/
				12
				13	#include "unicode/utypes.h"
				14	#include "unicode/parsepos.h"
				15	#include "unicode/symtable.h"
				16	#include "unicode/uniset.h"
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	17	#include "unicode/ustring.h"
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	18	#include "unicode/utf8.h"
				19	#include "unicode/utf16.h"
				20	#include "ruleiter.h"
				21	#include "cmemory.h"
				22	#include "cstring.h"
				23	#include "patternprops.h"
				24	#include "uelement.h"
				25	#include "util.h"
				26	#include "uvector.h"
				27	#include "charstr.h"
				28	#include "ustrfmt.h"
				29	#include "uassert.h"
				30	#include "bmpset.h"
				31	#include "unisetspan.h"
				32
				33	// Define UChar constants using hex for EBCDIC compatibility
				34	// Used #define to reduce private static exports and memory access time.
				35	#define SET_OPEN ((UChar)0x005B) /[/
				36	#define SET_CLOSE ((UChar)0x005D) /]/
				37	#define HYPHEN ((UChar)0x002D) /-/
				38	#define COMPLEMENT ((UChar)0x005E) /^/
				39	#define COLON ((UChar)0x003A) /:/
				40	#define BACKSLASH ((UChar)0x005C) /\/
				41	#define INTERSECTION ((UChar)0x0026) /&/
				42	#define UPPER_U ((UChar)0x0055) /U/
				43	#define LOWER_U ((UChar)0x0075) /u/
				44	#define OPEN_BRACE ((UChar)123) /{/
				45	#define CLOSE_BRACE ((UChar)125) /}/
				46	#define UPPER_P ((UChar)0x0050) /P/
				47	#define LOWER_P ((UChar)0x0070) /p/
				48	#define UPPER_N ((UChar)78) /N/
				49	#define EQUALS ((UChar)0x003D) /=/
				50
				51	// HIGH_VALUE > all valid values. 110000 for codepoints
				52	#define UNICODESET_HIGH 0x0110000
				53
				54	// LOW <= all valid values. ZERO for codepoints
				55	#define UNICODESET_LOW 0x000000
				56
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	57	/** Max list [0, 1, 2, ..., max code point, HIGH] */
				58	constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	59
				60	U_NAMESPACE_BEGIN
				61
				62	SymbolTable::~SymbolTable() {}
				63
				64	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
				65
				66	/**
				67	* Modify the given UChar32 variable so that it is in range, by
				68	* pinning values < UNICODESET_LOW to UNICODESET_LOW, and
				69	* pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
				70	* It modifies its argument in-place and also returns it.
				71	*/
				72	static inline UChar32 pinCodePoint(UChar32& c) {
				73	if (c < UNICODESET_LOW) {
				74	c = UNICODESET_LOW;
				75	} else if (c > (UNICODESET_HIGH-1)) {
				76	c = (UNICODESET_HIGH-1);
				77	}
				78	return c;
				79	}
				80
				81	//----------------------------------------------------------------
				82	// Debugging
				83	//----------------------------------------------------------------
				84
				85	// DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
				86	// To enable the debugging, define the symbol DEBUG_MEM in the line
				87	// below. This will result in text being sent to stdout that looks
				88	// like this:
				89	// DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
				90	// DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
				91	// Each line lists a construction (ct) or destruction (dt) event, the
				92	// object address, the number of outstanding objects after the event,
				93	// and the pattern of the object in question.
				94
				95	// #define DEBUG_MEM
				96
				97	#ifdef DEBUG_MEM
				98	#include <stdio.h>
				99	static int32_t _dbgCount = 0;
				100
				101	static inline void _dbgct(UnicodeSet* set) {
				102	UnicodeString str;
				103	set->toPattern(str, TRUE);
				104	char buf[40];
				105	str.extract(0, 39, buf, "");
				106	printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
				107	}
				108
				109	static inline void _dbgdt(UnicodeSet* set) {
				110	UnicodeString str;
				111	set->toPattern(str, TRUE);
				112	char buf[40];
				113	str.extract(0, 39, buf, "");
				114	printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
				115	}
				116
				117	#else
				118
				119	#define _dbgct(set)
				120	#define _dbgdt(set)
				121
				122	#endif
				123
				124	//----------------------------------------------------------------
				125	// UnicodeString in UVector support
				126	//----------------------------------------------------------------
				127
				128	static void U_CALLCONV cloneUnicodeString(UElement dst, UElement src) {
				129	dst->pointer = new UnicodeString((UnicodeString)src->pointer);
				130	}
				131
				132	static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
				133	const UnicodeString &a = (const UnicodeString)t1.pointer;
				134	const UnicodeString &b = (const UnicodeString)t2.pointer;
				135	return a.compare(b);
				136	}
				137
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	138	UBool UnicodeSet::hasStrings() const {
				139	return strings != nullptr && !strings->isEmpty();
				140	}
				141
				142	int32_t UnicodeSet::stringsSize() const {
				143	return strings == nullptr ? 0 : strings->size();
				144	}
				145
				146	UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
				147	return strings != nullptr && strings->contains((void*) &s);
				148	}
				149
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	150	//----------------------------------------------------------------
				151	// Constructors &c
				152	//----------------------------------------------------------------
				153
				154	/**
				155	* Constructs an empty set.
				156	*/
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	157	UnicodeSet::UnicodeSet() {
				158	list[0] = UNICODESET_HIGH;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	159	_dbgct(this);
				160	}
				161
				162	/**
				163	* Constructs a set containing the given range. If <code>end >
				164	* start</code> then an empty set is created.
				165	*
				166	* @param start first character, inclusive, of range
				167	* @param end last character, inclusive, of range
				168	*/
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	169	UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
				170	list[0] = UNICODESET_HIGH;
				171	add(start, end);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	172	_dbgct(this);
				173	}
				174
				175	/**
				176	* Constructs a set that is identical to the given UnicodeSet.
				177	*/
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	178	UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {
				179	*this = o;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	180	_dbgct(this);
				181	}
				182
				183	// Copy-construct as thawed.
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	184	UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) {
				185	if (ensureCapacity(o.len)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	186	// *this = o except for bmpSet and stringSpan
				187	len = o.len;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	188	uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	189	if (o.hasStrings()) {
				190	UErrorCode status = U_ZERO_ERROR;
				191	if (!allocateStrings(status) \|\|
				192	(strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
				193	setToBogus();
				194	return;
				195	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	196	}
				197	if (o.pat) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	198	setPattern(o.pat, o.patLen);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	199	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	200	_dbgct(this);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	201	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	202	}
				203
				204	/**
				205	* Destructs the set.
				206	*/
				207	UnicodeSet::~UnicodeSet() {
				208	_dbgdt(this); // first!
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	209	if (list != stackList) {
				210	uprv_free(list);
				211	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	212	delete bmpSet;
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	213	if (buffer != stackList) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	214	uprv_free(buffer);
				215	}
				216	delete strings;
				217	delete stringSpan;
				218	releasePattern();
				219	}
				220
				221	/**
				222	* Assigns this object to be a copy of another.
				223	*/
				224	UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	225	return copyFrom(o, FALSE);
				226	}
				227
				228	UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	229	if (this == &o) {
				230	return *this;
				231	}
				232	if (isFrozen()) {
				233	return *this;
				234	}
				235	if (o.isBogus()) {
				236	setToBogus();
				237	return *this;
				238	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	239	if (!ensureCapacity(o.len)) {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	240	// ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
				241	return *this;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	242	}
				243	len = o.len;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	244	uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	245	if (o.bmpSet != nullptr && !asThawed) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	246	bmpSet = new BMPSet(*o.bmpSet, list, len);
				247	if (bmpSet == NULL) { // Check for memory allocation error.
				248	setToBogus();
				249	return *this;
				250	}
				251	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	252	if (o.hasStrings()) {
				253	UErrorCode status = U_ZERO_ERROR;
				254	if ((strings == nullptr && !allocateStrings(status)) \|\|
				255	(strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
				256	setToBogus();
				257	return *this;
				258	}
				259	} else if (hasStrings()) {
				260	strings->removeAllElements();
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	261	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	262	if (o.stringSpan != nullptr && !asThawed) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	263	stringSpan = new UnicodeSetStringSpan(o.stringSpan, strings);
				264	if (stringSpan == NULL) { // Check for memory allocation error.
				265	setToBogus();
				266	return *this;
				267	}
				268	}
				269	releasePattern();
				270	if (o.pat) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	271	setPattern(o.pat, o.patLen);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	272	}
				273	return *this;
				274	}
				275
				276	/**
				277	* Returns a copy of this object. All UnicodeMatcher objects have
				278	* to support cloning in order to allow classes using
				279	* UnicodeMatchers, such as Transliterator, to implement cloning.
				280	*/
				281	UnicodeFunctor* UnicodeSet::clone() const {
				282	return new UnicodeSet(*this);
				283	}
				284
				285	UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
				286	return new UnicodeSet(*this, TRUE);
				287	}
				288
				289	/**
				290	* Compares the specified object with this set for equality. Returns
				291	* <tt>true</tt> if the two sets
				292	* have the same size, and every member of the specified set is
				293	* contained in this set (or equivalently, every member of this set is
				294	* contained in the specified set).
				295	*
				296	* @param o set to be compared for equality with this set.
				297	* @return <tt>true</tt> if the specified set is equal to this set.
				298	*/
				299	UBool UnicodeSet::operator==(const UnicodeSet& o) const {
				300	if (len != o.len) return FALSE;
				301	for (int32_t i = 0; i < len; ++i) {
				302	if (list[i] != o.list[i]) return FALSE;
				303	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	304	if (hasStrings() != o.hasStrings()) { return FALSE; }
				305	if (hasStrings() && strings != o.strings) return FALSE;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	306	return TRUE;
				307	}
				308
				309	/**
				310	* Returns the hash code value for this set.
				311	*
				312	* @return the hash code value for this set.
				313	* @see Object#hashCode()
				314	*/
				315	int32_t UnicodeSet::hashCode(void) const {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	316	uint32_t result = static_cast<uint32_t>(len);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	317	for (int32_t i = 0; i < len; ++i) {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	318	result *= 1000003u;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	319	result += list[i];
				320	}
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	321	return static_cast<int32_t>(result);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	322	}
				323
				324	//----------------------------------------------------------------
				325	// Public API
				326	//----------------------------------------------------------------
				327
				328	/**
				329	* Returns the number of elements in this set (its cardinality),
				330	* Note than the elements of a set may include both individual
				331	* codepoints and strings.
				332	*
				333	* @return the number of elements in this set (its cardinality).
				334	*/
				335	int32_t UnicodeSet::size(void) const {
				336	int32_t n = 0;
				337	int32_t count = getRangeCount();
				338	for (int32_t i = 0; i < count; ++i) {
				339	n += getRangeEnd(i) - getRangeStart(i) + 1;
				340	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	341	return n + stringsSize();
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	342	}
				343
				344	/**
				345	* Returns <tt>true</tt> if this set contains no elements.
				346	*
				347	* @return <tt>true</tt> if this set contains no elements.
				348	*/
				349	UBool UnicodeSet::isEmpty(void) const {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	350	return len == 1 && !hasStrings();
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	351	}
				352
				353	/**
				354	* Returns true if this set contains the given character.
				355	* @param c character to be checked for containment
				356	* @return true if the test condition is met
				357	*/
				358	UBool UnicodeSet::contains(UChar32 c) const {
				359	// Set i to the index of the start item greater than ch
				360	// We know we will terminate without length test!
				361	// LATER: for large sets, add binary search
				362	//int32_t i = -1;
				363	//for (;;) {
				364	// if (c < list[++i]) break;
				365	//}
				366	if (bmpSet != NULL) {
				367	return bmpSet->contains(c);
				368	}
				369	if (stringSpan != NULL) {
				370	return stringSpan->contains(c);
				371	}
				372	if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
				373	return FALSE;
				374	}
				375	int32_t i = findCodePoint(c);
				376	return (UBool)(i & 1); // return true if odd
				377	}
				378
				379	/**
				380	* Returns the smallest value i such that c < list[i]. Caller
				381	* must ensure that c is a legal value or this method will enter
				382	* an infinite loop. This method performs a binary search.
				383	* @param c a character in the range MIN_VALUE..MAX_VALUE
				384	* inclusive
				385	* @return the smallest integer i in the range 0..len-1,
				386	* inclusive, such that c < list[i]
				387	*/
				388	int32_t UnicodeSet::findCodePoint(UChar32 c) const {
				389	/* Examples:
				390	findCodePoint(c)
				391	set list[] c=0 1 3 4 7 8
				392	=== ============== ===========
				393	[] [110000] 0 0 0 0 0 0
				394	[\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
				395	[\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
				396	[:Any:] [0, 110000] 1 1 1 1 1 1
				397	*/
				398
				399	// Return the smallest i such that c < list[i]. Assume
				400	// list[len - 1] == HIGH and that c is legal (0..HIGH-1).
				401	if (c < list[0])
				402	return 0;
				403	// High runner test. c is often after the last range, so an
				404	// initial check for this condition pays off.
				405	int32_t lo = 0;
				406	int32_t hi = len - 1;
				407	if (lo >= hi \|\| c >= list[hi-1])
				408	return hi;
				409	// invariant: c >= list[lo]
				410	// invariant: c < list[hi]
				411	for (;;) {
				412	int32_t i = (lo + hi) >> 1;
				413	if (i == lo) {
				414	break; // Found!
				415	} else if (c < list[i]) {
				416	hi = i;
				417	} else {
				418	lo = i;
				419	}
				420	}
				421	return hi;
				422	}
				423
				424	/**
				425	* Returns true if this set contains every character
				426	* of the given range.
				427	* @param start first character, inclusive, of the range
				428	* @param end last character, inclusive, of the range
				429	* @return true if the test condition is met
				430	*/
				431	UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
				432	//int32_t i = -1;
				433	//for (;;) {
				434	// if (start < list[++i]) break;
				435	//}
				436	int32_t i = findCodePoint(start);
				437	return ((i & 1) != 0 && end < list[i]);
				438	}
				439
				440	/**
				441	* Returns <tt>true</tt> if this set contains the given
				442	* multicharacter string.
				443	* @param s string to be checked for containment
				444	* @return <tt>true</tt> if this set contains the specified string
				445	*/
				446	UBool UnicodeSet::contains(const UnicodeString& s) const {
				447	if (s.length() == 0) return FALSE;
				448	int32_t cp = getSingleCP(s);
				449	if (cp < 0) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	450	return stringsContains(s);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	451	} else {
				452	return contains((UChar32) cp);
				453	}
				454	}
				455
				456	/**
				457	* Returns true if this set contains all the characters and strings
				458	* of the given set.
				459	* @param c set to be checked for containment
				460	* @return true if the test condition is met
				461	*/
				462	UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
				463	// The specified set is a subset if all of its pairs are contained in
				464	// this set. It's possible to code this more efficiently in terms of
				465	// direct manipulation of the inversion lists if the need arises.
				466	int32_t n = c.getRangeCount();
				467	for (int i=0; i<n; ++i) {
				468	if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
				469	return FALSE;
				470	}
				471	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	472	return !c.hasStrings() \|\| (strings != nullptr && strings->containsAll(*c.strings));
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	473	}
				474
				475	/**
				476	* Returns true if this set contains all the characters
				477	* of the given string.
				478	* @param s string containing characters to be checked for containment
				479	* @return true if the test condition is met
				480	*/
				481	UBool UnicodeSet::containsAll(const UnicodeString& s) const {
				482	return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
				483	s.length());
				484	}
				485
				486	/**
				487	* Returns true if this set contains none of the characters
				488	* of the given range.
				489	* @param start first character, inclusive, of the range
				490	* @param end last character, inclusive, of the range
				491	* @return true if the test condition is met
				492	*/
				493	UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
				494	//int32_t i = -1;
				495	//for (;;) {
				496	// if (start < list[++i]) break;
				497	//}
				498	int32_t i = findCodePoint(start);
				499	return ((i & 1) == 0 && end < list[i]);
				500	}
				501
				502	/**
				503	* Returns true if this set contains none of the characters and strings
				504	* of the given set.
				505	* @param c set to be checked for containment
				506	* @return true if the test condition is met
				507	*/
				508	UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
				509	// The specified set is a subset if all of its pairs are contained in
				510	// this set. It's possible to code this more efficiently in terms of
				511	// direct manipulation of the inversion lists if the need arises.
				512	int32_t n = c.getRangeCount();
				513	for (int32_t i=0; i<n; ++i) {
				514	if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
				515	return FALSE;
				516	}
				517	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	518	return strings == nullptr \|\| !c.hasStrings() \|\| strings->containsNone(*c.strings);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	519	}
				520
				521	/**
				522	* Returns true if this set contains none of the characters
				523	* of the given string.
				524	* @param s string containing characters to be checked for containment
				525	* @return true if the test condition is met
				526	*/
				527	UBool UnicodeSet::containsNone(const UnicodeString& s) const {
				528	return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
				529	s.length());
				530	}
				531
				532	/**
				533	* Returns <tt>true</tt> if this set contains any character whose low byte
				534	* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
				535	* indexing.
				536	*/
				537	UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
				538	/* The index value v, in the range [0,255], is contained in this set if
				539	* it is contained in any pair of this set. Pairs either have the high
				540	* bytes equal, or unequal. If the high bytes are equal, then we have
				541	* aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
				542	* v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
				543	* Then v is contained if xx <= v \|\| v <= yy. (This is identical to the
				544	* time zone month containment logic.)
				545	*/
				546	int32_t i;
				547	int32_t rangeCount=getRangeCount();
				548	for (i=0; i<rangeCount; ++i) {
				549	UChar32 low = getRangeStart(i);
				550	UChar32 high = getRangeEnd(i);
				551	if ((low & ~0xFF) == (high & ~0xFF)) {
				552	if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
				553	return TRUE;
				554	}
				555	} else if ((low & 0xFF) <= v \|\| v <= (high & 0xFF)) {
				556	return TRUE;
				557	}
				558	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	559	if (hasStrings()) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	560	for (i=0; i<strings->size(); ++i) {
				561	const UnicodeString& s = (const UnicodeString)strings->elementAt(i);
				562	//if (s.length() == 0) {
				563	// // Empty strings match everything
				564	// return TRUE;
				565	//}
				566	// assert(s.length() != 0); // We enforce this elsewhere
				567	UChar32 c = s.char32At(0);
				568	if ((c & 0xFF) == v) {
				569	return TRUE;
				570	}
				571	}
				572	}
				573	return FALSE;
				574	}
				575
				576	/**
				577	* Implementation of UnicodeMatcher::matches(). Always matches the
				578	* longest possible multichar string.
				579	*/
				580	UMatchDegree UnicodeSet::matches(const Replaceable& text,
				581	int32_t& offset,
				582	int32_t limit,
				583	UBool incremental) {
				584	if (offset == limit) {
				585	// Strings, if any, have length != 0, so we don't worry
				586	// about them here. If we ever allow zero-length strings
				587	// we much check for them here.
				588	if (contains(U_ETHER)) {
				589	return incremental ? U_PARTIAL_MATCH : U_MATCH;
				590	} else {
				591	return U_MISMATCH;
				592	}
				593	} else {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	594	if (hasStrings()) { // try strings first
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	595
				596	// might separate forward and backward loops later
				597	// for now they are combined
				598
				599	// TODO Improve efficiency of this, at least in the forward
				600	// direction, if not in both. In the forward direction we
				601	// can assume the strings are sorted.
				602
				603	int32_t i;
				604	UBool forward = offset < limit;
				605
				606	// firstChar is the leftmost char to match in the
				607	// forward direction or the rightmost char to match in
				608	// the reverse direction.
				609	UChar firstChar = text.charAt(offset);
				610
				611	// If there are multiple strings that can match we
				612	// return the longest match.
				613	int32_t highWaterLength = 0;
				614
				615	for (i=0; i<strings->size(); ++i) {
				616	const UnicodeString& trial = (const UnicodeString)strings->elementAt(i);
				617
				618	//if (trial.length() == 0) {
				619	// return U_MATCH; // null-string always matches
				620	//}
				621	// assert(trial.length() != 0); // We ensure this elsewhere
				622
				623	UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
				624
				625	// Strings are sorted, so we can optimize in the
				626	// forward direction.
				627	if (forward && c > firstChar) break;
				628	if (c != firstChar) continue;
				629
				630	int32_t matchLen = matchRest(text, offset, limit, trial);
				631
				632	if (incremental) {
				633	int32_t maxLen = forward ? limit-offset : offset-limit;
				634	if (matchLen == maxLen) {
				635	// We have successfully matched but only up to limit.
				636	return U_PARTIAL_MATCH;
				637	}
				638	}
				639
				640	if (matchLen == trial.length()) {
				641	// We have successfully matched the whole string.
				642	if (matchLen > highWaterLength) {
				643	highWaterLength = matchLen;
				644	}
				645	// In the forward direction we know strings
				646	// are sorted so we can bail early.
				647	if (forward && matchLen < highWaterLength) {
				648	break;
				649	}
				650	continue;
				651	}
				652	}
				653
				654	// We've checked all strings without a partial match.
				655	// If we have full matches, return the longest one.
				656	if (highWaterLength != 0) {
				657	offset += forward ? highWaterLength : -highWaterLength;
				658	return U_MATCH;
				659	}
				660	}
				661	return UnicodeFilter::matches(text, offset, limit, incremental);
				662	}
				663	}
				664
				665	/**
				666	* Returns the longest match for s in text at the given position.
				667	* If limit > start then match forward from start+1 to limit
				668	* matching all characters except s.charAt(0). If limit < start,
				669	* go backward starting from start-1 matching all characters
				670	* except s.charAt(s.length()-1). This method assumes that the
				671	* first character, text.charAt(start), matches s, so it does not
				672	* check it.
				673	* @param text the text to match
				674	* @param start the first character to match. In the forward
				675	* direction, text.charAt(start) is matched against s.charAt(0).
				676	* In the reverse direction, it is matched against
				677	* s.charAt(s.length()-1).
				678	* @param limit the limit offset for matching, either last+1 in
				679	* the forward direction, or last-1 in the reverse direction,
				680	* where last is the index of the last character to match.
				681	* @return If part of s matches up to the limit, return \|limit -
				682	* start\|. If all of s matches before reaching the limit, return
				683	* s.length(). If there is a mismatch between s and text, return
				684	* 0
				685	*/
				686	int32_t UnicodeSet::matchRest(const Replaceable& text,
				687	int32_t start, int32_t limit,
				688	const UnicodeString& s) {
				689	int32_t i;
				690	int32_t maxLen;
				691	int32_t slen = s.length();
				692	if (start < limit) {
				693	maxLen = limit - start;
				694	if (maxLen > slen) maxLen = slen;
				695	for (i = 1; i < maxLen; ++i) {
				696	if (text.charAt(start + i) != s.charAt(i)) return 0;
				697	}
				698	} else {
				699	maxLen = start - limit;
				700	if (maxLen > slen) maxLen = slen;
				701	--slen; // <=> slen = s.length() - 1;
				702	for (i = 1; i < maxLen; ++i) {
				703	if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
				704	}
				705	}
				706	return maxLen;
				707	}
				708
				709	/**
				710	* Implement of UnicodeMatcher
				711	*/
				712	void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
				713	toUnionTo.addAll(*this);
				714	}
				715
				716	/**
				717	* Returns the index of the given character within this set, where
				718	* the set is ordered by ascending code point. If the character
				719	* is not in this set, return -1. The inverse of this method is
				720	* <code>charAt()</code>.
				721	* @return an index from 0..size()-1, or -1
				722	*/
				723	int32_t UnicodeSet::indexOf(UChar32 c) const {
				724	if (c < MIN_VALUE \|\| c > MAX_VALUE) {
				725	return -1;
				726	}
				727	int32_t i = 0;
				728	int32_t n = 0;
				729	for (;;) {
				730	UChar32 start = list[i++];
				731	if (c < start) {
				732	return -1;
				733	}
				734	UChar32 limit = list[i++];
				735	if (c < limit) {
				736	return n + c - start;
				737	}
				738	n += limit - start;
				739	}
				740	}
				741
				742	/**
				743	* Returns the character at the given index within this set, where
				744	* the set is ordered by ascending code point. If the index is
				745	* out of range, return (UChar32)-1. The inverse of this method is
				746	* <code>indexOf()</code>.
				747	* @param index an index from 0..size()-1
				748	* @return the character at the given index, or (UChar32)-1.
				749	*/
				750	UChar32 UnicodeSet::charAt(int32_t index) const {
				751	if (index >= 0) {
				752	// len2 is the largest even integer <= len, that is, it is len
				753	// for even values and len-1 for odd values. With odd values
				754	// the last entry is UNICODESET_HIGH.
				755	int32_t len2 = len & ~1;
				756	for (int32_t i=0; i < len2;) {
				757	UChar32 start = list[i++];
				758	int32_t count = list[i++] - start;
				759	if (index < count) {
				760	return (UChar32)(start + index);
				761	}
				762	index -= count;
				763	}
				764	}
				765	return (UChar32)-1;
				766	}
				767
				768	/**
				769	* Make this object represent the range <code>start - end</code>.
				770	* If <code>end > start</code> then this object is set to an
				771	* an empty range.
				772	*
				773	* @param start first character in the set, inclusive
				774	* @rparam end last character in the set, inclusive
				775	*/
				776	UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
				777	clear();
				778	complement(start, end);
				779	return *this;
				780	}
				781
				782	/**
				783	* Adds the specified range to this set if it is not already
				784	* present. If this set already contains the specified range,
				785	* the call leaves this set unchanged. If <code>end > start</code>
				786	* then an empty range is added, leaving the set unchanged.
				787	*
				788	* @param start first character, inclusive, of range to be added
				789	* to this set.
				790	* @param end last character, inclusive, of range to be added
				791	* to this set.
				792	*/
				793	UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
				794	if (pinCodePoint(start) < pinCodePoint(end)) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	795	UChar32 limit = end + 1;
				796	// Fast path for adding a new range after the last one.
				797	// Odd list length: [..., lastStart, lastLimit, HIGH]
				798	if ((len & 1) != 0) {
				799	// If the list is empty, set lastLimit low enough to not be adjacent to 0.
				800	UChar32 lastLimit = len == 1 ? -2 : list[len - 2];
				801	if (lastLimit <= start && !isFrozen() && !isBogus()) {
				802	if (lastLimit == start) {
				803	// Extend the last range.
				804	list[len - 2] = limit;
				805	if (limit == UNICODESET_HIGH) {
				806	--len;
				807	}
				808	} else {
				809	list[len - 1] = start;
				810	if (limit < UNICODESET_HIGH) {
				811	if (ensureCapacity(len + 2)) {
				812	list[len++] = limit;
				813	list[len++] = UNICODESET_HIGH;
				814	}
				815	} else { // limit == UNICODESET_HIGH
				816	if (ensureCapacity(len + 1)) {
				817	list[len++] = UNICODESET_HIGH;
				818	}
				819	}
				820	}
				821	releasePattern();
				822	return *this;
				823	}
				824	}
				825	// This is slow. Could be much faster using findCodePoint(start)
				826	// and modifying the list, dealing with adjacent & overlapping ranges.
				827	UChar32 range[3] = { start, limit, UNICODESET_HIGH };
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	828	add(range, 2, 0);
				829	} else if (start == end) {
				830	add(start);
				831	}
				832	return *this;
				833	}
				834
				835	// #define DEBUG_US_ADD
				836
				837	#ifdef DEBUG_US_ADD
				838	#include <stdio.h>
				839	void dump(UChar32 c) {
				840	if (c <= 0xFF) {
				841	printf("%c", (char)c);
				842	} else {
				843	printf("U+%04X", c);
				844	}
				845	}
				846	void dump(const UChar32* list, int32_t len) {
				847	printf("[");
				848	for (int32_t i=0; i<len; ++i) {
				849	if (i != 0) printf(", ");
				850	dump(list[i]);
				851	}
				852	printf("]");
				853	}
				854	#endif
				855
				856	/**
				857	* Adds the specified character to this set if it is not already
				858	* present. If this set already contains the specified character,
				859	* the call leaves this set unchanged.
				860	*/
				861	UnicodeSet& UnicodeSet::add(UChar32 c) {
				862	// find smallest i such that c < list[i]
				863	// if odd, then it is IN the set
				864	// if even, then it is OUT of the set
				865	int32_t i = findCodePoint(pinCodePoint(c));
				866
				867	// already in set?
				868	if ((i & 1) != 0 \|\| isFrozen() \|\| isBogus()) return *this;
				869
				870	// HIGH is 0x110000
				871	// assert(list[len-1] == HIGH);
				872
				873	// empty = [HIGH]
				874	// [start_0, limit_0, start_1, limit_1, HIGH]
				875
				876	// [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
				877	// ^
				878	// list[i]
				879
				880	// i == 0 means c is before the first range
				881
				882	#ifdef DEBUG_US_ADD
				883	printf("Add of ");
				884	dump(c);
				885	printf(" found at %d", i);
				886	printf(": ");
				887	dump(list, len);
				888	printf(" => ");
				889	#endif
				890
				891	if (c == list[i]-1) {
				892	// c is before start of next range
				893	list[i] = c;
				894	// if we touched the HIGH mark, then add a new one
				895	if (c == (UNICODESET_HIGH - 1)) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	896	if (!ensureCapacity(len+1)) {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	897	// ensureCapacity will mark the object as Bogus if OOM failure happens.
				898	return *this;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	899	}
				900	list[len++] = UNICODESET_HIGH;
				901	}
				902	if (i > 0 && c == list[i-1]) {
				903	// collapse adjacent ranges
				904
				905	// [..., start_k-1, c, c, limit_k, ..., HIGH]
				906	// ^
				907	// list[i]
				908
				909	//for (int32_t k=i-1; k<len-2; ++k) {
				910	// list[k] = list[k+2];
				911	//}
				912	UChar32* dst = list + i - 1;
				913	UChar32* src = dst + 2;
				914	UChar32* srclimit = list + len;
				915	while (src < srclimit) (dst++) = (src++);
				916
				917	len -= 2;
				918	}
				919	}
				920
				921	else if (i > 0 && c == list[i-1]) {
				922	// c is after end of prior range
				923	list[i-1]++;
				924	// no need to check for collapse here
				925	}
				926
				927	else {
				928	// At this point we know the new char is not adjacent to
				929	// any existing ranges, and it is not 10FFFF.
				930
				931
				932	// [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
				933	// ^
				934	// list[i]
				935
				936	// [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
				937	// ^
				938	// list[i]
				939
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	940	if (!ensureCapacity(len+2)) {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	941	// ensureCapacity will mark the object as Bogus if OOM failure happens.
				942	return *this;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	943	}
				944
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	945	UChar32 *p = list + i;
				946	uprv_memmove(p + 2, p, (len - i) * sizeof(*p));
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	947	list[i] = c;
				948	list[i+1] = c+1;
				949	len += 2;
				950	}
				951
				952	#ifdef DEBUG_US_ADD
				953	dump(list, len);
				954	printf("\n");
				955
				956	for (i=1; i<len; ++i) {
				957	if (list[i] <= list[i-1]) {
				958	// Corrupt array!
				959	printf("ERROR: list has been corrupted\n");
				960	exit(1);
				961	}
				962	}
				963	#endif
				964
				965	releasePattern();
				966	return *this;
				967	}
				968
				969	/**
				970	* Adds the specified multicharacter to this set if it is not already
				971	* present. If this set already contains the multicharacter,
				972	* the call leaves this set unchanged.
				973	* Thus "ch" => {"ch"}
				974	* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
				975	* @param s the source string
				976	* @return the modified set, for chaining
				977	*/
				978	UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
				979	if (s.length() == 0 \|\| isFrozen() \|\| isBogus()) return *this;
				980	int32_t cp = getSingleCP(s);
				981	if (cp < 0) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	982	if (!stringsContains(s)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	983	_add(s);
				984	releasePattern();
				985	}
				986	} else {
				987	add((UChar32)cp);
				988	}
				989	return *this;
				990	}
				991
				992	/**
				993	* Adds the given string, in order, to 'strings'. The given string
				994	* must have been checked by the caller to not be empty and to not
				995	* already be in 'strings'.
				996	*/
				997	void UnicodeSet::_add(const UnicodeString& s) {
				998	if (isFrozen() \|\| isBogus()) {
				999	return;
				1000	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1001	UErrorCode ec = U_ZERO_ERROR;
				1002	if (strings == nullptr && !allocateStrings(ec)) {
				1003	setToBogus();
				1004	return;
				1005	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1006	UnicodeString* t = new UnicodeString(s);
				1007	if (t == NULL) { // Check for memory allocation error.
				1008	setToBogus();
				1009	return;
				1010	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1011	strings->sortedInsert(t, compareUnicodeString, ec);
				1012	if (U_FAILURE(ec)) {
				1013	setToBogus();
				1014	delete t;
				1015	}
				1016	}
				1017
				1018	/**
				1019	* @return a code point IF the string consists of a single one.
				1020	* otherwise returns -1.
				1021	* @param string to test
				1022	*/
				1023	int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
				1024	//if (s.length() < 1) {
				1025	// throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
				1026	//}
				1027	if (s.length() > 2) return -1;
				1028	if (s.length() == 1) return s.charAt(0);
				1029
				1030	// at this point, len = 2
				1031	UChar32 cp = s.char32At(0);
				1032	if (cp > 0xFFFF) { // is surrogate pair
				1033	return cp;
				1034	}
				1035	return -1;
				1036	}
				1037
				1038	/**
				1039	* Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
				1040	* If this set already any particular character, it has no effect on that character.
				1041	* @param the source string
				1042	* @return the modified set, for chaining
				1043	*/
				1044	UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
				1045	UChar32 cp;
				1046	for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
				1047	cp = s.char32At(i);
				1048	add(cp);
				1049	}
				1050	return *this;
				1051	}
				1052
				1053	/**
				1054	* Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
				1055	* If this set already any particular character, it has no effect on that character.
				1056	* @param the source string
				1057	* @return the modified set, for chaining
				1058	*/
				1059	UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
				1060	UnicodeSet set;
				1061	set.addAll(s);
				1062	retainAll(set);
				1063	return *this;
				1064	}
				1065
				1066	/**
				1067	* Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
				1068	* If this set already any particular character, it has no effect on that character.
				1069	* @param the source string
				1070	* @return the modified set, for chaining
				1071	*/
				1072	UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
				1073	UnicodeSet set;
				1074	set.addAll(s);
				1075	complementAll(set);
				1076	return *this;
				1077	}
				1078
				1079	/**
				1080	* Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
				1081	* If this set already any particular character, it has no effect on that character.
				1082	* @param the source string
				1083	* @return the modified set, for chaining
				1084	*/
				1085	UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
				1086	UnicodeSet set;
				1087	set.addAll(s);
				1088	removeAll(set);
				1089	return *this;
				1090	}
				1091
				1092	UnicodeSet& UnicodeSet::removeAllStrings() {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1093	if (!isFrozen() && hasStrings()) {
				1094	strings->removeAllElements();
				1095	releasePattern();
				1096	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1097	return *this;
				1098	}
				1099
				1100
				1101	/**
				1102	* Makes a set from a multicharacter string. Thus "ch" => {"ch"}
				1103	* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
				1104	* @param the source string
				1105	* @return a newly created set containing the given string
				1106	*/
				1107	UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
				1108	UnicodeSet *set = new UnicodeSet();
				1109	if (set != NULL) { // Check for memory allocation error.
				1110	set->add(s);
				1111	}
				1112	return set;
				1113	}
				1114
				1115
				1116	/**
				1117	* Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
				1118	* @param the source string
				1119	* @return a newly created set containing the given characters
				1120	*/
				1121	UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
				1122	UnicodeSet *set = new UnicodeSet();
				1123	if (set != NULL) { // Check for memory allocation error.
				1124	set->addAll(s);
				1125	}
				1126	return set;
				1127	}
				1128
				1129	/**
				1130	* Retain only the elements in this set that are contained in the
				1131	* specified range. If <code>end > start</code> then an empty range is
				1132	* retained, leaving the set empty.
				1133	*
				1134	* @param start first character, inclusive, of range to be retained
				1135	* to this set.
				1136	* @param end last character, inclusive, of range to be retained
				1137	* to this set.
				1138	*/
				1139	UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
				1140	if (pinCodePoint(start) <= pinCodePoint(end)) {
				1141	UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
				1142	retain(range, 2, 0);
				1143	} else {
				1144	clear();
				1145	}
				1146	return *this;
				1147	}
				1148
				1149	UnicodeSet& UnicodeSet::retain(UChar32 c) {
				1150	return retain(c, c);
				1151	}
				1152
				1153	/**
				1154	* Removes the specified range from this set if it is present.
				1155	* The set will not contain the specified range once the call
				1156	* returns. If <code>end > start</code> then an empty range is
				1157	* removed, leaving the set unchanged.
				1158	*
				1159	* @param start first character, inclusive, of range to be removed
				1160	* from this set.
				1161	* @param end last character, inclusive, of range to be removed
				1162	* from this set.
				1163	*/
				1164	UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
				1165	if (pinCodePoint(start) <= pinCodePoint(end)) {
				1166	UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
				1167	retain(range, 2, 2);
				1168	}
				1169	return *this;
				1170	}
				1171
				1172	/**
				1173	* Removes the specified character from this set if it is present.
				1174	* The set will not contain the specified range once the call
				1175	* returns.
				1176	*/
				1177	UnicodeSet& UnicodeSet::remove(UChar32 c) {
				1178	return remove(c, c);
				1179	}
				1180
				1181	/**
				1182	* Removes the specified string from this set if it is present.
				1183	* The set will not contain the specified character once the call
				1184	* returns.
				1185	* @param the source string
				1186	* @return the modified set, for chaining
				1187	*/
				1188	UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
				1189	if (s.length() == 0 \|\| isFrozen() \|\| isBogus()) return *this;
				1190	int32_t cp = getSingleCP(s);
				1191	if (cp < 0) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1192	if (strings != nullptr && strings->removeElement((void*) &s)) {
				1193	releasePattern();
				1194	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1195	} else {
				1196	remove((UChar32)cp, (UChar32)cp);
				1197	}
				1198	return *this;
				1199	}
				1200
				1201	/**
				1202	* Complements the specified range in this set. Any character in
				1203	* the range will be removed if it is in this set, or will be
				1204	* added if it is not in this set. If <code>end > start</code>
				1205	* then an empty range is xor'ed, leaving the set unchanged.
				1206	*
				1207	* @param start first character, inclusive, of range to be removed
				1208	* from this set.
				1209	* @param end last character, inclusive, of range to be removed
				1210	* from this set.
				1211	*/
				1212	UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
				1213	if (isFrozen() \|\| isBogus()) {
				1214	return *this;
				1215	}
				1216	if (pinCodePoint(start) <= pinCodePoint(end)) {
				1217	UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
				1218	exclusiveOr(range, 2, 0);
				1219	}
				1220	releasePattern();
				1221	return *this;
				1222	}
				1223
				1224	UnicodeSet& UnicodeSet::complement(UChar32 c) {
				1225	return complement(c, c);
				1226	}
				1227
				1228	/**
				1229	* This is equivalent to
				1230	* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
				1231	*/
				1232	UnicodeSet& UnicodeSet::complement(void) {
				1233	if (isFrozen() \|\| isBogus()) {
				1234	return *this;
				1235	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1236	if (list[0] == UNICODESET_LOW) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1237	uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1238	--len;
				1239	} else {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1240	if (!ensureCapacity(len+1)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1241	return *this;
				1242	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1243	uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));
				1244	list[0] = UNICODESET_LOW;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1245	++len;
				1246	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1247	releasePattern();
				1248	return *this;
				1249	}
				1250
				1251	/**
				1252	* Complement the specified string in this set.
				1253	* The set will not contain the specified string once the call
				1254	* returns.
				1255	* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
				1256	* @param s the string to complement
				1257	* @return this object, for chaining
				1258	*/
				1259	UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
				1260	if (s.length() == 0 \|\| isFrozen() \|\| isBogus()) return *this;
				1261	int32_t cp = getSingleCP(s);
				1262	if (cp < 0) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1263	if (stringsContains(s)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1264	strings->removeElement((void*) &s);
				1265	} else {
				1266	_add(s);
				1267	}
				1268	releasePattern();
				1269	} else {
				1270	complement((UChar32)cp, (UChar32)cp);
				1271	}
				1272	return *this;
				1273	}
				1274
				1275	/**
				1276	* Adds all of the elements in the specified set to this set if
				1277	* they're not already present. This operation effectively
				1278	* modifies this set so that its value is the <i>union</i> of the two
				1279	* sets. The behavior of this operation is unspecified if the specified
				1280	* collection is modified while the operation is in progress.
				1281	*
				1282	* @param c set whose elements are to be added to this set.
				1283	* @see #add(char, char)
				1284	*/
				1285	UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
				1286	if ( c.len>0 && c.list!=NULL ) {
				1287	add(c.list, c.len, 0);
				1288	}
				1289
				1290	// Add strings in order
				1291	if ( c.strings!=NULL ) {
				1292	for (int32_t i=0; i<c.strings->size(); ++i) {
				1293	const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1294	if (!stringsContains(*s)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1295	_add(*s);
				1296	}
				1297	}
				1298	}
				1299	return *this;
				1300	}
				1301
				1302	/**
				1303	* Retains only the elements in this set that are contained in the
				1304	* specified set. In other words, removes from this set all of
				1305	* its elements that are not contained in the specified set. This
				1306	* operation effectively modifies this set so that its value is
				1307	* the <i>intersection</i> of the two sets.
				1308	*
				1309	* @param c set that defines which elements this set will retain.
				1310	*/
				1311	UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
				1312	if (isFrozen() \|\| isBogus()) {
				1313	return *this;
				1314	}
				1315	retain(c.list, c.len, 0);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1316	if (hasStrings()) {
				1317	if (!c.hasStrings()) {
				1318	strings->removeAllElements();
				1319	} else {
				1320	strings->retainAll(*c.strings);
				1321	}
				1322	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1323	return *this;
				1324	}
				1325
				1326	/**
				1327	* Removes from this set all of its elements that are contained in the
				1328	* specified set. This operation effectively modifies this
				1329	* set so that its value is the <i>asymmetric set difference</i> of
				1330	* the two sets.
				1331	*
				1332	* @param c set that defines which elements will be removed from
				1333	* this set.
				1334	*/
				1335	UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
				1336	if (isFrozen() \|\| isBogus()) {
				1337	return *this;
				1338	}
				1339	retain(c.list, c.len, 2);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1340	if (hasStrings() && c.hasStrings()) {
				1341	strings->removeAll(*c.strings);
				1342	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1343	return *this;
				1344	}
				1345
				1346	/**
				1347	* Complements in this set all elements contained in the specified
				1348	* set. Any character in the other set will be removed if it is
				1349	* in this set, or will be added if it is not in this set.
				1350	*
				1351	* @param c set that defines which elements will be xor'ed from
				1352	* this set.
				1353	*/
				1354	UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
				1355	if (isFrozen() \|\| isBogus()) {
				1356	return *this;
				1357	}
				1358	exclusiveOr(c.list, c.len, 0);
				1359
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1360	if (c.strings != nullptr) {
				1361	for (int32_t i=0; i<c.strings->size(); ++i) {
				1362	void* e = c.strings->elementAt(i);
				1363	if (strings == nullptr \|\| !strings->removeElement(e)) {
				1364	_add((const UnicodeString)e);
				1365	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1366	}
				1367	}
				1368	return *this;
				1369	}
				1370
				1371	/**
				1372	* Removes all of the elements from this set. This set will be
				1373	* empty after this call returns.
				1374	*/
				1375	UnicodeSet& UnicodeSet::clear(void) {
				1376	if (isFrozen()) {
				1377	return *this;
				1378	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1379	list[0] = UNICODESET_HIGH;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1380	len = 1;
				1381	releasePattern();
				1382	if (strings != NULL) {
				1383	strings->removeAllElements();
				1384	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1385	// Remove bogus
				1386	fFlags = 0;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1387	return *this;
				1388	}
				1389
				1390	/**
				1391	* Iteration method that returns the number of ranges contained in
				1392	* this set.
				1393	* @see #getRangeStart
				1394	* @see #getRangeEnd
				1395	*/
				1396	int32_t UnicodeSet::getRangeCount() const {
				1397	return len/2;
				1398	}
				1399
				1400	/**
				1401	* Iteration method that returns the first character in the
				1402	* specified range of this set.
				1403	* @see #getRangeCount
				1404	* @see #getRangeEnd
				1405	*/
				1406	UChar32 UnicodeSet::getRangeStart(int32_t index) const {
				1407	return list[index*2];
				1408	}
				1409
				1410	/**
				1411	* Iteration method that returns the last character in the
				1412	* specified range of this set.
				1413	* @see #getRangeStart
				1414	* @see #getRangeEnd
				1415	*/
				1416	UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
				1417	return list[index*2 + 1] - 1;
				1418	}
				1419
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1420	const UnicodeString* UnicodeSet::getString(int32_t index) const {
				1421	return (const UnicodeString*) strings->elementAt(index);
				1422	}
				1423
				1424	/**
				1425	* Reallocate this objects internal structures to take up the least
				1426	* possible space, without changing this object's value.
				1427	*/
				1428	UnicodeSet& UnicodeSet::compact() {
				1429	if (isFrozen() \|\| isBogus()) {
				1430	return *this;
				1431	}
				1432	// Delete buffer first to defragment memory less.
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1433	if (buffer != stackList) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1434	uprv_free(buffer);
				1435	buffer = NULL;
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1436	bufferCapacity = 0;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1437	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1438	if (list == stackList) {
				1439	// pass
				1440	} else if (len <= INITIAL_CAPACITY) {
				1441	uprv_memcpy(stackList, list, len * sizeof(UChar32));
				1442	uprv_free(list);
				1443	list = stackList;
				1444	capacity = INITIAL_CAPACITY;
				1445	} else if ((len + 7) < capacity) {
				1446	// If we have more than a little unused capacity, shrink it to len.
				1447	UChar32* temp = (UChar32) uprv_realloc(list, sizeof(UChar32) len);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1448	if (temp) {
				1449	list = temp;
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1450	capacity = len;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1451	}
				1452	// else what the heck happened?! We allocated less memory!
				1453	// Oh well. We'll keep our original array.
				1454	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1455	if (strings != nullptr && strings->isEmpty()) {
				1456	delete strings;
				1457	strings = nullptr;
				1458	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1459	return *this;
				1460	}
				1461
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1462	#ifdef DEBUG_SERIALIZE
				1463	#include <stdio.h>
				1464	#endif
				1465
				1466	/**
				1467	* Deserialize constructor.
				1468	*/
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1469	UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
				1470	UErrorCode &ec) {
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1471
				1472	if(U_FAILURE(ec)) {
				1473	setToBogus();
				1474	return;
				1475	}
				1476
				1477	if( (serialization != kSerialized)
				1478	\|\| (data==NULL)
				1479	\|\| (dataLen < 1)) {
				1480	ec = U_ILLEGAL_ARGUMENT_ERROR;
				1481	setToBogus();
				1482	return;
				1483	}
				1484
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1485	// bmp?
				1486	int32_t headerSize = ((data[0]&0x8000)) ?2:1;
				1487	int32_t bmpLength = (headerSize==1)?data[0]:data[1];
				1488
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1489	int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1490	#ifdef DEBUG_SERIALIZE
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1491	printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]);
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1492	#endif
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1493	if(!ensureCapacity(newLength + 1)) { // +1 for HIGH
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1494	return;
				1495	}
				1496	// copy bmp
				1497	int32_t i;
				1498	for(i = 0; i< bmpLength;i++) {
				1499	list[i] = data[i+headerSize];
				1500	#ifdef DEBUG_SERIALIZE
				1501	printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
				1502	#endif
				1503	}
				1504	// copy smp
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1505	for(i=bmpLength;i<newLength;i++) {
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1506	list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
				1507	((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
				1508	#ifdef DEBUG_SERIALIZE
				1509	printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
				1510	#endif
				1511	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1512	U_ASSERT(i == newLength);
				1513	if (i == 0 \|\| list[i - 1] != UNICODESET_HIGH) {
				1514	list[i++] = UNICODESET_HIGH;
				1515	}
				1516	len = i;
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1517	}
				1518
				1519
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1520	int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
				1521	int32_t bmpLength, length, destLength;
				1522
				1523	if (U_FAILURE(ec)) {
				1524	return 0;
				1525	}
				1526
				1527	if (destCapacity<0 \|\| (destCapacity>0 && dest==NULL)) {
				1528	ec=U_ILLEGAL_ARGUMENT_ERROR;
				1529	return 0;
				1530	}
				1531
				1532	/* count necessary 16-bit units */
				1533	length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
				1534	// assert(length>=0);
				1535	if (length==0) {
				1536	/* empty set */
				1537	if (destCapacity>0) {
				1538	*dest=0;
				1539	} else {
				1540	ec=U_BUFFER_OVERFLOW_ERROR;
				1541	}
				1542	return 1;
				1543	}
				1544	/* now length>0 */
				1545
				1546	if (this->list[length-1]<=0xffff) {
				1547	/* all BMP */
				1548	bmpLength=length;
				1549	} else if (this->list[0]>=0x10000) {
				1550	/* all supplementary */
				1551	bmpLength=0;
				1552	length*=2;
				1553	} else {
				1554	/* some BMP, some supplementary */
				1555	for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
				1556	length=bmpLength+2*(length-bmpLength);
				1557	}
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1558	#ifdef DEBUG_SERIALIZE
				1559	printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
				1560	#endif
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1561	/* length: number of 16-bit array units */
				1562	if (length>0x7fff) {
				1563	/* there are only 15 bits for the length in the first serialized word */
				1564	ec=U_INDEX_OUTOFBOUNDS_ERROR;
				1565	return 0;
				1566	}
				1567
				1568	/*
				1569	* total serialized length:
				1570	* number of 16-bit array units (length) +
				1571	* 1 length unit (always) +
				1572	* 1 bmpLength unit (if there are supplementary values)
				1573	*/
				1574	destLength=length+((length>bmpLength)?2:1);
				1575	if (destLength<=destCapacity) {
				1576	const UChar32 *p;
				1577	int32_t i;
				1578
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1579	#ifdef DEBUG_SERIALIZE
				1580	printf("writeHdr\n");
				1581	#endif
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1582	*dest=(uint16_t)length;
				1583	if (length>bmpLength) {
				1584	*dest\|=0x8000;
				1585	*++dest=(uint16_t)bmpLength;
				1586	}
				1587	++dest;
				1588
				1589	/* write the BMP part of the array */
				1590	p=this->list;
				1591	for (i=0; i<bmpLength; ++i) {
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1592	#ifdef DEBUG_SERIALIZE
				1593	printf("writebmp: %x\n", (int)*p);
				1594	#endif
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1595	dest++=(uint16_t)p++;
				1596	}
				1597
				1598	/* write the supplementary part of the array */
				1599	for (; i<length; i+=2) {
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1600	#ifdef DEBUG_SERIALIZE
				1601	printf("write32: %x\n", (int)*p);
				1602	#endif
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1603	dest++=(uint16_t)(p>>16);
				1604	dest++=(uint16_t)p++;
				1605	}
				1606	} else {
				1607	ec=U_BUFFER_OVERFLOW_ERROR;
				1608	}
				1609	return destLength;
				1610	}
				1611
				1612	//----------------------------------------------------------------
				1613	// Implementation: Utility methods
				1614	//----------------------------------------------------------------
				1615
				1616	/**
				1617	* Allocate our strings vector and return TRUE if successful.
				1618	*/
				1619	UBool UnicodeSet::allocateStrings(UErrorCode &status) {
				1620	if (U_FAILURE(status)) {
				1621	return FALSE;
				1622	}
				1623	strings = new UVector(uprv_deleteUObject,
				1624	uhash_compareUnicodeString, 1, status);
				1625	if (strings == NULL) { // Check for memory allocation error.
				1626	status = U_MEMORY_ALLOCATION_ERROR;
				1627	return FALSE;
				1628	}
				1629	if (U_FAILURE(status)) {
				1630	delete strings;
				1631	strings = NULL;
				1632	return FALSE;
				1633	}
				1634	return TRUE;
				1635	}
				1636
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1637	int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
				1638	// Grow exponentially to reduce the frequency of allocations.
				1639	if (minCapacity < INITIAL_CAPACITY) {
				1640	return minCapacity + INITIAL_CAPACITY;
				1641	} else if (minCapacity <= 2500) {
				1642	return 5 * minCapacity;
				1643	} else {
				1644	int32_t newCapacity = 2 * minCapacity;
				1645	if (newCapacity > MAX_LENGTH) {
				1646	newCapacity = MAX_LENGTH;
				1647	}
				1648	return newCapacity;
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	1649	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1650	}
				1651
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1652	bool UnicodeSet::ensureCapacity(int32_t newLen) {
				1653	if (newLen > MAX_LENGTH) {
				1654	newLen = MAX_LENGTH;
				1655	}
				1656	if (newLen <= capacity) {
				1657	return true;
				1658	}
				1659	int32_t newCapacity = nextCapacity(newLen);
				1660	UChar32* temp = (UChar32) uprv_malloc(newCapacity sizeof(UChar32));
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1661	if (temp == NULL) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1662	setToBogus(); // set the object to bogus state if an OOM failure occurred.
				1663	return false;
				1664	}
				1665	// Copy only the actual contents.
				1666	uprv_memcpy(temp, list, len * sizeof(UChar32));
				1667	if (list != stackList) {
				1668	uprv_free(list);
				1669	}
				1670	list = temp;
				1671	capacity = newCapacity;
				1672	return true;
				1673	}
				1674
				1675	bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
				1676	if (newLen > MAX_LENGTH) {
				1677	newLen = MAX_LENGTH;
				1678	}
				1679	if (newLen <= bufferCapacity) {
				1680	return true;
				1681	}
				1682	int32_t newCapacity = nextCapacity(newLen);
				1683	UChar32* temp = (UChar32) uprv_malloc(newCapacity sizeof(UChar32));
				1684	if (temp == NULL) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1685	setToBogus();
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1686	return false;
				1687	}
				1688	// The buffer has no contents to be copied.
				1689	// It is always filled from scratch after this call.
				1690	if (buffer != stackList) {
				1691	uprv_free(buffer);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1692	}
				1693	buffer = temp;
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1694	bufferCapacity = newCapacity;
				1695	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1696	}
				1697
				1698	/**
				1699	* Swap list and buffer.
				1700	*/
				1701	void UnicodeSet::swapBuffers(void) {
				1702	// swap list and buffer
				1703	UChar32* temp = list;
				1704	list = buffer;
				1705	buffer = temp;
				1706
				1707	int32_t c = capacity;
				1708	capacity = bufferCapacity;
				1709	bufferCapacity = c;
				1710	}
				1711
				1712	void UnicodeSet::setToBogus() {
				1713	clear(); // Remove everything in the set.
				1714	fFlags = kIsBogus;
				1715	}
				1716
				1717	//----------------------------------------------------------------
				1718	// Implementation: Fundamental operators
				1719	//----------------------------------------------------------------
				1720
				1721	static inline UChar32 max(UChar32 a, UChar32 b) {
				1722	return (a > b) ? a : b;
				1723	}
				1724
				1725	// polarity = 0, 3 is normal: x xor y
				1726	// polarity = 1, 2: x xor ~y == x === y
				1727
				1728	void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
				1729	if (isFrozen() \|\| isBogus()) {
				1730	return;
				1731	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1732	if (!ensureBufferCapacity(len + otherLen)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1733	return;
				1734	}
				1735
				1736	int32_t i = 0, j = 0, k = 0;
				1737	UChar32 a = list[i++];
				1738	UChar32 b;
				1739	if (polarity == 1 \|\| polarity == 2) {
				1740	b = UNICODESET_LOW;
				1741	if (other[j] == UNICODESET_LOW) { // skip base if already LOW
				1742	++j;
				1743	b = other[j];
				1744	}
				1745	} else {
				1746	b = other[j++];
				1747	}
				1748	// simplest of all the routines
				1749	// sort the values, discarding identicals!
				1750	for (;;) {
				1751	if (a < b) {
				1752	buffer[k++] = a;
				1753	a = list[i++];
				1754	} else if (b < a) {
				1755	buffer[k++] = b;
				1756	b = other[j++];
				1757	} else if (a != UNICODESET_HIGH) { // at this point, a == b
				1758	// discard both values!
				1759	a = list[i++];
				1760	b = other[j++];
				1761	} else { // DONE!
				1762	buffer[k++] = UNICODESET_HIGH;
				1763	len = k;
				1764	break;
				1765	}
				1766	}
				1767	swapBuffers();
				1768	releasePattern();
				1769	}
				1770
				1771	// polarity = 0 is normal: x union y
				1772	// polarity = 2: x union ~y
				1773	// polarity = 1: ~x union y
				1774	// polarity = 3: ~x union ~y
				1775
				1776	void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
				1777	if (isFrozen() \|\| isBogus() \|\| other==NULL) {
				1778	return;
				1779	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1780	if (!ensureBufferCapacity(len + otherLen)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1781	return;
				1782	}
				1783
				1784	int32_t i = 0, j = 0, k = 0;
				1785	UChar32 a = list[i++];
				1786	UChar32 b = other[j++];
				1787	// change from xor is that we have to check overlapping pairs
				1788	// polarity bit 1 means a is second, bit 2 means b is.
				1789	for (;;) {
				1790	switch (polarity) {
				1791	case 0: // both first; take lower if unequal
				1792	if (a < b) { // take a
				1793	// Back up over overlapping ranges in buffer[]
				1794	if (k > 0 && a <= buffer[k-1]) {
				1795	// Pick latter end value in buffer[] vs. list[]
				1796	a = max(list[i], buffer[--k]);
				1797	} else {
				1798	// No overlap
				1799	buffer[k++] = a;
				1800	a = list[i];
				1801	}
				1802	i++; // Common if/else code factored out
				1803	polarity ^= 1;
				1804	} else if (b < a) { // take b
				1805	if (k > 0 && b <= buffer[k-1]) {
				1806	b = max(other[j], buffer[--k]);
				1807	} else {
				1808	buffer[k++] = b;
				1809	b = other[j];
				1810	}
				1811	j++;
				1812	polarity ^= 2;
				1813	} else { // a == b, take a, drop b
				1814	if (a == UNICODESET_HIGH) goto loop_end;
				1815	// This is symmetrical; it doesn't matter if
				1816	// we backtrack with a or b. - liu
				1817	if (k > 0 && a <= buffer[k-1]) {
				1818	a = max(list[i], buffer[--k]);
				1819	} else {
				1820	// No overlap
				1821	buffer[k++] = a;
				1822	a = list[i];
				1823	}
				1824	i++;
				1825	polarity ^= 1;
				1826	b = other[j++];
				1827	polarity ^= 2;
				1828	}
				1829	break;
				1830	case 3: // both second; take higher if unequal, and drop other
				1831	if (b <= a) { // take a
				1832	if (a == UNICODESET_HIGH) goto loop_end;
				1833	buffer[k++] = a;
				1834	} else { // take b
				1835	if (b == UNICODESET_HIGH) goto loop_end;
				1836	buffer[k++] = b;
				1837	}
				1838	a = list[i++];
				1839	polarity ^= 1; // factored common code
				1840	b = other[j++];
				1841	polarity ^= 2;
				1842	break;
				1843	case 1: // a second, b first; if b < a, overlap
				1844	if (a < b) { // no overlap, take a
				1845	buffer[k++] = a; a = list[i++]; polarity ^= 1;
				1846	} else if (b < a) { // OVERLAP, drop b
				1847	b = other[j++];
				1848	polarity ^= 2;
				1849	} else { // a == b, drop both!
				1850	if (a == UNICODESET_HIGH) goto loop_end;
				1851	a = list[i++];
				1852	polarity ^= 1;
				1853	b = other[j++];
				1854	polarity ^= 2;
				1855	}
				1856	break;
				1857	case 2: // a first, b second; if a < b, overlap
				1858	if (b < a) { // no overlap, take b
				1859	buffer[k++] = b;
				1860	b = other[j++];
				1861	polarity ^= 2;
				1862	} else if (a < b) { // OVERLAP, drop a
				1863	a = list[i++];
				1864	polarity ^= 1;
				1865	} else { // a == b, drop both!
				1866	if (a == UNICODESET_HIGH) goto loop_end;
				1867	a = list[i++];
				1868	polarity ^= 1;
				1869	b = other[j++];
				1870	polarity ^= 2;
				1871	}
				1872	break;
				1873	}
				1874	}
				1875	loop_end:
				1876	buffer[k++] = UNICODESET_HIGH; // terminate
				1877	len = k;
				1878	swapBuffers();
				1879	releasePattern();
				1880	}
				1881
				1882	// polarity = 0 is normal: x intersect y
				1883	// polarity = 2: x intersect ~y == set-minus
				1884	// polarity = 1: ~x intersect y
				1885	// polarity = 3: ~x intersect ~y
				1886
				1887	void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
				1888	if (isFrozen() \|\| isBogus()) {
				1889	return;
				1890	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	1891	if (!ensureBufferCapacity(len + otherLen)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1892	return;
				1893	}
				1894
				1895	int32_t i = 0, j = 0, k = 0;
				1896	UChar32 a = list[i++];
				1897	UChar32 b = other[j++];
				1898	// change from xor is that we have to check overlapping pairs
				1899	// polarity bit 1 means a is second, bit 2 means b is.
				1900	for (;;) {
				1901	switch (polarity) {
				1902	case 0: // both first; drop the smaller
				1903	if (a < b) { // drop a
				1904	a = list[i++];
				1905	polarity ^= 1;
				1906	} else if (b < a) { // drop b
				1907	b = other[j++];
				1908	polarity ^= 2;
				1909	} else { // a == b, take one, drop other
				1910	if (a == UNICODESET_HIGH) goto loop_end;
				1911	buffer[k++] = a;
				1912	a = list[i++];
				1913	polarity ^= 1;
				1914	b = other[j++];
				1915	polarity ^= 2;
				1916	}
				1917	break;
				1918	case 3: // both second; take lower if unequal
				1919	if (a < b) { // take a
				1920	buffer[k++] = a;
				1921	a = list[i++];
				1922	polarity ^= 1;
				1923	} else if (b < a) { // take b
				1924	buffer[k++] = b;
				1925	b = other[j++];
				1926	polarity ^= 2;
				1927	} else { // a == b, take one, drop other
				1928	if (a == UNICODESET_HIGH) goto loop_end;
				1929	buffer[k++] = a;
				1930	a = list[i++];
				1931	polarity ^= 1;
				1932	b = other[j++];
				1933	polarity ^= 2;
				1934	}
				1935	break;
				1936	case 1: // a second, b first;
				1937	if (a < b) { // NO OVERLAP, drop a
				1938	a = list[i++];
				1939	polarity ^= 1;
				1940	} else if (b < a) { // OVERLAP, take b
				1941	buffer[k++] = b;
				1942	b = other[j++];
				1943	polarity ^= 2;
				1944	} else { // a == b, drop both!
				1945	if (a == UNICODESET_HIGH) goto loop_end;
				1946	a = list[i++];
				1947	polarity ^= 1;
				1948	b = other[j++];
				1949	polarity ^= 2;
				1950	}
				1951	break;
				1952	case 2: // a first, b second; if a < b, overlap
				1953	if (b < a) { // no overlap, drop b
				1954	b = other[j++];
				1955	polarity ^= 2;
				1956	} else if (a < b) { // OVERLAP, take a
				1957	buffer[k++] = a;
				1958	a = list[i++];
				1959	polarity ^= 1;
				1960	} else { // a == b, drop both!
				1961	if (a == UNICODESET_HIGH) goto loop_end;
				1962	a = list[i++];
				1963	polarity ^= 1;
				1964	b = other[j++];
				1965	polarity ^= 2;
				1966	}
				1967	break;
				1968	}
				1969	}
				1970	loop_end:
				1971	buffer[k++] = UNICODESET_HIGH; // terminate
				1972	len = k;
				1973	swapBuffers();
				1974	releasePattern();
				1975	}
				1976
				1977	/**
				1978	* Append the <code>toPattern()</code> representation of a
				1979	* string to the given <code>StringBuffer</code>.
				1980	*/
				1981	void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
				1982	escapeUnprintable) {
				1983	UChar32 cp;
				1984	for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
				1985	_appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
				1986	}
				1987	}
				1988
				1989	/**
				1990	* Append the <code>toPattern()</code> representation of a
				1991	* character to the given <code>StringBuffer</code>.
				1992	*/
				1993	void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
				1994	escapeUnprintable) {
				1995	if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
				1996	// Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
				1997	// unprintable
				1998	if (ICU_Utility::escapeUnprintable(buf, c)) {
				1999	return;
				2000	}
				2001	}
				2002	// Okay to let ':' pass through
				2003	switch (c) {
				2004	case SET_OPEN:
				2005	case SET_CLOSE:
				2006	case HYPHEN:
				2007	case COMPLEMENT:
				2008	case INTERSECTION:
				2009	case BACKSLASH:
				2010	case OPEN_BRACE:
				2011	case CLOSE_BRACE:
				2012	case COLON:
				2013	case SymbolTable::SYMBOL_REF:
				2014	buf.append(BACKSLASH);
				2015	break;
				2016	default:
				2017	// Escape whitespace
				2018	if (PatternProps::isWhiteSpace(c)) {
				2019	buf.append(BACKSLASH);
				2020	}
				2021	break;
				2022	}
				2023	buf.append(c);
				2024	}
				2025
				2026	/**
				2027	* Append a string representation of this set to result. This will be
				2028	* a cleaned version of the string passed to applyPattern(), if there
				2029	* is one. Otherwise it will be generated.
				2030	*/
				2031	UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
				2032	UBool escapeUnprintable) const
				2033	{
				2034	if (pat != NULL) {
				2035	int32_t i;
				2036	int32_t backslashCount = 0;
				2037	for (i=0; i<patLen; ) {
				2038	UChar32 c;
				2039	U16_NEXT(pat, i, patLen, c);
				2040	if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
				2041	// If the unprintable character is preceded by an odd
				2042	// number of backslashes, then it has been escaped.
				2043	// Before unescaping it, we delete the final
				2044	// backslash.
				2045	if ((backslashCount % 2) == 1) {
				2046	result.truncate(result.length() - 1);
				2047	}
				2048	ICU_Utility::escapeUnprintable(result, c);
				2049	backslashCount = 0;
				2050	} else {
				2051	result.append(c);
				2052	if (c == BACKSLASH) {
				2053	++backslashCount;
				2054	} else {
				2055	backslashCount = 0;
				2056	}
				2057	}
				2058	}
				2059	return result;
				2060	}
				2061
				2062	return _generatePattern(result, escapeUnprintable);
				2063	}
				2064
				2065	/**
				2066	* Returns a string representation of this set. If the result of
				2067	* calling this function is passed to a UnicodeSet constructor, it
				2068	* will produce another set that is equal to this one.
				2069	*/
				2070	UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
				2071	UBool escapeUnprintable) const
				2072	{
				2073	result.truncate(0);
				2074	return _toPattern(result, escapeUnprintable);
				2075	}
				2076
				2077	/**
				2078	* Generate and append a string representation of this set to result.
				2079	* This does not use this.pat, the cleaned up copy of the string
				2080	* passed to applyPattern().
				2081	*/
				2082	UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
				2083	UBool escapeUnprintable) const
				2084	{
				2085	result.append(SET_OPEN);
				2086
				2087	// // Check against the predefined categories. We implicitly build
				2088	// // up ALL category sets the first time toPattern() is called.
				2089	// for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
				2090	// if (*this == getCategorySet(cat)) {
				2091	// result.append(COLON);
				2092	// result.append(CATEGORY_NAMES, cat*2, 2);
				2093	// return result.append(CATEGORY_CLOSE);
				2094	// }
				2095	// }
				2096
				2097	int32_t count = getRangeCount();
				2098
				2099	// If the set contains at least 2 intervals and includes both
				2100	// MIN_VALUE and MAX_VALUE, then the inverse representation will
				2101	// be more economical.
				2102	if (count > 1 &&
				2103	getRangeStart(0) == MIN_VALUE &&
				2104	getRangeEnd(count-1) == MAX_VALUE) {
				2105
				2106	// Emit the inverse
				2107	result.append(COMPLEMENT);
				2108
				2109	for (int32_t i = 1; i < count; ++i) {
				2110	UChar32 start = getRangeEnd(i-1)+1;
				2111	UChar32 end = getRangeStart(i)-1;
				2112	_appendToPat(result, start, escapeUnprintable);
				2113	if (start != end) {
				2114	if ((start+1) != end) {
				2115	result.append(HYPHEN);
				2116	}
				2117	_appendToPat(result, end, escapeUnprintable);
				2118	}
				2119	}
				2120	}
				2121
				2122	// Default; emit the ranges as pairs
				2123	else {
				2124	for (int32_t i = 0; i < count; ++i) {
				2125	UChar32 start = getRangeStart(i);
				2126	UChar32 end = getRangeEnd(i);
				2127	_appendToPat(result, start, escapeUnprintable);
				2128	if (start != end) {
				2129	if ((start+1) != end) {
				2130	result.append(HYPHEN);
				2131	}
				2132	_appendToPat(result, end, escapeUnprintable);
				2133	}
				2134	}
				2135	}
				2136
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	2137	if (strings != nullptr) {
				2138	for (int32_t i = 0; i<strings->size(); ++i) {
				2139	result.append(OPEN_BRACE);
				2140	_appendToPat(result,
				2141	(const UnicodeString) strings->elementAt(i),
				2142	escapeUnprintable);
				2143	result.append(CLOSE_BRACE);
				2144	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2145	}
				2146	return result.append(SET_CLOSE);
				2147	}
				2148
				2149	/**
				2150	* Release existing cached pattern
				2151	*/
				2152	void UnicodeSet::releasePattern() {
				2153	if (pat) {
				2154	uprv_free(pat);
				2155	pat = NULL;
				2156	patLen = 0;
				2157	}
				2158	}
				2159
				2160	/**
				2161	* Set the new pattern to cache.
				2162	*/
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	2163	void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2164	releasePattern();
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2165	pat = (UChar )uprv_malloc((newPatLen + 1) sizeof(UChar));
				2166	if (pat) {
				2167	patLen = newPatLen;
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	2168	u_memcpy(pat, newPat, patLen);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2169	pat[patLen] = 0;
				2170	}
				2171	// else we don't care if malloc failed. This was just a nice cache.
				2172	// We can regenerate an equivalent pattern later when requested.
				2173	}
				2174
				2175	UnicodeFunctor *UnicodeSet::freeze() {
				2176	if(!isFrozen() && !isBogus()) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	2177	compact();
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2178
				2179	// Optimize contains() and span() and similar functions.
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	2180	if (hasStrings()) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2181	stringSpan = new UnicodeSetStringSpan(this, strings, UnicodeSetStringSpan::ALL);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	2182	if (stringSpan == nullptr) {
				2183	setToBogus();
				2184	return this;
				2185	} else if (!stringSpan->needsStringSpanUTF16()) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2186	// All strings are irrelevant for span() etc. because
				2187	// all of each string's code points are contained in this set.
				2188	// Do not check needsStringSpanUTF8() because UTF-8 has at most as
				2189	// many relevant strings as UTF-16.
				2190	// (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
				2191	delete stringSpan;
				2192	stringSpan = NULL;
				2193	}
				2194	}
				2195	if (stringSpan == NULL) {
				2196	// No span-relevant strings: Optimize for code point spans.
				2197	bmpSet=new BMPSet(list, len);
				2198	if (bmpSet == NULL) { // Check for memory allocation error.
				2199	setToBogus();
				2200	}
				2201	}
				2202	}
				2203	return this;
				2204	}
				2205
				2206	int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
				2207	if(length>0 && bmpSet!=NULL) {
				2208	return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
				2209	}
				2210	if(length<0) {
				2211	length=u_strlen(s);
				2212	}
				2213	if(length==0) {
				2214	return 0;
				2215	}
				2216	if(stringSpan!=NULL) {
				2217	return stringSpan->span(s, length, spanCondition);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	2218	} else if(hasStrings()) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2219	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
				2220	UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
				2221	UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
				2222	UnicodeSetStringSpan strSpan(this, strings, which);
				2223	if(strSpan.needsStringSpanUTF16()) {
				2224	return strSpan.span(s, length, spanCondition);
				2225	}
				2226	}
				2227
				2228	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
				2229	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
				2230	}
				2231
				2232	UChar32 c;
				2233	int32_t start=0, prev=0;
				2234	do {
				2235	U16_NEXT(s, start, length, c);
				2236	if(spanCondition!=contains(c)) {
				2237	break;
				2238	}
				2239	} while((prev=start)<length);
				2240	return prev;
				2241	}
				2242
				2243	int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
				2244	if(length>0 && bmpSet!=NULL) {
				2245	return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
				2246	}
				2247	if(length<0) {
				2248	length=u_strlen(s);
				2249	}
				2250	if(length==0) {
				2251	return 0;
				2252	}
				2253	if(stringSpan!=NULL) {
				2254	return stringSpan->spanBack(s, length, spanCondition);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	2255	} else if(hasStrings()) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2256	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
				2257	UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
				2258	UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
				2259	UnicodeSetStringSpan strSpan(this, strings, which);
				2260	if(strSpan.needsStringSpanUTF16()) {
				2261	return strSpan.spanBack(s, length, spanCondition);
				2262	}
				2263	}
				2264
				2265	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
				2266	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
				2267	}
				2268
				2269	UChar32 c;
				2270	int32_t prev=length;
				2271	do {
				2272	U16_PREV(s, 0, length, c);
				2273	if(spanCondition!=contains(c)) {
				2274	break;
				2275	}
				2276	} while((prev=length)>0);
				2277	return prev;
				2278	}
				2279
				2280	int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
				2281	if(length>0 && bmpSet!=NULL) {
				2282	const uint8_t s0=(const uint8_t )s;
				2283	return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
				2284	}
				2285	if(length<0) {
				2286	length=(int32_t)uprv_strlen(s);
				2287	}
				2288	if(length==0) {
				2289	return 0;
				2290	}
				2291	if(stringSpan!=NULL) {
				2292	return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	2293	} else if(hasStrings()) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2294	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
				2295	UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
				2296	UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
				2297	UnicodeSetStringSpan strSpan(this, strings, which);
				2298	if(strSpan.needsStringSpanUTF8()) {
				2299	return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
				2300	}
				2301	}
				2302
				2303	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
				2304	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
				2305	}
				2306
				2307	UChar32 c;
				2308	int32_t start=0, prev=0;
				2309	do {
				2310	U8_NEXT_OR_FFFD(s, start, length, c);
				2311	if(spanCondition!=contains(c)) {
				2312	break;
				2313	}
				2314	} while((prev=start)<length);
				2315	return prev;
				2316	}
				2317
				2318	int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
				2319	if(length>0 && bmpSet!=NULL) {
				2320	const uint8_t s0=(const uint8_t )s;
				2321	return bmpSet->spanBackUTF8(s0, length, spanCondition);
				2322	}
				2323	if(length<0) {
				2324	length=(int32_t)uprv_strlen(s);
				2325	}
				2326	if(length==0) {
				2327	return 0;
				2328	}
				2329	if(stringSpan!=NULL) {
				2330	return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame^]	2331	} else if(hasStrings()) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2332	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
				2333	UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
				2334	UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
				2335	UnicodeSetStringSpan strSpan(this, strings, which);
				2336	if(strSpan.needsStringSpanUTF8()) {
				2337	return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
				2338	}
				2339	}
				2340
				2341	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
				2342	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
				2343	}
				2344
				2345	UChar32 c;
				2346	int32_t prev=length;
				2347	do {
				2348	U8_PREV_OR_FFFD(s, 0, length, c);
				2349	if(spanCondition!=contains(c)) {
				2350	break;
				2351	}
				2352	} while((prev=length)>0);
				2353	return prev;
				2354	}
				2355
				2356	U_NAMESPACE_END