Blame - source/common/uniset.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 4faace525c595b6495b213e459bd7303d7166167 [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	3	/*
				4	**********************************************************************
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	5	* Copyright (C) 1999-2015, International Business Machines
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	6	* Corporation and others. All Rights Reserved.
				7	**********************************************************************
				8	* Date Name Description
				9	* 10/20/99 alan Creation.
				10	**********************************************************************
				11	*/
				12
				13	#include "unicode/utypes.h"
				14	#include "unicode/parsepos.h"
				15	#include "unicode/symtable.h"
				16	#include "unicode/uniset.h"
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	17	#include "unicode/ustring.h"
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	18	#include "unicode/utf8.h"
				19	#include "unicode/utf16.h"
				20	#include "ruleiter.h"
				21	#include "cmemory.h"
				22	#include "cstring.h"
				23	#include "patternprops.h"
				24	#include "uelement.h"
				25	#include "util.h"
				26	#include "uvector.h"
				27	#include "charstr.h"
				28	#include "ustrfmt.h"
				29	#include "uassert.h"
				30	#include "bmpset.h"
				31	#include "unisetspan.h"
				32
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	33	// HIGH_VALUE > all valid values. 110000 for codepoints
				34	#define UNICODESET_HIGH 0x0110000
				35
				36	// LOW <= all valid values. ZERO for codepoints
				37	#define UNICODESET_LOW 0x000000
				38
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	39	/** Max list [0, 1, 2, ..., max code point, HIGH] */
				40	constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	41
				42	U_NAMESPACE_BEGIN
				43
				44	SymbolTable::~SymbolTable() {}
				45
				46	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
				47
				48	/**
				49	* Modify the given UChar32 variable so that it is in range, by
				50	* pinning values < UNICODESET_LOW to UNICODESET_LOW, and
				51	* pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
				52	* It modifies its argument in-place and also returns it.
				53	*/
				54	static inline UChar32 pinCodePoint(UChar32& c) {
				55	if (c < UNICODESET_LOW) {
				56	c = UNICODESET_LOW;
				57	} else if (c > (UNICODESET_HIGH-1)) {
				58	c = (UNICODESET_HIGH-1);
				59	}
				60	return c;
				61	}
				62
				63	//----------------------------------------------------------------
				64	// Debugging
				65	//----------------------------------------------------------------
				66
				67	// DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
				68	// To enable the debugging, define the symbol DEBUG_MEM in the line
				69	// below. This will result in text being sent to stdout that looks
				70	// like this:
				71	// DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
				72	// DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
				73	// Each line lists a construction (ct) or destruction (dt) event, the
				74	// object address, the number of outstanding objects after the event,
				75	// and the pattern of the object in question.
				76
				77	// #define DEBUG_MEM
				78
				79	#ifdef DEBUG_MEM
				80	#include <stdio.h>
				81	static int32_t _dbgCount = 0;
				82
				83	static inline void _dbgct(UnicodeSet* set) {
				84	UnicodeString str;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	85	set->toPattern(str, true);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	86	char buf[40];
				87	str.extract(0, 39, buf, "");
				88	printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
				89	}
				90
				91	static inline void _dbgdt(UnicodeSet* set) {
				92	UnicodeString str;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	93	set->toPattern(str, true);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	94	char buf[40];
				95	str.extract(0, 39, buf, "");
				96	printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
				97	}
				98
				99	#else
				100
				101	#define _dbgct(set)
				102	#define _dbgdt(set)
				103
				104	#endif
				105
				106	//----------------------------------------------------------------
				107	// UnicodeString in UVector support
				108	//----------------------------------------------------------------
				109
				110	static void U_CALLCONV cloneUnicodeString(UElement dst, UElement src) {
				111	dst->pointer = new UnicodeString((UnicodeString)src->pointer);
				112	}
				113
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	114	static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	115	const UnicodeString &a = (const UnicodeString)t1.pointer;
				116	const UnicodeString &b = (const UnicodeString)t2.pointer;
				117	return a.compare(b);
				118	}
				119
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	120	UBool UnicodeSet::hasStrings() const {
				121	return strings != nullptr && !strings->isEmpty();
				122	}
				123
				124	int32_t UnicodeSet::stringsSize() const {
				125	return strings == nullptr ? 0 : strings->size();
				126	}
				127
				128	UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
				129	return strings != nullptr && strings->contains((void*) &s);
				130	}
				131
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	132	//----------------------------------------------------------------
				133	// Constructors &c
				134	//----------------------------------------------------------------
				135
				136	/**
				137	* Constructs an empty set.
				138	*/
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	139	UnicodeSet::UnicodeSet() {
				140	list[0] = UNICODESET_HIGH;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	141	_dbgct(this);
				142	}
				143
				144	/**
				145	* Constructs a set containing the given range. If <code>end >
				146	* start</code> then an empty set is created.
				147	*
				148	* @param start first character, inclusive, of range
				149	* @param end last character, inclusive, of range
				150	*/
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	151	UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
				152	list[0] = UNICODESET_HIGH;
				153	add(start, end);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	154	_dbgct(this);
				155	}
				156
				157	/**
				158	* Constructs a set that is identical to the given UnicodeSet.
				159	*/
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	160	UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {
				161	*this = o;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	162	_dbgct(this);
				163	}
				164
				165	// Copy-construct as thawed.
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	166	UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) {
				167	if (ensureCapacity(o.len)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	168	// *this = o except for bmpSet and stringSpan
				169	len = o.len;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	170	uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	171	if (o.hasStrings()) {
				172	UErrorCode status = U_ZERO_ERROR;
				173	if (!allocateStrings(status) \|\|
				174	(strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
				175	setToBogus();
				176	return;
				177	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	178	}
				179	if (o.pat) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	180	setPattern(o.pat, o.patLen);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	181	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	182	_dbgct(this);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	183	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	184	}
				185
				186	/**
				187	* Destructs the set.
				188	*/
				189	UnicodeSet::~UnicodeSet() {
				190	_dbgdt(this); // first!
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	191	if (list != stackList) {
				192	uprv_free(list);
				193	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	194	delete bmpSet;
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	195	if (buffer != stackList) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	196	uprv_free(buffer);
				197	}
				198	delete strings;
				199	delete stringSpan;
				200	releasePattern();
				201	}
				202
				203	/**
				204	* Assigns this object to be a copy of another.
				205	*/
				206	UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	207	return copyFrom(o, false);
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	208	}
				209
				210	UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	211	if (this == &o) {
				212	return *this;
				213	}
				214	if (isFrozen()) {
				215	return *this;
				216	}
				217	if (o.isBogus()) {
				218	setToBogus();
				219	return *this;
				220	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	221	if (!ensureCapacity(o.len)) {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	222	// ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
				223	return *this;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	224	}
				225	len = o.len;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	226	uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	227	if (o.bmpSet != nullptr && !asThawed) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	228	bmpSet = new BMPSet(*o.bmpSet, list, len);
				229	if (bmpSet == NULL) { // Check for memory allocation error.
				230	setToBogus();
				231	return *this;
				232	}
				233	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	234	if (o.hasStrings()) {
				235	UErrorCode status = U_ZERO_ERROR;
				236	if ((strings == nullptr && !allocateStrings(status)) \|\|
				237	(strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
				238	setToBogus();
				239	return *this;
				240	}
				241	} else if (hasStrings()) {
				242	strings->removeAllElements();
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	243	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	244	if (o.stringSpan != nullptr && !asThawed) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	245	stringSpan = new UnicodeSetStringSpan(o.stringSpan, strings);
				246	if (stringSpan == NULL) { // Check for memory allocation error.
				247	setToBogus();
				248	return *this;
				249	}
				250	}
				251	releasePattern();
				252	if (o.pat) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	253	setPattern(o.pat, o.patLen);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	254	}
				255	return *this;
				256	}
				257
				258	/**
				259	* Returns a copy of this object. All UnicodeMatcher objects have
				260	* to support cloning in order to allow classes using
				261	* UnicodeMatchers, such as Transliterator, to implement cloning.
				262	*/
Frank Tang	b869661	2019-10-25 14:58:21 -0700	[diff] [blame]	263	UnicodeSet* UnicodeSet::clone() const {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	264	return new UnicodeSet(*this);
				265	}
				266
Frank Tang	b869661	2019-10-25 14:58:21 -0700	[diff] [blame]	267	UnicodeSet *UnicodeSet::cloneAsThawed() const {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	268	return new UnicodeSet(*this, true);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	269	}
				270
				271	/**
				272	* Compares the specified object with this set for equality. Returns
				273	* <tt>true</tt> if the two sets
				274	* have the same size, and every member of the specified set is
				275	* contained in this set (or equivalently, every member of this set is
				276	* contained in the specified set).
				277	*
				278	* @param o set to be compared for equality with this set.
				279	* @return <tt>true</tt> if the specified set is equal to this set.
				280	*/
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	281	bool UnicodeSet::operator==(const UnicodeSet& o) const {
				282	if (len != o.len) return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	283	for (int32_t i = 0; i < len; ++i) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	284	if (list[i] != o.list[i]) return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	285	}
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	286	if (hasStrings() != o.hasStrings()) { return false; }
				287	if (hasStrings() && strings != o.strings) return false;
				288	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	289	}
				290
				291	/**
				292	* Returns the hash code value for this set.
				293	*
				294	* @return the hash code value for this set.
				295	* @see Object#hashCode()
				296	*/
				297	int32_t UnicodeSet::hashCode(void) const {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	298	uint32_t result = static_cast<uint32_t>(len);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	299	for (int32_t i = 0; i < len; ++i) {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	300	result *= 1000003u;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	301	result += list[i];
				302	}
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	303	return static_cast<int32_t>(result);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	304	}
				305
				306	//----------------------------------------------------------------
				307	// Public API
				308	//----------------------------------------------------------------
				309
				310	/**
				311	* Returns the number of elements in this set (its cardinality),
				312	* Note than the elements of a set may include both individual
				313	* codepoints and strings.
				314	*
				315	* @return the number of elements in this set (its cardinality).
				316	*/
				317	int32_t UnicodeSet::size(void) const {
				318	int32_t n = 0;
				319	int32_t count = getRangeCount();
				320	for (int32_t i = 0; i < count; ++i) {
				321	n += getRangeEnd(i) - getRangeStart(i) + 1;
				322	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	323	return n + stringsSize();
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	324	}
				325
				326	/**
				327	* Returns <tt>true</tt> if this set contains no elements.
				328	*
				329	* @return <tt>true</tt> if this set contains no elements.
				330	*/
				331	UBool UnicodeSet::isEmpty(void) const {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	332	return len == 1 && !hasStrings();
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	333	}
				334
				335	/**
				336	* Returns true if this set contains the given character.
				337	* @param c character to be checked for containment
				338	* @return true if the test condition is met
				339	*/
				340	UBool UnicodeSet::contains(UChar32 c) const {
				341	// Set i to the index of the start item greater than ch
				342	// We know we will terminate without length test!
				343	// LATER: for large sets, add binary search
				344	//int32_t i = -1;
				345	//for (;;) {
				346	// if (c < list[++i]) break;
				347	//}
				348	if (bmpSet != NULL) {
				349	return bmpSet->contains(c);
				350	}
				351	if (stringSpan != NULL) {
				352	return stringSpan->contains(c);
				353	}
				354	if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	355	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	356	}
				357	int32_t i = findCodePoint(c);
				358	return (UBool)(i & 1); // return true if odd
				359	}
				360
				361	/**
				362	* Returns the smallest value i such that c < list[i]. Caller
				363	* must ensure that c is a legal value or this method will enter
				364	* an infinite loop. This method performs a binary search.
				365	* @param c a character in the range MIN_VALUE..MAX_VALUE
				366	* inclusive
				367	* @return the smallest integer i in the range 0..len-1,
				368	* inclusive, such that c < list[i]
				369	*/
				370	int32_t UnicodeSet::findCodePoint(UChar32 c) const {
				371	/* Examples:
				372	findCodePoint(c)
				373	set list[] c=0 1 3 4 7 8
				374	=== ============== ===========
				375	[] [110000] 0 0 0 0 0 0
				376	[\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
				377	[\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
				378	[:Any:] [0, 110000] 1 1 1 1 1 1
				379	*/
				380
				381	// Return the smallest i such that c < list[i]. Assume
				382	// list[len - 1] == HIGH and that c is legal (0..HIGH-1).
				383	if (c < list[0])
				384	return 0;
				385	// High runner test. c is often after the last range, so an
				386	// initial check for this condition pays off.
				387	int32_t lo = 0;
				388	int32_t hi = len - 1;
				389	if (lo >= hi \|\| c >= list[hi-1])
				390	return hi;
				391	// invariant: c >= list[lo]
				392	// invariant: c < list[hi]
				393	for (;;) {
				394	int32_t i = (lo + hi) >> 1;
				395	if (i == lo) {
				396	break; // Found!
				397	} else if (c < list[i]) {
				398	hi = i;
				399	} else {
				400	lo = i;
				401	}
				402	}
				403	return hi;
				404	}
				405
				406	/**
				407	* Returns true if this set contains every character
				408	* of the given range.
				409	* @param start first character, inclusive, of the range
				410	* @param end last character, inclusive, of the range
				411	* @return true if the test condition is met
				412	*/
				413	UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
				414	//int32_t i = -1;
				415	//for (;;) {
				416	// if (start < list[++i]) break;
				417	//}
				418	int32_t i = findCodePoint(start);
				419	return ((i & 1) != 0 && end < list[i]);
				420	}
				421
				422	/**
				423	* Returns <tt>true</tt> if this set contains the given
				424	* multicharacter string.
				425	* @param s string to be checked for containment
				426	* @return <tt>true</tt> if this set contains the specified string
				427	*/
				428	UBool UnicodeSet::contains(const UnicodeString& s) const {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	429	int32_t cp = getSingleCP(s);
				430	if (cp < 0) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	431	return stringsContains(s);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	432	} else {
				433	return contains((UChar32) cp);
				434	}
				435	}
				436
				437	/**
				438	* Returns true if this set contains all the characters and strings
				439	* of the given set.
				440	* @param c set to be checked for containment
				441	* @return true if the test condition is met
				442	*/
				443	UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
				444	// The specified set is a subset if all of its pairs are contained in
				445	// this set. It's possible to code this more efficiently in terms of
				446	// direct manipulation of the inversion lists if the need arises.
				447	int32_t n = c.getRangeCount();
				448	for (int i=0; i<n; ++i) {
				449	if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	450	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	451	}
				452	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	453	return !c.hasStrings() \|\| (strings != nullptr && strings->containsAll(*c.strings));
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	454	}
				455
				456	/**
				457	* Returns true if this set contains all the characters
				458	* of the given string.
				459	* @param s string containing characters to be checked for containment
				460	* @return true if the test condition is met
				461	*/
				462	UBool UnicodeSet::containsAll(const UnicodeString& s) const {
				463	return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
				464	s.length());
				465	}
				466
				467	/**
				468	* Returns true if this set contains none of the characters
				469	* of the given range.
				470	* @param start first character, inclusive, of the range
				471	* @param end last character, inclusive, of the range
				472	* @return true if the test condition is met
				473	*/
				474	UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
				475	//int32_t i = -1;
				476	//for (;;) {
				477	// if (start < list[++i]) break;
				478	//}
				479	int32_t i = findCodePoint(start);
				480	return ((i & 1) == 0 && end < list[i]);
				481	}
				482
				483	/**
				484	* Returns true if this set contains none of the characters and strings
				485	* of the given set.
				486	* @param c set to be checked for containment
				487	* @return true if the test condition is met
				488	*/
				489	UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
				490	// The specified set is a subset if all of its pairs are contained in
				491	// this set. It's possible to code this more efficiently in terms of
				492	// direct manipulation of the inversion lists if the need arises.
				493	int32_t n = c.getRangeCount();
				494	for (int32_t i=0; i<n; ++i) {
				495	if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	496	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	497	}
				498	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	499	return strings == nullptr \|\| !c.hasStrings() \|\| strings->containsNone(*c.strings);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	500	}
				501
				502	/**
				503	* Returns true if this set contains none of the characters
				504	* of the given string.
				505	* @param s string containing characters to be checked for containment
				506	* @return true if the test condition is met
				507	*/
				508	UBool UnicodeSet::containsNone(const UnicodeString& s) const {
				509	return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
				510	s.length());
				511	}
				512
				513	/**
				514	* Returns <tt>true</tt> if this set contains any character whose low byte
				515	* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
				516	* indexing.
				517	*/
				518	UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
				519	/* The index value v, in the range [0,255], is contained in this set if
				520	* it is contained in any pair of this set. Pairs either have the high
				521	* bytes equal, or unequal. If the high bytes are equal, then we have
				522	* aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
				523	* v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
				524	* Then v is contained if xx <= v \|\| v <= yy. (This is identical to the
				525	* time zone month containment logic.)
				526	*/
				527	int32_t i;
				528	int32_t rangeCount=getRangeCount();
				529	for (i=0; i<rangeCount; ++i) {
				530	UChar32 low = getRangeStart(i);
				531	UChar32 high = getRangeEnd(i);
				532	if ((low & ~0xFF) == (high & ~0xFF)) {
				533	if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	534	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	535	}
				536	} else if ((low & 0xFF) <= v \|\| v <= (high & 0xFF)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	537	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	538	}
				539	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	540	if (hasStrings()) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	541	for (i=0; i<strings->size(); ++i) {
				542	const UnicodeString& s = (const UnicodeString)strings->elementAt(i);
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	543	if (s.isEmpty()) {
				544	continue; // skip the empty string
				545	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	546	UChar32 c = s.char32At(0);
				547	if ((c & 0xFF) == v) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	548	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	549	}
				550	}
				551	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	552	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	553	}
				554
				555	/**
				556	* Implementation of UnicodeMatcher::matches(). Always matches the
				557	* longest possible multichar string.
				558	*/
				559	UMatchDegree UnicodeSet::matches(const Replaceable& text,
				560	int32_t& offset,
				561	int32_t limit,
				562	UBool incremental) {
				563	if (offset == limit) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	564	if (contains(U_ETHER)) {
				565	return incremental ? U_PARTIAL_MATCH : U_MATCH;
				566	} else {
				567	return U_MISMATCH;
				568	}
				569	} else {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	570	if (hasStrings()) { // try strings first
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	571
				572	// might separate forward and backward loops later
				573	// for now they are combined
				574
				575	// TODO Improve efficiency of this, at least in the forward
				576	// direction, if not in both. In the forward direction we
				577	// can assume the strings are sorted.
				578
				579	int32_t i;
				580	UBool forward = offset < limit;
				581
				582	// firstChar is the leftmost char to match in the
				583	// forward direction or the rightmost char to match in
				584	// the reverse direction.
				585	UChar firstChar = text.charAt(offset);
				586
				587	// If there are multiple strings that can match we
				588	// return the longest match.
				589	int32_t highWaterLength = 0;
				590
				591	for (i=0; i<strings->size(); ++i) {
				592	const UnicodeString& trial = (const UnicodeString)strings->elementAt(i);
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	593	if (trial.isEmpty()) {
				594	continue; // skip the empty string
				595	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	596
				597	UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
				598
				599	// Strings are sorted, so we can optimize in the
				600	// forward direction.
				601	if (forward && c > firstChar) break;
				602	if (c != firstChar) continue;
				603
				604	int32_t matchLen = matchRest(text, offset, limit, trial);
				605
				606	if (incremental) {
				607	int32_t maxLen = forward ? limit-offset : offset-limit;
				608	if (matchLen == maxLen) {
				609	// We have successfully matched but only up to limit.
				610	return U_PARTIAL_MATCH;
				611	}
				612	}
				613
				614	if (matchLen == trial.length()) {
				615	// We have successfully matched the whole string.
				616	if (matchLen > highWaterLength) {
				617	highWaterLength = matchLen;
				618	}
				619	// In the forward direction we know strings
				620	// are sorted so we can bail early.
				621	if (forward && matchLen < highWaterLength) {
				622	break;
				623	}
				624	continue;
				625	}
				626	}
				627
				628	// We've checked all strings without a partial match.
				629	// If we have full matches, return the longest one.
				630	if (highWaterLength != 0) {
				631	offset += forward ? highWaterLength : -highWaterLength;
				632	return U_MATCH;
				633	}
				634	}
				635	return UnicodeFilter::matches(text, offset, limit, incremental);
				636	}
				637	}
				638
				639	/**
				640	* Returns the longest match for s in text at the given position.
				641	* If limit > start then match forward from start+1 to limit
				642	* matching all characters except s.charAt(0). If limit < start,
				643	* go backward starting from start-1 matching all characters
				644	* except s.charAt(s.length()-1). This method assumes that the
				645	* first character, text.charAt(start), matches s, so it does not
				646	* check it.
				647	* @param text the text to match
				648	* @param start the first character to match. In the forward
				649	* direction, text.charAt(start) is matched against s.charAt(0).
				650	* In the reverse direction, it is matched against
				651	* s.charAt(s.length()-1).
				652	* @param limit the limit offset for matching, either last+1 in
				653	* the forward direction, or last-1 in the reverse direction,
				654	* where last is the index of the last character to match.
				655	* @return If part of s matches up to the limit, return \|limit -
				656	* start\|. If all of s matches before reaching the limit, return
				657	* s.length(). If there is a mismatch between s and text, return
				658	* 0
				659	*/
				660	int32_t UnicodeSet::matchRest(const Replaceable& text,
				661	int32_t start, int32_t limit,
				662	const UnicodeString& s) {
				663	int32_t i;
				664	int32_t maxLen;
				665	int32_t slen = s.length();
				666	if (start < limit) {
				667	maxLen = limit - start;
				668	if (maxLen > slen) maxLen = slen;
				669	for (i = 1; i < maxLen; ++i) {
				670	if (text.charAt(start + i) != s.charAt(i)) return 0;
				671	}
				672	} else {
				673	maxLen = start - limit;
				674	if (maxLen > slen) maxLen = slen;
				675	--slen; // <=> slen = s.length() - 1;
				676	for (i = 1; i < maxLen; ++i) {
				677	if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
				678	}
				679	}
				680	return maxLen;
				681	}
				682
				683	/**
				684	* Implement of UnicodeMatcher
				685	*/
				686	void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
				687	toUnionTo.addAll(*this);
				688	}
				689
				690	/**
				691	* Returns the index of the given character within this set, where
				692	* the set is ordered by ascending code point. If the character
				693	* is not in this set, return -1. The inverse of this method is
				694	* <code>charAt()</code>.
				695	* @return an index from 0..size()-1, or -1
				696	*/
				697	int32_t UnicodeSet::indexOf(UChar32 c) const {
				698	if (c < MIN_VALUE \|\| c > MAX_VALUE) {
				699	return -1;
				700	}
				701	int32_t i = 0;
				702	int32_t n = 0;
				703	for (;;) {
				704	UChar32 start = list[i++];
				705	if (c < start) {
				706	return -1;
				707	}
				708	UChar32 limit = list[i++];
				709	if (c < limit) {
				710	return n + c - start;
				711	}
				712	n += limit - start;
				713	}
				714	}
				715
				716	/**
				717	* Returns the character at the given index within this set, where
				718	* the set is ordered by ascending code point. If the index is
				719	* out of range, return (UChar32)-1. The inverse of this method is
				720	* <code>indexOf()</code>.
				721	* @param index an index from 0..size()-1
				722	* @return the character at the given index, or (UChar32)-1.
				723	*/
				724	UChar32 UnicodeSet::charAt(int32_t index) const {
				725	if (index >= 0) {
				726	// len2 is the largest even integer <= len, that is, it is len
				727	// for even values and len-1 for odd values. With odd values
				728	// the last entry is UNICODESET_HIGH.
				729	int32_t len2 = len & ~1;
				730	for (int32_t i=0; i < len2;) {
				731	UChar32 start = list[i++];
				732	int32_t count = list[i++] - start;
				733	if (index < count) {
				734	return (UChar32)(start + index);
				735	}
				736	index -= count;
				737	}
				738	}
				739	return (UChar32)-1;
				740	}
				741
				742	/**
				743	* Make this object represent the range <code>start - end</code>.
				744	* If <code>end > start</code> then this object is set to an
				745	* an empty range.
				746	*
				747	* @param start first character in the set, inclusive
				748	* @rparam end last character in the set, inclusive
				749	*/
				750	UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
				751	clear();
				752	complement(start, end);
				753	return *this;
				754	}
				755
				756	/**
				757	* Adds the specified range to this set if it is not already
				758	* present. If this set already contains the specified range,
				759	* the call leaves this set unchanged. If <code>end > start</code>
				760	* then an empty range is added, leaving the set unchanged.
				761	*
				762	* @param start first character, inclusive, of range to be added
				763	* to this set.
				764	* @param end last character, inclusive, of range to be added
				765	* to this set.
				766	*/
				767	UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
				768	if (pinCodePoint(start) < pinCodePoint(end)) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	769	UChar32 limit = end + 1;
				770	// Fast path for adding a new range after the last one.
				771	// Odd list length: [..., lastStart, lastLimit, HIGH]
				772	if ((len & 1) != 0) {
				773	// If the list is empty, set lastLimit low enough to not be adjacent to 0.
				774	UChar32 lastLimit = len == 1 ? -2 : list[len - 2];
				775	if (lastLimit <= start && !isFrozen() && !isBogus()) {
				776	if (lastLimit == start) {
				777	// Extend the last range.
				778	list[len - 2] = limit;
				779	if (limit == UNICODESET_HIGH) {
				780	--len;
				781	}
				782	} else {
				783	list[len - 1] = start;
				784	if (limit < UNICODESET_HIGH) {
				785	if (ensureCapacity(len + 2)) {
				786	list[len++] = limit;
				787	list[len++] = UNICODESET_HIGH;
				788	}
				789	} else { // limit == UNICODESET_HIGH
				790	if (ensureCapacity(len + 1)) {
				791	list[len++] = UNICODESET_HIGH;
				792	}
				793	}
				794	}
				795	releasePattern();
				796	return *this;
				797	}
				798	}
				799	// This is slow. Could be much faster using findCodePoint(start)
				800	// and modifying the list, dealing with adjacent & overlapping ranges.
				801	UChar32 range[3] = { start, limit, UNICODESET_HIGH };
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	802	add(range, 2, 0);
				803	} else if (start == end) {
				804	add(start);
				805	}
				806	return *this;
				807	}
				808
				809	// #define DEBUG_US_ADD
				810
				811	#ifdef DEBUG_US_ADD
				812	#include <stdio.h>
				813	void dump(UChar32 c) {
				814	if (c <= 0xFF) {
				815	printf("%c", (char)c);
				816	} else {
				817	printf("U+%04X", c);
				818	}
				819	}
				820	void dump(const UChar32* list, int32_t len) {
				821	printf("[");
				822	for (int32_t i=0; i<len; ++i) {
				823	if (i != 0) printf(", ");
				824	dump(list[i]);
				825	}
				826	printf("]");
				827	}
				828	#endif
				829
				830	/**
				831	* Adds the specified character to this set if it is not already
				832	* present. If this set already contains the specified character,
				833	* the call leaves this set unchanged.
				834	*/
				835	UnicodeSet& UnicodeSet::add(UChar32 c) {
				836	// find smallest i such that c < list[i]
				837	// if odd, then it is IN the set
				838	// if even, then it is OUT of the set
				839	int32_t i = findCodePoint(pinCodePoint(c));
				840
				841	// already in set?
				842	if ((i & 1) != 0 \|\| isFrozen() \|\| isBogus()) return *this;
				843
				844	// HIGH is 0x110000
				845	// assert(list[len-1] == HIGH);
				846
				847	// empty = [HIGH]
				848	// [start_0, limit_0, start_1, limit_1, HIGH]
				849
				850	// [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
				851	// ^
				852	// list[i]
				853
				854	// i == 0 means c is before the first range
				855
				856	#ifdef DEBUG_US_ADD
				857	printf("Add of ");
				858	dump(c);
				859	printf(" found at %d", i);
				860	printf(": ");
				861	dump(list, len);
				862	printf(" => ");
				863	#endif
				864
				865	if (c == list[i]-1) {
				866	// c is before start of next range
				867	list[i] = c;
				868	// if we touched the HIGH mark, then add a new one
				869	if (c == (UNICODESET_HIGH - 1)) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	870	if (!ensureCapacity(len+1)) {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	871	// ensureCapacity will mark the object as Bogus if OOM failure happens.
				872	return *this;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	873	}
				874	list[len++] = UNICODESET_HIGH;
				875	}
				876	if (i > 0 && c == list[i-1]) {
				877	// collapse adjacent ranges
				878
				879	// [..., start_k-1, c, c, limit_k, ..., HIGH]
				880	// ^
				881	// list[i]
				882
				883	//for (int32_t k=i-1; k<len-2; ++k) {
				884	// list[k] = list[k+2];
				885	//}
				886	UChar32* dst = list + i - 1;
				887	UChar32* src = dst + 2;
				888	UChar32* srclimit = list + len;
				889	while (src < srclimit) (dst++) = (src++);
				890
				891	len -= 2;
				892	}
				893	}
				894
				895	else if (i > 0 && c == list[i-1]) {
				896	// c is after end of prior range
				897	list[i-1]++;
				898	// no need to check for collapse here
				899	}
				900
				901	else {
				902	// At this point we know the new char is not adjacent to
				903	// any existing ranges, and it is not 10FFFF.
				904
				905
				906	// [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
				907	// ^
				908	// list[i]
				909
				910	// [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
				911	// ^
				912	// list[i]
				913
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	914	if (!ensureCapacity(len+2)) {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	915	// ensureCapacity will mark the object as Bogus if OOM failure happens.
				916	return *this;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	917	}
				918
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	919	UChar32 *p = list + i;
				920	uprv_memmove(p + 2, p, (len - i) * sizeof(*p));
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	921	list[i] = c;
				922	list[i+1] = c+1;
				923	len += 2;
				924	}
				925
				926	#ifdef DEBUG_US_ADD
				927	dump(list, len);
				928	printf("\n");
				929
				930	for (i=1; i<len; ++i) {
				931	if (list[i] <= list[i-1]) {
				932	// Corrupt array!
				933	printf("ERROR: list has been corrupted\n");
				934	exit(1);
				935	}
				936	}
				937	#endif
				938
				939	releasePattern();
				940	return *this;
				941	}
				942
				943	/**
				944	* Adds the specified multicharacter to this set if it is not already
				945	* present. If this set already contains the multicharacter,
				946	* the call leaves this set unchanged.
				947	* Thus "ch" => {"ch"}
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	948	*
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	949	* @param s the source string
				950	* @return the modified set, for chaining
				951	*/
				952	UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	953	if (isFrozen() \|\| isBogus()) return *this;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	954	int32_t cp = getSingleCP(s);
				955	if (cp < 0) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	956	if (!stringsContains(s)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	957	_add(s);
				958	releasePattern();
				959	}
				960	} else {
				961	add((UChar32)cp);
				962	}
				963	return *this;
				964	}
				965
				966	/**
				967	* Adds the given string, in order, to 'strings'. The given string
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	968	* must have been checked by the caller to not already be in 'strings'.
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	969	*/
				970	void UnicodeSet::_add(const UnicodeString& s) {
				971	if (isFrozen() \|\| isBogus()) {
				972	return;
				973	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	974	UErrorCode ec = U_ZERO_ERROR;
				975	if (strings == nullptr && !allocateStrings(ec)) {
				976	setToBogus();
				977	return;
				978	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	979	UnicodeString* t = new UnicodeString(s);
				980	if (t == NULL) { // Check for memory allocation error.
				981	setToBogus();
				982	return;
				983	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	984	strings->sortedInsert(t, compareUnicodeString, ec);
				985	if (U_FAILURE(ec)) {
				986	setToBogus();
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	987	}
				988	}
				989
				990	/**
				991	* @return a code point IF the string consists of a single one.
				992	* otherwise returns -1.
				993	* @param string to test
				994	*/
				995	int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	996	int32_t sLength = s.length();
				997	if (sLength == 1) return s.charAt(0);
				998	if (sLength == 2) {
				999	UChar32 cp = s.char32At(0);
				1000	if (cp > 0xFFFF) { // is surrogate pair
				1001	return cp;
				1002	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1003	}
				1004	return -1;
				1005	}
				1006
				1007	/**
				1008	* Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
				1009	* If this set already any particular character, it has no effect on that character.
				1010	* @param the source string
				1011	* @return the modified set, for chaining
				1012	*/
				1013	UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
				1014	UChar32 cp;
				1015	for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
				1016	cp = s.char32At(i);
				1017	add(cp);
				1018	}
				1019	return *this;
				1020	}
				1021
				1022	/**
				1023	* Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
				1024	* If this set already any particular character, it has no effect on that character.
				1025	* @param the source string
				1026	* @return the modified set, for chaining
				1027	*/
				1028	UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
				1029	UnicodeSet set;
				1030	set.addAll(s);
				1031	retainAll(set);
				1032	return *this;
				1033	}
				1034
				1035	/**
				1036	* Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
				1037	* If this set already any particular character, it has no effect on that character.
				1038	* @param the source string
				1039	* @return the modified set, for chaining
				1040	*/
				1041	UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
				1042	UnicodeSet set;
				1043	set.addAll(s);
				1044	complementAll(set);
				1045	return *this;
				1046	}
				1047
				1048	/**
				1049	* Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
				1050	* If this set already any particular character, it has no effect on that character.
				1051	* @param the source string
				1052	* @return the modified set, for chaining
				1053	*/
				1054	UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
				1055	UnicodeSet set;
				1056	set.addAll(s);
				1057	removeAll(set);
				1058	return *this;
				1059	}
				1060
				1061	UnicodeSet& UnicodeSet::removeAllStrings() {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1062	if (!isFrozen() && hasStrings()) {
				1063	strings->removeAllElements();
				1064	releasePattern();
				1065	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1066	return *this;
				1067	}
				1068
				1069
				1070	/**
				1071	* Makes a set from a multicharacter string. Thus "ch" => {"ch"}
				1072	* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
				1073	* @param the source string
				1074	* @return a newly created set containing the given string
				1075	*/
				1076	UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
				1077	UnicodeSet *set = new UnicodeSet();
				1078	if (set != NULL) { // Check for memory allocation error.
				1079	set->add(s);
				1080	}
				1081	return set;
				1082	}
				1083
				1084
				1085	/**
				1086	* Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
				1087	* @param the source string
				1088	* @return a newly created set containing the given characters
				1089	*/
				1090	UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
				1091	UnicodeSet *set = new UnicodeSet();
				1092	if (set != NULL) { // Check for memory allocation error.
				1093	set->addAll(s);
				1094	}
				1095	return set;
				1096	}
				1097
				1098	/**
				1099	* Retain only the elements in this set that are contained in the
				1100	* specified range. If <code>end > start</code> then an empty range is
				1101	* retained, leaving the set empty.
				1102	*
				1103	* @param start first character, inclusive, of range to be retained
				1104	* to this set.
				1105	* @param end last character, inclusive, of range to be retained
				1106	* to this set.
				1107	*/
				1108	UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
				1109	if (pinCodePoint(start) <= pinCodePoint(end)) {
				1110	UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
				1111	retain(range, 2, 0);
				1112	} else {
				1113	clear();
				1114	}
				1115	return *this;
				1116	}
				1117
				1118	UnicodeSet& UnicodeSet::retain(UChar32 c) {
				1119	return retain(c, c);
				1120	}
				1121
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	1122	UnicodeSet& UnicodeSet::retain(const UnicodeString &s) {
				1123	if (isFrozen() \|\| isBogus()) { return *this; }
				1124	UChar32 cp = getSingleCP(s);
				1125	if (cp < 0) {
				1126	bool isIn = stringsContains(s);
				1127	// Check for getRangeCount() first to avoid somewhat-expensive size()
				1128	// when there are single code points.
				1129	if (isIn && getRangeCount() == 0 && size() == 1) {
				1130	return *this;
				1131	}
				1132	clear();
				1133	if (isIn) {
				1134	_add(s);
				1135	}
				1136	} else {
				1137	retain(cp, cp);
				1138	}
				1139	return *this;
				1140	}
				1141
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1142	/**
				1143	* Removes the specified range from this set if it is present.
				1144	* The set will not contain the specified range once the call
				1145	* returns. If <code>end > start</code> then an empty range is
				1146	* removed, leaving the set unchanged.
				1147	*
				1148	* @param start first character, inclusive, of range to be removed
				1149	* from this set.
				1150	* @param end last character, inclusive, of range to be removed
				1151	* from this set.
				1152	*/
				1153	UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
				1154	if (pinCodePoint(start) <= pinCodePoint(end)) {
				1155	UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
				1156	retain(range, 2, 2);
				1157	}
				1158	return *this;
				1159	}
				1160
				1161	/**
				1162	* Removes the specified character from this set if it is present.
				1163	* The set will not contain the specified range once the call
				1164	* returns.
				1165	*/
				1166	UnicodeSet& UnicodeSet::remove(UChar32 c) {
				1167	return remove(c, c);
				1168	}
				1169
				1170	/**
				1171	* Removes the specified string from this set if it is present.
				1172	* The set will not contain the specified character once the call
				1173	* returns.
				1174	* @param the source string
				1175	* @return the modified set, for chaining
				1176	*/
				1177	UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	1178	if (isFrozen() \|\| isBogus()) return *this;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1179	int32_t cp = getSingleCP(s);
				1180	if (cp < 0) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1181	if (strings != nullptr && strings->removeElement((void*) &s)) {
				1182	releasePattern();
				1183	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1184	} else {
				1185	remove((UChar32)cp, (UChar32)cp);
				1186	}
				1187	return *this;
				1188	}
				1189
				1190	/**
				1191	* Complements the specified range in this set. Any character in
				1192	* the range will be removed if it is in this set, or will be
				1193	* added if it is not in this set. If <code>end > start</code>
				1194	* then an empty range is xor'ed, leaving the set unchanged.
				1195	*
				1196	* @param start first character, inclusive, of range to be removed
				1197	* from this set.
				1198	* @param end last character, inclusive, of range to be removed
				1199	* from this set.
				1200	*/
				1201	UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
				1202	if (isFrozen() \|\| isBogus()) {
				1203	return *this;
				1204	}
				1205	if (pinCodePoint(start) <= pinCodePoint(end)) {
				1206	UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
				1207	exclusiveOr(range, 2, 0);
				1208	}
				1209	releasePattern();
				1210	return *this;
				1211	}
				1212
				1213	UnicodeSet& UnicodeSet::complement(UChar32 c) {
				1214	return complement(c, c);
				1215	}
				1216
				1217	/**
				1218	* This is equivalent to
				1219	* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
				1220	*/
				1221	UnicodeSet& UnicodeSet::complement(void) {
				1222	if (isFrozen() \|\| isBogus()) {
				1223	return *this;
				1224	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1225	if (list[0] == UNICODESET_LOW) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1226	uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1227	--len;
				1228	} else {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1229	if (!ensureCapacity(len+1)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1230	return *this;
				1231	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1232	uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));
				1233	list[0] = UNICODESET_LOW;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1234	++len;
				1235	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1236	releasePattern();
				1237	return *this;
				1238	}
				1239
				1240	/**
				1241	* Complement the specified string in this set.
				1242	* The set will not contain the specified string once the call
				1243	* returns.
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	1244	*
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1245	* @param s the string to complement
				1246	* @return this object, for chaining
				1247	*/
				1248	UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	1249	if (isFrozen() \|\| isBogus()) return *this;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1250	int32_t cp = getSingleCP(s);
				1251	if (cp < 0) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1252	if (stringsContains(s)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1253	strings->removeElement((void*) &s);
				1254	} else {
				1255	_add(s);
				1256	}
				1257	releasePattern();
				1258	} else {
				1259	complement((UChar32)cp, (UChar32)cp);
				1260	}
				1261	return *this;
				1262	}
				1263
				1264	/**
				1265	* Adds all of the elements in the specified set to this set if
				1266	* they're not already present. This operation effectively
				1267	* modifies this set so that its value is the <i>union</i> of the two
				1268	* sets. The behavior of this operation is unspecified if the specified
				1269	* collection is modified while the operation is in progress.
				1270	*
				1271	* @param c set whose elements are to be added to this set.
				1272	* @see #add(char, char)
				1273	*/
				1274	UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
				1275	if ( c.len>0 && c.list!=NULL ) {
				1276	add(c.list, c.len, 0);
				1277	}
				1278
				1279	// Add strings in order
				1280	if ( c.strings!=NULL ) {
				1281	for (int32_t i=0; i<c.strings->size(); ++i) {
				1282	const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1283	if (!stringsContains(*s)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1284	_add(*s);
				1285	}
				1286	}
				1287	}
				1288	return *this;
				1289	}
				1290
				1291	/**
				1292	* Retains only the elements in this set that are contained in the
				1293	* specified set. In other words, removes from this set all of
				1294	* its elements that are not contained in the specified set. This
				1295	* operation effectively modifies this set so that its value is
				1296	* the <i>intersection</i> of the two sets.
				1297	*
				1298	* @param c set that defines which elements this set will retain.
				1299	*/
				1300	UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
				1301	if (isFrozen() \|\| isBogus()) {
				1302	return *this;
				1303	}
				1304	retain(c.list, c.len, 0);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1305	if (hasStrings()) {
				1306	if (!c.hasStrings()) {
				1307	strings->removeAllElements();
				1308	} else {
				1309	strings->retainAll(*c.strings);
				1310	}
				1311	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1312	return *this;
				1313	}
				1314
				1315	/**
				1316	* Removes from this set all of its elements that are contained in the
				1317	* specified set. This operation effectively modifies this
				1318	* set so that its value is the <i>asymmetric set difference</i> of
				1319	* the two sets.
				1320	*
				1321	* @param c set that defines which elements will be removed from
				1322	* this set.
				1323	*/
				1324	UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
				1325	if (isFrozen() \|\| isBogus()) {
				1326	return *this;
				1327	}
				1328	retain(c.list, c.len, 2);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1329	if (hasStrings() && c.hasStrings()) {
				1330	strings->removeAll(*c.strings);
				1331	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1332	return *this;
				1333	}
				1334
				1335	/**
				1336	* Complements in this set all elements contained in the specified
				1337	* set. Any character in the other set will be removed if it is
				1338	* in this set, or will be added if it is not in this set.
				1339	*
				1340	* @param c set that defines which elements will be xor'ed from
				1341	* this set.
				1342	*/
				1343	UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
				1344	if (isFrozen() \|\| isBogus()) {
				1345	return *this;
				1346	}
				1347	exclusiveOr(c.list, c.len, 0);
				1348
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1349	if (c.strings != nullptr) {
				1350	for (int32_t i=0; i<c.strings->size(); ++i) {
				1351	void* e = c.strings->elementAt(i);
				1352	if (strings == nullptr \|\| !strings->removeElement(e)) {
				1353	_add((const UnicodeString)e);
				1354	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1355	}
				1356	}
				1357	return *this;
				1358	}
				1359
				1360	/**
				1361	* Removes all of the elements from this set. This set will be
				1362	* empty after this call returns.
				1363	*/
				1364	UnicodeSet& UnicodeSet::clear(void) {
				1365	if (isFrozen()) {
				1366	return *this;
				1367	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1368	list[0] = UNICODESET_HIGH;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1369	len = 1;
				1370	releasePattern();
				1371	if (strings != NULL) {
				1372	strings->removeAllElements();
				1373	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1374	// Remove bogus
				1375	fFlags = 0;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1376	return *this;
				1377	}
				1378
				1379	/**
				1380	* Iteration method that returns the number of ranges contained in
				1381	* this set.
				1382	* @see #getRangeStart
				1383	* @see #getRangeEnd
				1384	*/
				1385	int32_t UnicodeSet::getRangeCount() const {
				1386	return len/2;
				1387	}
				1388
				1389	/**
				1390	* Iteration method that returns the first character in the
				1391	* specified range of this set.
				1392	* @see #getRangeCount
				1393	* @see #getRangeEnd
				1394	*/
				1395	UChar32 UnicodeSet::getRangeStart(int32_t index) const {
				1396	return list[index*2];
				1397	}
				1398
				1399	/**
				1400	* Iteration method that returns the last character in the
				1401	* specified range of this set.
				1402	* @see #getRangeStart
				1403	* @see #getRangeEnd
				1404	*/
				1405	UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
				1406	return list[index*2 + 1] - 1;
				1407	}
				1408
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1409	const UnicodeString* UnicodeSet::getString(int32_t index) const {
				1410	return (const UnicodeString*) strings->elementAt(index);
				1411	}
				1412
				1413	/**
				1414	* Reallocate this objects internal structures to take up the least
				1415	* possible space, without changing this object's value.
				1416	*/
				1417	UnicodeSet& UnicodeSet::compact() {
				1418	if (isFrozen() \|\| isBogus()) {
				1419	return *this;
				1420	}
				1421	// Delete buffer first to defragment memory less.
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1422	if (buffer != stackList) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1423	uprv_free(buffer);
				1424	buffer = NULL;
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1425	bufferCapacity = 0;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1426	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1427	if (list == stackList) {
				1428	// pass
				1429	} else if (len <= INITIAL_CAPACITY) {
				1430	uprv_memcpy(stackList, list, len * sizeof(UChar32));
				1431	uprv_free(list);
				1432	list = stackList;
				1433	capacity = INITIAL_CAPACITY;
				1434	} else if ((len + 7) < capacity) {
				1435	// If we have more than a little unused capacity, shrink it to len.
				1436	UChar32* temp = (UChar32) uprv_realloc(list, sizeof(UChar32) len);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1437	if (temp) {
				1438	list = temp;
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1439	capacity = len;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1440	}
				1441	// else what the heck happened?! We allocated less memory!
				1442	// Oh well. We'll keep our original array.
				1443	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1444	if (strings != nullptr && strings->isEmpty()) {
				1445	delete strings;
				1446	strings = nullptr;
				1447	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1448	return *this;
				1449	}
				1450
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1451	#ifdef DEBUG_SERIALIZE
				1452	#include <stdio.h>
				1453	#endif
				1454
				1455	/**
				1456	* Deserialize constructor.
				1457	*/
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1458	UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
				1459	UErrorCode &ec) {
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1460
				1461	if(U_FAILURE(ec)) {
				1462	setToBogus();
				1463	return;
				1464	}
				1465
				1466	if( (serialization != kSerialized)
				1467	\|\| (data==NULL)
				1468	\|\| (dataLen < 1)) {
				1469	ec = U_ILLEGAL_ARGUMENT_ERROR;
				1470	setToBogus();
				1471	return;
				1472	}
				1473
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1474	// bmp?
				1475	int32_t headerSize = ((data[0]&0x8000)) ?2:1;
				1476	int32_t bmpLength = (headerSize==1)?data[0]:data[1];
				1477
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1478	int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1479	#ifdef DEBUG_SERIALIZE
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1480	printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]);
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1481	#endif
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1482	if(!ensureCapacity(newLength + 1)) { // +1 for HIGH
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1483	return;
				1484	}
				1485	// copy bmp
				1486	int32_t i;
				1487	for(i = 0; i< bmpLength;i++) {
				1488	list[i] = data[i+headerSize];
				1489	#ifdef DEBUG_SERIALIZE
				1490	printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
				1491	#endif
				1492	}
				1493	// copy smp
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1494	for(i=bmpLength;i<newLength;i++) {
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1495	list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
				1496	((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
				1497	#ifdef DEBUG_SERIALIZE
				1498	printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
				1499	#endif
				1500	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1501	U_ASSERT(i == newLength);
				1502	if (i == 0 \|\| list[i - 1] != UNICODESET_HIGH) {
				1503	list[i++] = UNICODESET_HIGH;
				1504	}
				1505	len = i;
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1506	}
				1507
				1508
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1509	int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
				1510	int32_t bmpLength, length, destLength;
				1511
				1512	if (U_FAILURE(ec)) {
				1513	return 0;
				1514	}
				1515
				1516	if (destCapacity<0 \|\| (destCapacity>0 && dest==NULL)) {
				1517	ec=U_ILLEGAL_ARGUMENT_ERROR;
				1518	return 0;
				1519	}
				1520
				1521	/* count necessary 16-bit units */
				1522	length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
				1523	// assert(length>=0);
				1524	if (length==0) {
				1525	/* empty set */
				1526	if (destCapacity>0) {
				1527	*dest=0;
				1528	} else {
				1529	ec=U_BUFFER_OVERFLOW_ERROR;
				1530	}
				1531	return 1;
				1532	}
				1533	/* now length>0 */
				1534
				1535	if (this->list[length-1]<=0xffff) {
				1536	/* all BMP */
				1537	bmpLength=length;
				1538	} else if (this->list[0]>=0x10000) {
				1539	/* all supplementary */
				1540	bmpLength=0;
				1541	length*=2;
				1542	} else {
				1543	/* some BMP, some supplementary */
				1544	for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
				1545	length=bmpLength+2*(length-bmpLength);
				1546	}
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1547	#ifdef DEBUG_SERIALIZE
				1548	printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
				1549	#endif
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1550	/* length: number of 16-bit array units */
				1551	if (length>0x7fff) {
				1552	/* there are only 15 bits for the length in the first serialized word */
				1553	ec=U_INDEX_OUTOFBOUNDS_ERROR;
				1554	return 0;
				1555	}
				1556
				1557	/*
				1558	* total serialized length:
				1559	* number of 16-bit array units (length) +
				1560	* 1 length unit (always) +
				1561	* 1 bmpLength unit (if there are supplementary values)
				1562	*/
				1563	destLength=length+((length>bmpLength)?2:1);
				1564	if (destLength<=destCapacity) {
				1565	const UChar32 *p;
				1566	int32_t i;
				1567
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1568	#ifdef DEBUG_SERIALIZE
				1569	printf("writeHdr\n");
				1570	#endif
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1571	*dest=(uint16_t)length;
				1572	if (length>bmpLength) {
				1573	*dest\|=0x8000;
				1574	*++dest=(uint16_t)bmpLength;
				1575	}
				1576	++dest;
				1577
				1578	/* write the BMP part of the array */
				1579	p=this->list;
				1580	for (i=0; i<bmpLength; ++i) {
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1581	#ifdef DEBUG_SERIALIZE
				1582	printf("writebmp: %x\n", (int)*p);
				1583	#endif
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1584	dest++=(uint16_t)p++;
				1585	}
				1586
				1587	/* write the supplementary part of the array */
				1588	for (; i<length; i+=2) {
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	1589	#ifdef DEBUG_SERIALIZE
				1590	printf("write32: %x\n", (int)*p);
				1591	#endif
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1592	dest++=(uint16_t)(p>>16);
				1593	dest++=(uint16_t)p++;
				1594	}
				1595	} else {
				1596	ec=U_BUFFER_OVERFLOW_ERROR;
				1597	}
				1598	return destLength;
				1599	}
				1600
				1601	//----------------------------------------------------------------
				1602	// Implementation: Utility methods
				1603	//----------------------------------------------------------------
				1604
				1605	/**
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1606	* Allocate our strings vector and return true if successful.
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1607	*/
				1608	UBool UnicodeSet::allocateStrings(UErrorCode &status) {
				1609	if (U_FAILURE(status)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1610	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1611	}
				1612	strings = new UVector(uprv_deleteUObject,
				1613	uhash_compareUnicodeString, 1, status);
				1614	if (strings == NULL) { // Check for memory allocation error.
				1615	status = U_MEMORY_ALLOCATION_ERROR;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1616	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1617	}
				1618	if (U_FAILURE(status)) {
				1619	delete strings;
				1620	strings = NULL;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1621	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1622	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1623	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1624	}
				1625
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1626	int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
				1627	// Grow exponentially to reduce the frequency of allocations.
				1628	if (minCapacity < INITIAL_CAPACITY) {
				1629	return minCapacity + INITIAL_CAPACITY;
				1630	} else if (minCapacity <= 2500) {
				1631	return 5 * minCapacity;
				1632	} else {
				1633	int32_t newCapacity = 2 * minCapacity;
				1634	if (newCapacity > MAX_LENGTH) {
				1635	newCapacity = MAX_LENGTH;
				1636	}
				1637	return newCapacity;
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	1638	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1639	}
				1640
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1641	bool UnicodeSet::ensureCapacity(int32_t newLen) {
				1642	if (newLen > MAX_LENGTH) {
				1643	newLen = MAX_LENGTH;
				1644	}
				1645	if (newLen <= capacity) {
				1646	return true;
				1647	}
				1648	int32_t newCapacity = nextCapacity(newLen);
				1649	UChar32* temp = (UChar32) uprv_malloc(newCapacity sizeof(UChar32));
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1650	if (temp == NULL) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1651	setToBogus(); // set the object to bogus state if an OOM failure occurred.
				1652	return false;
				1653	}
				1654	// Copy only the actual contents.
				1655	uprv_memcpy(temp, list, len * sizeof(UChar32));
				1656	if (list != stackList) {
				1657	uprv_free(list);
				1658	}
				1659	list = temp;
				1660	capacity = newCapacity;
				1661	return true;
				1662	}
				1663
				1664	bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
				1665	if (newLen > MAX_LENGTH) {
				1666	newLen = MAX_LENGTH;
				1667	}
				1668	if (newLen <= bufferCapacity) {
				1669	return true;
				1670	}
				1671	int32_t newCapacity = nextCapacity(newLen);
				1672	UChar32* temp = (UChar32) uprv_malloc(newCapacity sizeof(UChar32));
				1673	if (temp == NULL) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1674	setToBogus();
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1675	return false;
				1676	}
				1677	// The buffer has no contents to be copied.
				1678	// It is always filled from scratch after this call.
				1679	if (buffer != stackList) {
				1680	uprv_free(buffer);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1681	}
				1682	buffer = temp;
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1683	bufferCapacity = newCapacity;
				1684	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1685	}
				1686
				1687	/**
				1688	* Swap list and buffer.
				1689	*/
				1690	void UnicodeSet::swapBuffers(void) {
				1691	// swap list and buffer
				1692	UChar32* temp = list;
				1693	list = buffer;
				1694	buffer = temp;
				1695
				1696	int32_t c = capacity;
				1697	capacity = bufferCapacity;
				1698	bufferCapacity = c;
				1699	}
				1700
				1701	void UnicodeSet::setToBogus() {
				1702	clear(); // Remove everything in the set.
				1703	fFlags = kIsBogus;
				1704	}
				1705
				1706	//----------------------------------------------------------------
				1707	// Implementation: Fundamental operators
				1708	//----------------------------------------------------------------
				1709
				1710	static inline UChar32 max(UChar32 a, UChar32 b) {
				1711	return (a > b) ? a : b;
				1712	}
				1713
				1714	// polarity = 0, 3 is normal: x xor y
				1715	// polarity = 1, 2: x xor ~y == x === y
				1716
				1717	void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
				1718	if (isFrozen() \|\| isBogus()) {
				1719	return;
				1720	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1721	if (!ensureBufferCapacity(len + otherLen)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1722	return;
				1723	}
				1724
				1725	int32_t i = 0, j = 0, k = 0;
				1726	UChar32 a = list[i++];
				1727	UChar32 b;
				1728	if (polarity == 1 \|\| polarity == 2) {
				1729	b = UNICODESET_LOW;
				1730	if (other[j] == UNICODESET_LOW) { // skip base if already LOW
				1731	++j;
				1732	b = other[j];
				1733	}
				1734	} else {
				1735	b = other[j++];
				1736	}
				1737	// simplest of all the routines
				1738	// sort the values, discarding identicals!
				1739	for (;;) {
				1740	if (a < b) {
				1741	buffer[k++] = a;
				1742	a = list[i++];
				1743	} else if (b < a) {
				1744	buffer[k++] = b;
				1745	b = other[j++];
				1746	} else if (a != UNICODESET_HIGH) { // at this point, a == b
				1747	// discard both values!
				1748	a = list[i++];
				1749	b = other[j++];
				1750	} else { // DONE!
				1751	buffer[k++] = UNICODESET_HIGH;
				1752	len = k;
				1753	break;
				1754	}
				1755	}
				1756	swapBuffers();
				1757	releasePattern();
				1758	}
				1759
				1760	// polarity = 0 is normal: x union y
				1761	// polarity = 2: x union ~y
				1762	// polarity = 1: ~x union y
				1763	// polarity = 3: ~x union ~y
				1764
				1765	void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
				1766	if (isFrozen() \|\| isBogus() \|\| other==NULL) {
				1767	return;
				1768	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1769	if (!ensureBufferCapacity(len + otherLen)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1770	return;
				1771	}
				1772
				1773	int32_t i = 0, j = 0, k = 0;
				1774	UChar32 a = list[i++];
				1775	UChar32 b = other[j++];
				1776	// change from xor is that we have to check overlapping pairs
				1777	// polarity bit 1 means a is second, bit 2 means b is.
				1778	for (;;) {
				1779	switch (polarity) {
				1780	case 0: // both first; take lower if unequal
				1781	if (a < b) { // take a
				1782	// Back up over overlapping ranges in buffer[]
				1783	if (k > 0 && a <= buffer[k-1]) {
				1784	// Pick latter end value in buffer[] vs. list[]
				1785	a = max(list[i], buffer[--k]);
				1786	} else {
				1787	// No overlap
				1788	buffer[k++] = a;
				1789	a = list[i];
				1790	}
				1791	i++; // Common if/else code factored out
				1792	polarity ^= 1;
				1793	} else if (b < a) { // take b
				1794	if (k > 0 && b <= buffer[k-1]) {
				1795	b = max(other[j], buffer[--k]);
				1796	} else {
				1797	buffer[k++] = b;
				1798	b = other[j];
				1799	}
				1800	j++;
				1801	polarity ^= 2;
				1802	} else { // a == b, take a, drop b
				1803	if (a == UNICODESET_HIGH) goto loop_end;
				1804	// This is symmetrical; it doesn't matter if
				1805	// we backtrack with a or b. - liu
				1806	if (k > 0 && a <= buffer[k-1]) {
				1807	a = max(list[i], buffer[--k]);
				1808	} else {
				1809	// No overlap
				1810	buffer[k++] = a;
				1811	a = list[i];
				1812	}
				1813	i++;
				1814	polarity ^= 1;
				1815	b = other[j++];
				1816	polarity ^= 2;
				1817	}
				1818	break;
				1819	case 3: // both second; take higher if unequal, and drop other
				1820	if (b <= a) { // take a
				1821	if (a == UNICODESET_HIGH) goto loop_end;
				1822	buffer[k++] = a;
				1823	} else { // take b
				1824	if (b == UNICODESET_HIGH) goto loop_end;
				1825	buffer[k++] = b;
				1826	}
				1827	a = list[i++];
				1828	polarity ^= 1; // factored common code
				1829	b = other[j++];
				1830	polarity ^= 2;
				1831	break;
				1832	case 1: // a second, b first; if b < a, overlap
				1833	if (a < b) { // no overlap, take a
				1834	buffer[k++] = a; a = list[i++]; polarity ^= 1;
				1835	} else if (b < a) { // OVERLAP, drop b
				1836	b = other[j++];
				1837	polarity ^= 2;
				1838	} else { // a == b, drop both!
				1839	if (a == UNICODESET_HIGH) goto loop_end;
				1840	a = list[i++];
				1841	polarity ^= 1;
				1842	b = other[j++];
				1843	polarity ^= 2;
				1844	}
				1845	break;
				1846	case 2: // a first, b second; if a < b, overlap
				1847	if (b < a) { // no overlap, take b
				1848	buffer[k++] = b;
				1849	b = other[j++];
				1850	polarity ^= 2;
				1851	} else if (a < b) { // OVERLAP, drop a
				1852	a = list[i++];
				1853	polarity ^= 1;
				1854	} else { // a == b, drop both!
				1855	if (a == UNICODESET_HIGH) goto loop_end;
				1856	a = list[i++];
				1857	polarity ^= 1;
				1858	b = other[j++];
				1859	polarity ^= 2;
				1860	}
				1861	break;
				1862	}
				1863	}
				1864	loop_end:
				1865	buffer[k++] = UNICODESET_HIGH; // terminate
				1866	len = k;
				1867	swapBuffers();
				1868	releasePattern();
				1869	}
				1870
				1871	// polarity = 0 is normal: x intersect y
				1872	// polarity = 2: x intersect ~y == set-minus
				1873	// polarity = 1: ~x intersect y
				1874	// polarity = 3: ~x intersect ~y
				1875
				1876	void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
				1877	if (isFrozen() \|\| isBogus()) {
				1878	return;
				1879	}
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	1880	if (!ensureBufferCapacity(len + otherLen)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1881	return;
				1882	}
				1883
				1884	int32_t i = 0, j = 0, k = 0;
				1885	UChar32 a = list[i++];
				1886	UChar32 b = other[j++];
				1887	// change from xor is that we have to check overlapping pairs
				1888	// polarity bit 1 means a is second, bit 2 means b is.
				1889	for (;;) {
				1890	switch (polarity) {
				1891	case 0: // both first; drop the smaller
				1892	if (a < b) { // drop a
				1893	a = list[i++];
				1894	polarity ^= 1;
				1895	} else if (b < a) { // drop b
				1896	b = other[j++];
				1897	polarity ^= 2;
				1898	} else { // a == b, take one, drop other
				1899	if (a == UNICODESET_HIGH) goto loop_end;
				1900	buffer[k++] = a;
				1901	a = list[i++];
				1902	polarity ^= 1;
				1903	b = other[j++];
				1904	polarity ^= 2;
				1905	}
				1906	break;
				1907	case 3: // both second; take lower if unequal
				1908	if (a < b) { // take a
				1909	buffer[k++] = a;
				1910	a = list[i++];
				1911	polarity ^= 1;
				1912	} else if (b < a) { // take b
				1913	buffer[k++] = b;
				1914	b = other[j++];
				1915	polarity ^= 2;
				1916	} else { // a == b, take one, drop other
				1917	if (a == UNICODESET_HIGH) goto loop_end;
				1918	buffer[k++] = a;
				1919	a = list[i++];
				1920	polarity ^= 1;
				1921	b = other[j++];
				1922	polarity ^= 2;
				1923	}
				1924	break;
				1925	case 1: // a second, b first;
				1926	if (a < b) { // NO OVERLAP, drop a
				1927	a = list[i++];
				1928	polarity ^= 1;
				1929	} else if (b < a) { // OVERLAP, take b
				1930	buffer[k++] = b;
				1931	b = other[j++];
				1932	polarity ^= 2;
				1933	} else { // a == b, drop both!
				1934	if (a == UNICODESET_HIGH) goto loop_end;
				1935	a = list[i++];
				1936	polarity ^= 1;
				1937	b = other[j++];
				1938	polarity ^= 2;
				1939	}
				1940	break;
				1941	case 2: // a first, b second; if a < b, overlap
				1942	if (b < a) { // no overlap, drop b
				1943	b = other[j++];
				1944	polarity ^= 2;
				1945	} else if (a < b) { // OVERLAP, take a
				1946	buffer[k++] = a;
				1947	a = list[i++];
				1948	polarity ^= 1;
				1949	} else { // a == b, drop both!
				1950	if (a == UNICODESET_HIGH) goto loop_end;
				1951	a = list[i++];
				1952	polarity ^= 1;
				1953	b = other[j++];
				1954	polarity ^= 2;
				1955	}
				1956	break;
				1957	}
				1958	}
				1959	loop_end:
				1960	buffer[k++] = UNICODESET_HIGH; // terminate
				1961	len = k;
				1962	swapBuffers();
				1963	releasePattern();
				1964	}
				1965
				1966	/**
				1967	* Append the <code>toPattern()</code> representation of a
				1968	* string to the given <code>StringBuffer</code>.
				1969	*/
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1970	void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1971	UChar32 cp;
				1972	for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
				1973	_appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
				1974	}
				1975	}
				1976
				1977	/**
				1978	* Append the <code>toPattern()</code> representation of a
				1979	* character to the given <code>StringBuffer</code>.
				1980	*/
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1981	void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) {
				1982	if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1983	// Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
				1984	// unprintable
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	1985	ICU_Utility::escape(buf, c);
				1986	return;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1987	}
				1988	// Okay to let ':' pass through
				1989	switch (c) {
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	1990	case u'[':
				1991	case u']':
				1992	case u'-':
				1993	case u'^':
				1994	case u'&':
				1995	case u'\\':
				1996	case u'{':
				1997	case u'}':
				1998	case u':':
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1999	case SymbolTable::SYMBOL_REF:
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	2000	buf.append(u'\\');
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2001	break;
				2002	default:
				2003	// Escape whitespace
				2004	if (PatternProps::isWhiteSpace(c)) {
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	2005	buf.append(u'\\');
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2006	}
				2007	break;
				2008	}
				2009	buf.append(c);
				2010	}
				2011
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2012	void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
				2013	UBool escapeUnprintable) {
				2014	_appendToPat(result, start, escapeUnprintable);
				2015	if (start != end) {
				2016	if ((start+1) != end \|\|
				2017	// Avoid writing what looks like a lead+trail surrogate pair.
				2018	start == 0xdbff) {
				2019	result.append(u'-');
				2020	}
				2021	_appendToPat(result, end, escapeUnprintable);
				2022	}
				2023	}
				2024
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2025	/**
				2026	* Append a string representation of this set to result. This will be
				2027	* a cleaned version of the string passed to applyPattern(), if there
				2028	* is one. Otherwise it will be generated.
				2029	*/
				2030	UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
				2031	UBool escapeUnprintable) const
				2032	{
				2033	if (pat != NULL) {
				2034	int32_t i;
				2035	int32_t backslashCount = 0;
				2036	for (i=0; i<patLen; ) {
				2037	UChar32 c;
				2038	U16_NEXT(pat, i, patLen, c);
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2039	if (escapeUnprintable ?
				2040	ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2041	// If the unprintable character is preceded by an odd
				2042	// number of backslashes, then it has been escaped.
				2043	// Before unescaping it, we delete the final
				2044	// backslash.
				2045	if ((backslashCount % 2) == 1) {
				2046	result.truncate(result.length() - 1);
				2047	}
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2048	ICU_Utility::escape(result, c);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2049	backslashCount = 0;
				2050	} else {
				2051	result.append(c);
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	2052	if (c == u'\\') {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2053	++backslashCount;
				2054	} else {
				2055	backslashCount = 0;
				2056	}
				2057	}
				2058	}
				2059	return result;
				2060	}
				2061
				2062	return _generatePattern(result, escapeUnprintable);
				2063	}
				2064
				2065	/**
				2066	* Returns a string representation of this set. If the result of
				2067	* calling this function is passed to a UnicodeSet constructor, it
				2068	* will produce another set that is equal to this one.
				2069	*/
				2070	UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
				2071	UBool escapeUnprintable) const
				2072	{
				2073	result.truncate(0);
				2074	return _toPattern(result, escapeUnprintable);
				2075	}
				2076
				2077	/**
				2078	* Generate and append a string representation of this set to result.
				2079	* This does not use this.pat, the cleaned up copy of the string
				2080	* passed to applyPattern().
				2081	*/
				2082	UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
				2083	UBool escapeUnprintable) const
				2084	{
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	2085	result.append(u'[');
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2086
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2087	int32_t i = 0;
				2088	int32_t limit = len & ~1; // = 2 * getRangeCount()
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2089
				2090	// If the set contains at least 2 intervals and includes both
				2091	// MIN_VALUE and MAX_VALUE, then the inverse representation will
				2092	// be more economical.
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2093	// if (getRangeCount() >= 2 &&
				2094	// getRangeStart(0) == MIN_VALUE &&
				2095	// getRangeEnd(last) == MAX_VALUE)
				2096	// Invariant: list[len-1] == HIGH == MAX_VALUE + 1
				2097	// If limit == len then len is even and the last range ends with MAX_VALUE.
				2098	//
				2099	// But do not write the inverse (complement) if there are strings.
				2100	// Since ICU 70, the '^' performs a code point complement which removes all strings.
				2101	if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2102	// Emit the inverse
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	2103	result.append(u'^');
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2104	// Offsetting the inversion list index by one lets us
				2105	// iterate over the ranges of the set complement.
				2106	i = 1;
				2107	--limit;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2108	}
				2109
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	2110	// Emit the ranges as pairs.
				2111	while (i < limit) {
				2112	UChar32 start = list[i]; // getRangeStart()
				2113	UChar32 end = list[i + 1] - 1; // getRangeEnd() = range limit minus one
				2114	if (!(0xd800 <= end && end <= 0xdbff)) {
				2115	_appendToPat(result, start, end, escapeUnprintable);
				2116	i += 2;
				2117	} else {
				2118	// The range ends with a lead surrogate.
				2119	// Avoid writing what looks like a lead+trail surrogate pair.
				2120	// 1. Postpone ranges that start with a lead surrogate code point.
				2121	int32_t firstLead = i;
				2122	while ((i += 2) < limit && list[i] <= 0xdbff) {}
				2123	int32_t firstAfterLead = i;
				2124	// 2. Write following ranges that start with a trail surrogate code point.
				2125	while (i < limit && (start = list[i]) <= 0xdfff) {
				2126	_appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
				2127	i += 2;
				2128	}
				2129	// 3. Now write the postponed ranges.
				2130	for (int j = firstLead; j < firstAfterLead; j += 2) {
				2131	_appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2132	}
				2133	}
				2134	}
				2135
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	2136	if (strings != nullptr) {
				2137	for (int32_t i = 0; i<strings->size(); ++i) {
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	2138	result.append(u'{');
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	2139	_appendToPat(result,
				2140	(const UnicodeString) strings->elementAt(i),
				2141	escapeUnprintable);
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	2142	result.append(u'}');
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	2143	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2144	}
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	2145	return result.append(u']');
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2146	}
				2147
				2148	/**
				2149	* Release existing cached pattern
				2150	*/
				2151	void UnicodeSet::releasePattern() {
				2152	if (pat) {
				2153	uprv_free(pat);
				2154	pat = NULL;
				2155	patLen = 0;
				2156	}
				2157	}
				2158
				2159	/**
				2160	* Set the new pattern to cache.
				2161	*/
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	2162	void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2163	releasePattern();
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2164	pat = (UChar )uprv_malloc((newPatLen + 1) sizeof(UChar));
				2165	if (pat) {
				2166	patLen = newPatLen;
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	2167	u_memcpy(pat, newPat, patLen);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2168	pat[patLen] = 0;
				2169	}
				2170	// else we don't care if malloc failed. This was just a nice cache.
				2171	// We can regenerate an equivalent pattern later when requested.
				2172	}
				2173
Frank Tang	b869661	2019-10-25 14:58:21 -0700	[diff] [blame]	2174	UnicodeSet *UnicodeSet::freeze() {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2175	if(!isFrozen() && !isBogus()) {
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	2176	compact();
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2177
				2178	// Optimize contains() and span() and similar functions.
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	2179	if (hasStrings()) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2180	stringSpan = new UnicodeSetStringSpan(this, strings, UnicodeSetStringSpan::ALL);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	2181	if (stringSpan == nullptr) {
				2182	setToBogus();
				2183	return this;
				2184	} else if (!stringSpan->needsStringSpanUTF16()) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2185	// All strings are irrelevant for span() etc. because
				2186	// all of each string's code points are contained in this set.
				2187	// Do not check needsStringSpanUTF8() because UTF-8 has at most as
				2188	// many relevant strings as UTF-16.
				2189	// (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
				2190	delete stringSpan;
				2191	stringSpan = NULL;
				2192	}
				2193	}
				2194	if (stringSpan == NULL) {
				2195	// No span-relevant strings: Optimize for code point spans.
				2196	bmpSet=new BMPSet(list, len);
				2197	if (bmpSet == NULL) { // Check for memory allocation error.
				2198	setToBogus();
				2199	}
				2200	}
				2201	}
				2202	return this;
				2203	}
				2204
				2205	int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
				2206	if(length>0 && bmpSet!=NULL) {
				2207	return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
				2208	}
				2209	if(length<0) {
				2210	length=u_strlen(s);
				2211	}
				2212	if(length==0) {
				2213	return 0;
				2214	}
				2215	if(stringSpan!=NULL) {
				2216	return stringSpan->span(s, length, spanCondition);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	2217	} else if(hasStrings()) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2218	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
				2219	UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
				2220	UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
				2221	UnicodeSetStringSpan strSpan(this, strings, which);
				2222	if(strSpan.needsStringSpanUTF16()) {
				2223	return strSpan.span(s, length, spanCondition);
				2224	}
				2225	}
				2226
				2227	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
				2228	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
				2229	}
				2230
				2231	UChar32 c;
				2232	int32_t start=0, prev=0;
				2233	do {
				2234	U16_NEXT(s, start, length, c);
				2235	if(spanCondition!=contains(c)) {
				2236	break;
				2237	}
				2238	} while((prev=start)<length);
				2239	return prev;
				2240	}
				2241
				2242	int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
				2243	if(length>0 && bmpSet!=NULL) {
				2244	return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
				2245	}
				2246	if(length<0) {
				2247	length=u_strlen(s);
				2248	}
				2249	if(length==0) {
				2250	return 0;
				2251	}
				2252	if(stringSpan!=NULL) {
				2253	return stringSpan->spanBack(s, length, spanCondition);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	2254	} else if(hasStrings()) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2255	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
				2256	UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
				2257	UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
				2258	UnicodeSetStringSpan strSpan(this, strings, which);
				2259	if(strSpan.needsStringSpanUTF16()) {
				2260	return strSpan.spanBack(s, length, spanCondition);
				2261	}
				2262	}
				2263
				2264	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
				2265	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
				2266	}
				2267
				2268	UChar32 c;
				2269	int32_t prev=length;
				2270	do {
				2271	U16_PREV(s, 0, length, c);
				2272	if(spanCondition!=contains(c)) {
				2273	break;
				2274	}
				2275	} while((prev=length)>0);
				2276	return prev;
				2277	}
				2278
				2279	int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
				2280	if(length>0 && bmpSet!=NULL) {
				2281	const uint8_t s0=(const uint8_t )s;
				2282	return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
				2283	}
				2284	if(length<0) {
				2285	length=(int32_t)uprv_strlen(s);
				2286	}
				2287	if(length==0) {
				2288	return 0;
				2289	}
				2290	if(stringSpan!=NULL) {
				2291	return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	2292	} else if(hasStrings()) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2293	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
				2294	UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
				2295	UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
				2296	UnicodeSetStringSpan strSpan(this, strings, which);
				2297	if(strSpan.needsStringSpanUTF8()) {
				2298	return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
				2299	}
				2300	}
				2301
				2302	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
				2303	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
				2304	}
				2305
				2306	UChar32 c;
				2307	int32_t start=0, prev=0;
				2308	do {
				2309	U8_NEXT_OR_FFFD(s, start, length, c);
				2310	if(spanCondition!=contains(c)) {
				2311	break;
				2312	}
				2313	} while((prev=start)<length);
				2314	return prev;
				2315	}
				2316
				2317	int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
				2318	if(length>0 && bmpSet!=NULL) {
				2319	const uint8_t s0=(const uint8_t )s;
				2320	return bmpSet->spanBackUTF8(s0, length, spanCondition);
				2321	}
				2322	if(length<0) {
				2323	length=(int32_t)uprv_strlen(s);
				2324	}
				2325	if(length==0) {
				2326	return 0;
				2327	}
				2328	if(stringSpan!=NULL) {
				2329	return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
Jungshik Shin	d13a96f	2018-11-14 09:22:09 -0800	[diff] [blame]	2330	} else if(hasStrings()) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2331	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
				2332	UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
				2333	UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
				2334	UnicodeSetStringSpan strSpan(this, strings, which);
				2335	if(strSpan.needsStringSpanUTF8()) {
				2336	return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
				2337	}
				2338	}
				2339
				2340	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
				2341	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
				2342	}
				2343
				2344	UChar32 c;
				2345	int32_t prev=length;
				2346	do {
				2347	U8_PREV_OR_FFFD(s, 0, length, c);
				2348	if(spanCondition!=contains(c)) {
				2349	break;
				2350	}
				2351	} while((prev=length)>0);
				2352	return prev;
				2353	}
				2354
				2355	U_NAMESPACE_END