Blame - source/common/uniset_props.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 6cfd80a705b8fcb4911b94ab29c44d6c315e7653 [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	3	/*
				4	*******************************************************************************
				5	*
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	6	* Copyright (C) 1999-2014, International Business Machines
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	7	* Corporation and others. All Rights Reserved.
				8	*
				9	*******************************************************************************
				10	* file name: uniset_props.cpp
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	11	* encoding: UTF-8
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	12	* tab size: 8 (not used)
				13	* indentation:4
				14	*
				15	* created on: 2004aug25
				16	* created by: Markus W. Scherer
				17	*
				18	* Character property dependent functions moved here from uniset.cpp
				19	*/
				20
				21	#include "unicode/utypes.h"
				22	#include "unicode/uniset.h"
				23	#include "unicode/parsepos.h"
				24	#include "unicode/uchar.h"
				25	#include "unicode/uscript.h"
				26	#include "unicode/symtable.h"
				27	#include "unicode/uset.h"
				28	#include "unicode/locid.h"
				29	#include "unicode/brkiter.h"
				30	#include "uset_imp.h"
				31	#include "ruleiter.h"
				32	#include "cmemory.h"
				33	#include "ucln_cmn.h"
				34	#include "util.h"
				35	#include "uvector.h"
				36	#include "uprops.h"
				37	#include "propname.h"
				38	#include "normalizer2impl.h"
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	39	#include "uinvchar.h"
				40	#include "uprops.h"
				41	#include "charstr.h"
				42	#include "cstring.h"
				43	#include "mutex.h"
				44	#include "umutex.h"
				45	#include "uassert.h"
				46	#include "hash.h"
				47
				48	U_NAMESPACE_USE
				49
// initial storage. Must be >= 0
// * same as in uniset.cpp ! *
#define START_EXTRA 16

// Define UChar constants using hex for EBCDIC compatibility
// Used #define to reduce private static exports and memory access time.
// The trailing comment on each line shows the character the code unit stands for.
#define SET_OPEN        ((UChar)0x005B) /*[*/
#define SET_CLOSE       ((UChar)0x005D) /*]*/
#define HYPHEN          ((UChar)0x002D) /*-*/
#define COMPLEMENT      ((UChar)0x005E) /*^*/
#define COLON           ((UChar)0x003A) /*:*/
#define BACKSLASH       ((UChar)0x005C) /*\*/
#define INTERSECTION    ((UChar)0x0026) /*&*/
#define UPPER_U         ((UChar)0x0055) /*U*/
#define LOWER_U         ((UChar)0x0075) /*u*/
#define OPEN_BRACE      ((UChar)123)    /*{*/
#define CLOSE_BRACE     ((UChar)125)    /*}*/
#define UPPER_P         ((UChar)0x0050) /*P*/
#define LOWER_P         ((UChar)0x0070) /*p*/
#define UPPER_N         ((UChar)78)     /*N*/
#define EQUALS          ((UChar)0x003D) /*=*/
				71
				72	//static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:"
				73	static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]"
				74	//static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p"
				75	//static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}"
				76	//static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N"
				77	static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /-]/
				78
				79	// Special property set IDs
				80	static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
				81	static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
				82	static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
				83
				84	// Unicode name property alias
				85	#define NAME_PROP "na"
				86	#define NAME_PROP_LENGTH 2
				87
				88	/**
				89	* Delimiter string used in patterns to close a category reference:
				90	* ":]". Example: "[:Lu:]".
				91	*/
				92	//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
				93
				94	// Cached sets ------------------------------------------------------------- ***
				95
				96	U_CDECL_BEGIN
				97	static UBool U_CALLCONV uset_cleanup();
				98
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	99	static UnicodeSet *uni32Singleton;
				100	static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER;
				101
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	102	/**
				103	* Cleanup function for UnicodeSet
				104	*/
				105	static UBool U_CALLCONV uset_cleanup(void) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	106	delete uni32Singleton;
				107	uni32Singleton = NULL;
				108	uni32InitOnce.reset();
				109	return TRUE;
				110	}
				111
				112	U_CDECL_END
				113
				114	U_NAMESPACE_BEGIN
				115
Jungshik Shin	aff99f5	2018-04-11 17:29:08 -0700	[diff] [blame]	116	namespace {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	117
				118	// Cache some sets for other services -------------------------------------- ***
				119	void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
				120	U_ASSERT(uni32Singleton == NULL);
				121	uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
				122	if(uni32Singleton==NULL) {
				123	errorCode=U_MEMORY_ALLOCATION_ERROR;
				124	} else {
				125	uni32Singleton->freeze();
				126	}
				127	ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
				128	}
				129
				130
				131	U_CFUNC UnicodeSet *
				132	uniset_getUnicode32Instance(UErrorCode &errorCode) {
				133	umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
				134	return uni32Singleton;
				135	}
				136
				137	// helper functions for matching of pattern syntax pieces ------------------ ***
				138	// these functions are parallel to the PERL_OPEN etc. strings above
				139
				140	// using these functions is not only faster than UnicodeString::compare() and
				141	// caseCompare(), but they also make UnicodeSet work for simple patterns when
				142	// no Unicode properties data is available - when caseCompare() fails
				143
				144	static inline UBool
				145	isPerlOpen(const UnicodeString &pattern, int32_t pos) {
				146	UChar c;
				147	return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P \|\| c==UPPER_P);
				148	}
				149
				150	/*static inline UBool
				151	isPerlClose(const UnicodeString &pattern, int32_t pos) {
				152	return pattern.charAt(pos)==CLOSE_BRACE;
				153	}*/
				154
				155	static inline UBool
				156	isNameOpen(const UnicodeString &pattern, int32_t pos) {
				157	return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
				158	}
				159
				160	static inline UBool
				161	isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
				162	return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
				163	}
				164
				165	/*static inline UBool
				166	isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
				167	return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
				168	}*/
				169
				170	// TODO memory debugging provided inside uniset.cpp
				171	// could be made available here but probably obsolete with use of modern
				172	// memory leak checker tools
				173	#define _dbgct(me)
				174
Jungshik Shin	aff99f5	2018-04-11 17:29:08 -0700	[diff] [blame]	175	} // namespace
				176
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	177	//----------------------------------------------------------------
				178	// Constructors &c
				179	//----------------------------------------------------------------
				180
				181	/**
				182	* Constructs a set from the given pattern, optionally ignoring
				183	* white space. See the class description for the syntax of the
				184	* pattern language.
				185	* @param pattern a string specifying what characters are in the set
				186	*/
				187	UnicodeSet::UnicodeSet(const UnicodeString& pattern,
				188	UErrorCode& status) :
				189	len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
				190	bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
				191	fFlags(0)
				192	{
				193	if(U_SUCCESS(status)){
				194	list = (UChar32) uprv_malloc(sizeof(UChar32) capacity);
				195	/* test for NULL */
				196	if(list == NULL) {
				197	status = U_MEMORY_ALLOCATION_ERROR;
				198	}else{
				199	allocateStrings(status);
				200	applyPattern(pattern, status);
				201	}
				202	}
				203	_dbgct(this);
				204	}
				205
				206	//----------------------------------------------------------------
				207	// Public API
				208	//----------------------------------------------------------------
				209
				210	UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
				211	UErrorCode& status) {
				212	// Equivalent to
				213	// return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
				214	// but without dependency on closeOver().
				215	ParsePosition pos(0);
				216	applyPatternIgnoreSpace(pattern, pos, NULL, status);
				217	if (U_FAILURE(status)) return *this;
				218
				219	int32_t i = pos.getIndex();
				220	// Skip over trailing whitespace
				221	ICU_Utility::skipWhitespace(pattern, i, TRUE);
				222	if (i != pattern.length()) {
				223	status = U_ILLEGAL_ARGUMENT_ERROR;
				224	}
				225	return *this;
				226	}
				227
				228	void
				229	UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
				230	ParsePosition& pos,
				231	const SymbolTable* symbols,
				232	UErrorCode& status) {
				233	if (U_FAILURE(status)) {
				234	return;
				235	}
				236	if (isFrozen()) {
				237	status = U_NO_WRITE_PERMISSION;
				238	return;
				239	}
				240	// Need to build the pattern in a temporary string because
				241	// _applyPattern calls add() etc., which set pat to empty.
				242	UnicodeString rebuiltPat;
				243	RuleCharacterIterator chars(pattern, symbols, pos);
Jungshik Shin	aff99f5	2018-04-11 17:29:08 -0700	[diff] [blame]	244	applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, 0, status);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	245	if (U_FAILURE(status)) return;
				246	if (chars.inVariable()) {
				247	// syntaxError(chars, "Extra chars in variable value");
				248	status = U_MALFORMED_SET;
				249	return;
				250	}
				251	setPattern(rebuiltPat);
				252	}
				253
				254	/**
				255	* Return true if the given position, in the given pattern, appears
				256	* to be the start of a UnicodeSet pattern.
				257	*/
				258	UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
				259	return ((pos+1) < pattern.length() &&
				260	pattern.charAt(pos) == (UChar)91/[/) \|\|
				261	resemblesPropertyPattern(pattern, pos);
				262	}
				263
				264	//----------------------------------------------------------------
				265	// Implementation: Pattern parsing
				266	//----------------------------------------------------------------
				267
Jungshik Shin	aff99f5	2018-04-11 17:29:08 -0700	[diff] [blame]	268	namespace {
				269
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	270	/**
				271	* A small all-inline class to manage a UnicodeSet pointer. Add
				272	* operator->() etc. as needed.
				273	*/
				274	class UnicodeSetPointer {
				275	UnicodeSet* p;
				276	public:
				277	inline UnicodeSetPointer() : p(0) {}
				278	inline ~UnicodeSetPointer() { delete p; }
				279	inline UnicodeSet* pointer() { return p; }
				280	inline UBool allocate() {
				281	if (p == 0) {
				282	p = new UnicodeSet();
				283	}
				284	return p != 0;
				285	}
				286	};
				287
Jungshik Shin	aff99f5	2018-04-11 17:29:08 -0700	[diff] [blame]	288	constexpr int32_t MAX_DEPTH = 100;
				289
				290	} // namespace
				291
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	292	/**
				293	* Parse the pattern from the given RuleCharacterIterator. The
				294	* iterator is advanced over the parsed pattern.
				295	* @param chars iterator over the pattern characters. Upon return
				296	* it will be advanced to the first character after the parsed
				297	* pattern, or the end of the iteration if all characters are
				298	* parsed.
				299	* @param symbols symbol table to use to parse and dereference
				300	* variables, or null if none.
				301	* @param rebuiltPat the pattern that was parsed, rebuilt or
				302	* copied from the input pattern, as appropriate.
				303	* @param options a bit mask of zero or more of the following:
				304	* IGNORE_SPACE, CASE.
				305	*/
				306	void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
				307	const SymbolTable* symbols,
				308	UnicodeString& rebuiltPat,
				309	uint32_t options,
				310	UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
Jungshik Shin	aff99f5	2018-04-11 17:29:08 -0700	[diff] [blame]	311	int32_t depth,
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	312	UErrorCode& ec) {
				313	if (U_FAILURE(ec)) return;
Jungshik Shin	aff99f5	2018-04-11 17:29:08 -0700	[diff] [blame]	314	if (depth > MAX_DEPTH) {
				315	ec = U_ILLEGAL_ARGUMENT_ERROR;
				316	return;
				317	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	318
				319	// Syntax characters: [ ] ^ - & { }
				320
				321	// Recognized special forms for chars, sets: c-c s-s s&s
				322
				323	int32_t opts = RuleCharacterIterator::PARSE_VARIABLES \|
				324	RuleCharacterIterator::PARSE_ESCAPES;
				325	if ((options & USET_IGNORE_SPACE) != 0) {
				326	opts \|= RuleCharacterIterator::SKIP_WHITESPACE;
				327	}
				328
				329	UnicodeString patLocal, buf;
				330	UBool usePat = FALSE;
				331	UnicodeSetPointer scratch;
				332	RuleCharacterIterator::Pos backup;
				333
				334	// mode: 0=before [, 1=between [...], 2=after ]
				335	// lastItem: 0=none, 1=char, 2=set
				336	int8_t lastItem = 0, mode = 0;
				337	UChar32 lastChar = 0;
				338	UChar op = 0;
				339
				340	UBool invert = FALSE;
				341
				342	clear();
				343
				344	while (mode != 2 && !chars.atEnd()) {
				345	U_ASSERT((lastItem == 0 && op == 0) \|\|
				346	(lastItem == 1 && (op == 0 \|\| op == HYPHEN /'-'/)) \|\|
				347	(lastItem == 2 && (op == 0 \|\| op == HYPHEN /'-'/ \|\|
				348	op == INTERSECTION /'&'/)));
				349
				350	UChar32 c = 0;
				351	UBool literal = FALSE;
				352	UnicodeSet* nested = 0; // alias - do not delete
				353
				354	// -------- Check for property pattern
				355
				356	// setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
				357	int8_t setMode = 0;
				358	if (resemblesPropertyPattern(chars, opts)) {
				359	setMode = 2;
				360	}
				361
				362	// -------- Parse '[' of opening delimiter OR nested set.
				363	// If there is a nested set, use `setMode' to define how
				364	// the set should be parsed. If the '[' is part of the
				365	// opening delimiter for this pattern, parse special
				366	// strings "[", "[^", "[-", and "[^-". Check for stand-in
				367	// characters representing a nested set in the symbol
				368	// table.
				369
				370	else {
				371	// Prepare to backup if necessary
				372	chars.getPos(backup);
				373	c = chars.next(opts, literal, ec);
				374	if (U_FAILURE(ec)) return;
				375
				376	if (c == 0x5B /'['/ && !literal) {
				377	if (mode == 1) {
				378	chars.setPos(backup); // backup
				379	setMode = 1;
				380	} else {
				381	// Handle opening '[' delimiter
				382	mode = 1;
				383	patLocal.append((UChar) 0x5B /'['/);
				384	chars.getPos(backup); // prepare to backup
				385	c = chars.next(opts, literal, ec);
				386	if (U_FAILURE(ec)) return;
				387	if (c == 0x5E /'^'/ && !literal) {
				388	invert = TRUE;
				389	patLocal.append((UChar) 0x5E /'^'/);
				390	chars.getPos(backup); // prepare to backup
				391	c = chars.next(opts, literal, ec);
				392	if (U_FAILURE(ec)) return;
				393	}
				394	// Fall through to handle special leading '-';
				395	// otherwise restart loop for nested [], \p{}, etc.
				396	if (c == HYPHEN /'-'/) {
				397	literal = TRUE;
				398	// Fall through to handle literal '-' below
				399	} else {
				400	chars.setPos(backup); // backup
				401	continue;
				402	}
				403	}
				404	} else if (symbols != 0) {
				405	const UnicodeFunctor *m = symbols->lookupMatcher(c);
				406	if (m != 0) {
				407	const UnicodeSet ms = dynamic_cast<const UnicodeSet >(m);
				408	if (ms == NULL) {
				409	ec = U_MALFORMED_SET;
				410	return;
				411	}
				412	// casting away const, but `nested' won't be modified
				413	// (important not to modify stored set)
				414	nested = const_cast<UnicodeSet*>(ms);
				415	setMode = 3;
				416	}
				417	}
				418	}
				419
				420	// -------- Handle a nested set. This either is inline in
				421	// the pattern or represented by a stand-in that has
				422	// previously been parsed and was looked up in the symbol
				423	// table.
				424
				425	if (setMode != 0) {
				426	if (lastItem == 1) {
				427	if (op != 0) {
				428	// syntaxError(chars, "Char expected after operator");
				429	ec = U_MALFORMED_SET;
				430	return;
				431	}
				432	add(lastChar, lastChar);
				433	_appendToPat(patLocal, lastChar, FALSE);
				434	lastItem = 0;
				435	op = 0;
				436	}
				437
				438	if (op == HYPHEN /'-'/ \|\| op == INTERSECTION /'&'/) {
				439	patLocal.append(op);
				440	}
				441
				442	if (nested == 0) {
				443	// lazy allocation
				444	if (!scratch.allocate()) {
				445	ec = U_MEMORY_ALLOCATION_ERROR;
				446	return;
				447	}
				448	nested = scratch.pointer();
				449	}
				450	switch (setMode) {
				451	case 1:
Jungshik Shin	aff99f5	2018-04-11 17:29:08 -0700	[diff] [blame]	452	nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	453	break;
				454	case 2:
				455	chars.skipIgnored(opts);
				456	nested->applyPropertyPattern(chars, patLocal, ec);
				457	if (U_FAILURE(ec)) return;
				458	break;
				459	case 3: // `nested' already parsed
				460	nested->_toPattern(patLocal, FALSE);
				461	break;
				462	}
				463
				464	usePat = TRUE;
				465
				466	if (mode == 0) {
				467	// Entire pattern is a category; leave parse loop
				468	this = nested;
				469	mode = 2;
				470	break;
				471	}
				472
				473	switch (op) {
				474	case HYPHEN: /'-'/
				475	removeAll(*nested);
				476	break;
				477	case INTERSECTION: /'&'/
				478	retainAll(*nested);
				479	break;
				480	case 0:
				481	addAll(*nested);
				482	break;
				483	}
				484
				485	op = 0;
				486	lastItem = 2;
				487
				488	continue;
				489	}
				490
				491	if (mode == 0) {
				492	// syntaxError(chars, "Missing '['");
				493	ec = U_MALFORMED_SET;
				494	return;
				495	}
				496
				497	// -------- Parse special (syntax) characters. If the
				498	// current character is not special, or if it is escaped,
				499	// then fall through and handle it below.
				500
				501	if (!literal) {
				502	switch (c) {
				503	case 0x5D /']'/:
				504	if (lastItem == 1) {
				505	add(lastChar, lastChar);
				506	_appendToPat(patLocal, lastChar, FALSE);
				507	}
				508	// Treat final trailing '-' as a literal
				509	if (op == HYPHEN /'-'/) {
				510	add(op, op);
				511	patLocal.append(op);
				512	} else if (op == INTERSECTION /'&'/) {
				513	// syntaxError(chars, "Trailing '&'");
				514	ec = U_MALFORMED_SET;
				515	return;
				516	}
				517	patLocal.append((UChar) 0x5D /']'/);
				518	mode = 2;
				519	continue;
				520	case HYPHEN /'-'/:
				521	if (op == 0) {
				522	if (lastItem != 0) {
				523	op = (UChar) c;
				524	continue;
				525	} else {
				526	// Treat final trailing '-' as a literal
				527	add(c, c);
				528	c = chars.next(opts, literal, ec);
				529	if (U_FAILURE(ec)) return;
				530	if (c == 0x5D /']'/ && !literal) {
				531	patLocal.append(HYPHEN_RIGHT_BRACE, 2);
				532	mode = 2;
				533	continue;
				534	}
				535	}
				536	}
				537	// syntaxError(chars, "'-' not after char or set");
				538	ec = U_MALFORMED_SET;
				539	return;
				540	case INTERSECTION /'&'/:
				541	if (lastItem == 2 && op == 0) {
				542	op = (UChar) c;
				543	continue;
				544	}
				545	// syntaxError(chars, "'&' not after set");
				546	ec = U_MALFORMED_SET;
				547	return;
				548	case 0x5E /'^'/:
				549	// syntaxError(chars, "'^' not after '['");
				550	ec = U_MALFORMED_SET;
				551	return;
				552	case 0x7B /'{'/:
				553	if (op != 0) {
				554	// syntaxError(chars, "Missing operand after operator");
				555	ec = U_MALFORMED_SET;
				556	return;
				557	}
				558	if (lastItem == 1) {
				559	add(lastChar, lastChar);
				560	_appendToPat(patLocal, lastChar, FALSE);
				561	}
				562	lastItem = 0;
				563	buf.truncate(0);
				564	{
				565	UBool ok = FALSE;
				566	while (!chars.atEnd()) {
				567	c = chars.next(opts, literal, ec);
				568	if (U_FAILURE(ec)) return;
				569	if (c == 0x7D /'}'/ && !literal) {
				570	ok = TRUE;
				571	break;
				572	}
				573	buf.append(c);
				574	}
				575	if (buf.length() < 1 \|\| !ok) {
				576	// syntaxError(chars, "Invalid multicharacter string");
				577	ec = U_MALFORMED_SET;
				578	return;
				579	}
				580	}
				581	// We have new string. Add it to set and continue;
				582	// we don't need to drop through to the further
				583	// processing
				584	add(buf);
				585	patLocal.append((UChar) 0x7B /'{'/);
				586	_appendToPat(patLocal, buf, FALSE);
				587	patLocal.append((UChar) 0x7D /'}'/);
				588	continue;
				589	case SymbolTable::SYMBOL_REF:
				590	// symbols nosymbols
				591	// [a-$] error error (ambiguous)
				592	// [a$] anchor anchor
				593	// [a-$x] var "x"* literal '$'
				594	// [a-$.] error literal '$'
				595	// *We won't get here in the case of var "x"
				596	{
				597	chars.getPos(backup);
				598	c = chars.next(opts, literal, ec);
				599	if (U_FAILURE(ec)) return;
				600	UBool anchor = (c == 0x5D /']'/ && !literal);
				601	if (symbols == 0 && !anchor) {
				602	c = SymbolTable::SYMBOL_REF;
				603	chars.setPos(backup);
				604	break; // literal '$'
				605	}
				606	if (anchor && op == 0) {
				607	if (lastItem == 1) {
				608	add(lastChar, lastChar);
				609	_appendToPat(patLocal, lastChar, FALSE);
				610	}
				611	add(U_ETHER);
				612	usePat = TRUE;
				613	patLocal.append((UChar) SymbolTable::SYMBOL_REF);
				614	patLocal.append((UChar) 0x5D /']'/);
				615	mode = 2;
				616	continue;
				617	}
				618	// syntaxError(chars, "Unquoted '$'");
				619	ec = U_MALFORMED_SET;
				620	return;
				621	}
				622	default:
				623	break;
				624	}
				625	}
				626
				627	// -------- Parse literal characters. This includes both
				628	// escaped chars ("\u4E01") and non-syntax characters
				629	// ("a").
				630
				631	switch (lastItem) {
				632	case 0:
				633	lastItem = 1;
				634	lastChar = c;
				635	break;
				636	case 1:
				637	if (op == HYPHEN /'-'/) {
				638	if (lastChar >= c) {
				639	// Don't allow redundant (a-a) or empty (b-a) ranges;
				640	// these are most likely typos.
				641	// syntaxError(chars, "Invalid range");
				642	ec = U_MALFORMED_SET;
				643	return;
				644	}
				645	add(lastChar, c);
				646	_appendToPat(patLocal, lastChar, FALSE);
				647	patLocal.append(op);
				648	_appendToPat(patLocal, c, FALSE);
				649	lastItem = 0;
				650	op = 0;
				651	} else {
				652	add(lastChar, lastChar);
				653	_appendToPat(patLocal, lastChar, FALSE);
				654	lastChar = c;
				655	}
				656	break;
				657	case 2:
				658	if (op != 0) {
				659	// syntaxError(chars, "Set expected after operator");
				660	ec = U_MALFORMED_SET;
				661	return;
				662	}
				663	lastChar = c;
				664	lastItem = 1;
				665	break;
				666	}
				667	}
				668
				669	if (mode != 2) {
				670	// syntaxError(chars, "Missing ']'");
				671	ec = U_MALFORMED_SET;
				672	return;
				673	}
				674
				675	chars.skipIgnored(opts);
				676
				677	/**
				678	* Handle global flags (invert, case insensitivity). If this
				679	* pattern should be compiled case-insensitive, then we need
				680	* to close over case BEFORE COMPLEMENTING. This makes
				681	* patterns like /[^abc]/i work.
				682	*/
				683	if ((options & USET_CASE_INSENSITIVE) != 0) {
				684	(this->*caseClosure)(USET_CASE_INSENSITIVE);
				685	}
				686	else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
				687	(this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
				688	}
				689	if (invert) {
				690	complement();
				691	}
				692
				693	// Use the rebuilt pattern (patLocal) only if necessary. Prefer the
				694	// generated pattern.
				695	if (usePat) {
				696	rebuiltPat.append(patLocal);
				697	} else {
				698	_generatePattern(rebuiltPat, FALSE);
				699	}
				700	if (isBogus() && U_SUCCESS(ec)) {
				701	// We likely ran out of memory. AHHH!
				702	ec = U_MEMORY_ALLOCATION_ERROR;
				703	}
				704	}
				705
				706	//----------------------------------------------------------------
				707	// Property set implementation
				708	//----------------------------------------------------------------
				709
Jungshik Shin	aff99f5	2018-04-11 17:29:08 -0700	[diff] [blame]	710	namespace {
				711
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	712	static UBool numericValueFilter(UChar32 ch, void* context) {
				713	return u_getNumericValue(ch) == (double)context;
				714	}
				715
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	716	static UBool versionFilter(UChar32 ch, void* context) {
				717	static const UVersionInfo none = { 0, 0, 0, 0 };
				718	UVersionInfo v;
				719	u_charAge(ch, v);
				720	UVersionInfo* version = (UVersionInfo*)context;
				721	return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
				722	}
				723
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	724	static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
				725	return uscript_hasScript(ch, (UScriptCode)context);
				726	}
				727
Jungshik Shin	aff99f5	2018-04-11 17:29:08 -0700	[diff] [blame]	728	} // namespace
				729
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	730	/**
				731	* Generic filter-based scanning code for UCD property UnicodeSets.
				732	*/
				733	void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
				734	void* context,
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	735	const UnicodeSet* inclusions,
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	736	UErrorCode &status) {
				737	if (U_FAILURE(status)) return;
				738
				739	// Logically, walk through all Unicode characters, noting the start
				740	// and end of each range for which filter.contain(c) is
				741	// true. Add each range to a set.
				742	//
				743	// To improve performance, use an inclusions set which
				744	// encodes information about character ranges that are known
				745	// to have identical properties.
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	746	// inclusions contains the first characters of
				747	// same-value ranges for the given property.
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	748
				749	clear();
				750
				751	UChar32 startHasProperty = -1;
				752	int32_t limitRange = inclusions->getRangeCount();
				753
				754	for (int j=0; j<limitRange; ++j) {
				755	// get current range
				756	UChar32 start = inclusions->getRangeStart(j);
				757	UChar32 end = inclusions->getRangeEnd(j);
				758
				759	// for all the code points in the range, process
				760	for (UChar32 ch = start; ch <= end; ++ch) {
				761	// only add to this UnicodeSet on inflection points --
				762	// where the hasProperty value changes to false
				763	if ((*filter)(ch, context)) {
				764	if (startHasProperty < 0) {
				765	startHasProperty = ch;
				766	}
				767	} else if (startHasProperty >= 0) {
				768	add(startHasProperty, ch-1);
				769	startHasProperty = -1;
				770	}
				771	}
				772	}
				773	if (startHasProperty >= 0) {
				774	add((UChar32)startHasProperty, (UChar32)0x10FFFF);
				775	}
				776	if (isBogus() && U_SUCCESS(status)) {
				777	// We likely ran out of memory. AHHH!
				778	status = U_MEMORY_ALLOCATION_ERROR;
				779	}
				780	}
				781
Jungshik Shin	aff99f5	2018-04-11 17:29:08 -0700	[diff] [blame]	782	namespace {
				783
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	784	/** Maps map values to 1 if the mask contains their value'th bit, all others to 0. */
				785	uint32_t U_CALLCONV generalCategoryMaskFilter(const void *context, uint32_t value) {
				786	uint32_t mask = (const uint32_t )context;
				787	value = U_MASK(value) & mask;
				788	if (value != 0) { value = 1; }
				789	return value;
				790	}
				791
				792	/** Maps one map value to 1, all others to 0. */
				793	uint32_t U_CALLCONV intValueFilter(const void *context, uint32_t value) {
				794	uint32_t v = (const uint32_t )context;
				795	return value == v ? 1 : 0;
				796	}
				797
				798	} // namespace
				799
				800	void UnicodeSet::applyIntPropertyValue(const UCPMap *map,
				801	UCPMapValueFilter filter, const void context,
				802	UErrorCode &errorCode) {
				803	if (U_FAILURE(errorCode)) { return; }
				804	clear();
				805	UChar32 start = 0, end;
				806	uint32_t value;
				807	while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
				808	filter, context, &value)) >= 0) {
				809	if (value != 0) {
				810	add(start, end);
				811	}
				812	start = end + 1;
				813	}
				814	if (isBogus()) {
				815	errorCode = U_MEMORY_ALLOCATION_ERROR;
				816	}
				817	}
				818
				819	namespace {
				820
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	821	static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
				822	/* Note: we use ' ' in compiler code page */
				823	int32_t j = 0;
				824	char ch;
				825	--dstCapacity; /* make room for term. zero */
				826	while ((ch = *src++) != 0) {
				827	if (ch == ' ' && (j==0 \|\| (j>0 && dst[j-1]==' '))) {
				828	continue;
				829	}
				830	if (j >= dstCapacity) return FALSE;
				831	dst[j++] = ch;
				832	}
				833	if (j > 0 && dst[j-1] == ' ') --j;
				834	dst[j] = 0;
				835	return TRUE;
				836	}
				837
Jungshik Shin	aff99f5	2018-04-11 17:29:08 -0700	[diff] [blame]	838	} // namespace
				839
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	840	//----------------------------------------------------------------
				841	// Property set API
				842	//----------------------------------------------------------------
				843
				844	#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}
				845
				846	UnicodeSet&
				847	UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	848	if (U_FAILURE(ec)) { return *this; }
				849	// All of the following check isFrozen() before modifying this set.
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	850	if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	851	const UCPMap *map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &ec);
				852	applyIntPropertyValue(map, generalCategoryMaskFilter, &value, ec);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	853	} else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	854	const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	855	UScriptCode script = (UScriptCode)value;
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	856	applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
				857	} else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
				858	if (value == 0 \|\| value == 1) {
				859	const USet *set = u_getBinaryPropertySet(prop, &ec);
				860	if (U_FAILURE(ec)) { return *this; }
				861	copyFrom(*UnicodeSet::fromUSet(set), TRUE);
				862	if (value == 0) {
				863	complement();
				864	}
				865	} else {
				866	clear();
				867	}
				868	} else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
				869	const UCPMap *map = u_getIntPropertyMap(prop, &ec);
				870	applyIntPropertyValue(map, intValueFilter, &value, ec);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	871	} else {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	872	// This code used to always call getInclusions(property source)
				873	// which sets an error for an unsupported property.
				874	ec = U_ILLEGAL_ARGUMENT_ERROR;
				875	// Otherwise we would just clear() this set because
				876	// getIntPropertyValue(c, prop) returns 0 for all code points.
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	877	}
				878	return *this;
				879	}
				880
				881	UnicodeSet&
				882	UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
				883	const UnicodeString& value,
				884	UErrorCode& ec) {
				885	if (U_FAILURE(ec) \|\| isFrozen()) return *this;
				886
				887	// prop and value used to be converted to char * using the default
				888	// converter instead of the invariant conversion.
				889	// This should not be necessary because all Unicode property and value
				890	// names use only invariant characters.
				891	// If there are any variant characters, then we won't find them anyway.
				892	// Checking first avoids assertion failures in the conversion.
				893	if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) \|\|
				894	!uprv_isInvariantUString(value.getBuffer(), value.length())
				895	) {
				896	FAIL(ec);
				897	}
				898	CharString pname, vname;
				899	pname.appendInvariantChars(prop, ec);
				900	vname.appendInvariantChars(value, ec);
				901	if (U_FAILURE(ec)) return *this;
				902
				903	UProperty p;
				904	int32_t v;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	905	UBool invert = FALSE;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	906
				907	if (value.length() > 0) {
				908	p = u_getPropertyEnum(pname.data());
				909	if (p == UCHAR_INVALID_CODE) FAIL(ec);
				910
				911	// Treat gc as gcm
				912	if (p == UCHAR_GENERAL_CATEGORY) {
				913	p = UCHAR_GENERAL_CATEGORY_MASK;
				914	}
				915
				916	if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) \|\|
				917	(p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) \|\|
				918	(p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
				919	v = u_getPropertyValueEnum(p, vname.data());
				920	if (v == UCHAR_INVALID_CODE) {
				921	// Handle numeric CCC
				922	if (p == UCHAR_CANONICAL_COMBINING_CLASS \|\|
				923	p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS \|\|
				924	p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
				925	char* end;
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	926	double val = uprv_strtod(vname.data(), &end);
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	927	// Anything between 0 and 255 is valid even if unused.
				928	// Cast double->int only after range check.
				929	// We catch NaN here because comparing it with both 0 and 255 will be false
				930	// (as are all comparisons with NaN).
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	931	if (*end != 0 \|\| !(0 <= val && val <= 255) \|\|
				932	(v = (int32_t)val) != val) {
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	933	// non-integral value or outside 0..255, or trailing junk
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	934	FAIL(ec);
				935	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	936	} else {
				937	FAIL(ec);
				938	}
				939	}
				940	}
				941
				942	else {
				943
				944	switch (p) {
				945	case UCHAR_NUMERIC_VALUE:
				946	{
				947	char* end;
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	948	double val = uprv_strtod(vname.data(), &end);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	949	if (*end != 0) {
				950	FAIL(ec);
				951	}
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	952	applyFilter(numericValueFilter, &val,
				953	CharacterProperties::getInclusionsForProperty(p, ec), ec);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	954	return *this;
				955	}
				956	case UCHAR_NAME:
				957	{
				958	// Must munge name, since u_charFromName() does not do
				959	// 'loose' matching.
				960	char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
				961	if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
				962	UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
				963	if (U_SUCCESS(ec)) {
				964	clear();
				965	add(ch);
				966	return *this;
				967	} else {
				968	FAIL(ec);
				969	}
				970	}
				971	case UCHAR_UNICODE_1_NAME:
				972	// ICU 49 deprecates the Unicode_1_Name property APIs.
				973	FAIL(ec);
				974	case UCHAR_AGE:
				975	{
				976	// Must munge name, since u_versionFromString() does not do
				977	// 'loose' matching.
				978	char buf[128];
				979	if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
				980	UVersionInfo version;
				981	u_versionFromString(version, buf);
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	982	applyFilter(versionFilter, &version,
				983	CharacterProperties::getInclusionsForProperty(p, ec), ec);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	984	return *this;
				985	}
				986	case UCHAR_SCRIPT_EXTENSIONS:
				987	v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
				988	if (v == UCHAR_INVALID_CODE) {
				989	FAIL(ec);
				990	}
				991	// fall through to calling applyIntPropertyValue()
				992	break;
				993	default:
				994	// p is a non-binary, non-enumerated property that we
				995	// don't support (yet).
				996	FAIL(ec);
				997	}
				998	}
				999	}
				1000
				1001	else {
				1002	// value is empty. Interpret as General Category, Script, or
				1003	// Binary property.
				1004	p = UCHAR_GENERAL_CATEGORY_MASK;
				1005	v = u_getPropertyValueEnum(p, pname.data());
				1006	if (v == UCHAR_INVALID_CODE) {
				1007	p = UCHAR_SCRIPT;
				1008	v = u_getPropertyValueEnum(p, pname.data());
				1009	if (v == UCHAR_INVALID_CODE) {
				1010	p = u_getPropertyEnum(pname.data());
				1011	if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
				1012	v = 1;
				1013	} else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
				1014	set(MIN_VALUE, MAX_VALUE);
				1015	return *this;
				1016	} else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
				1017	set(0, 0x7F);
				1018	return *this;
				1019	} else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
				1020	// [:Assigned:]=[:^Cn:]
				1021	p = UCHAR_GENERAL_CATEGORY_MASK;
				1022	v = U_GC_CN_MASK;
				1023	invert = TRUE;
				1024	} else {
				1025	FAIL(ec);
				1026	}
				1027	}
				1028	}
				1029	}
				1030
				1031	applyIntPropertyValue(p, v, ec);
				1032	if(invert) {
				1033	complement();
				1034	}
				1035
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1036	if (isBogus() && U_SUCCESS(ec)) {
				1037	// We likely ran out of memory. AHHH!
				1038	ec = U_MEMORY_ALLOCATION_ERROR;
				1039	}
				1040	return *this;
				1041	}
				1042
				1043	//----------------------------------------------------------------
				1044	// Property set patterns
				1045	//----------------------------------------------------------------
				1046
				1047	/**
				1048	* Return true if the given position, in the given pattern, appears
				1049	* to be the start of a property set pattern.
				1050	*/
				1051	UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
				1052	int32_t pos) {
				1053	// Patterns are at least 5 characters long
				1054	if ((pos+5) > pattern.length()) {
				1055	return FALSE;
				1056	}
				1057
				1058	// Look for an opening [:, [:^, \p, or \P
				1059	return isPOSIXOpen(pattern, pos) \|\| isPerlOpen(pattern, pos) \|\| isNameOpen(pattern, pos);
				1060	}
				1061
				1062	/**
				1063	* Return true if the given iterator appears to point at a
				1064	* property pattern. Regardless of the result, return with the
				1065	* iterator unchanged.
				1066	* @param chars iterator over the pattern characters. Upon return
				1067	* it will be unchanged.
				1068	* @param iterOpts RuleCharacterIterator options
				1069	*/
				1070	UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
				1071	int32_t iterOpts) {
				1072	// NOTE: literal will always be FALSE, because we don't parse escapes.
				1073	UBool result = FALSE, literal;
				1074	UErrorCode ec = U_ZERO_ERROR;
				1075	iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
				1076	RuleCharacterIterator::Pos pos;
				1077	chars.getPos(pos);
				1078	UChar32 c = chars.next(iterOpts, literal, ec);
				1079	if (c == 0x5B /'['/ \|\| c == 0x5C /'\\'/) {
				1080	UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
				1081	literal, ec);
				1082	result = (c == 0x5B /'['/) ? (d == 0x3A /':'/) :
				1083	(d == 0x4E /'N'/ \|\| d == 0x70 /'p'/ \|\| d == 0x50 /'P'/);
				1084	}
				1085	chars.setPos(pos);
				1086	return result && U_SUCCESS(ec);
				1087	}
				1088
				1089	/**
				1090	* Parse the given property pattern at the given parse position.
				1091	*/
				1092	UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
				1093	ParsePosition& ppos,
				1094	UErrorCode &ec) {
				1095	int32_t pos = ppos.getIndex();
				1096
				1097	UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
				1098	UBool isName = FALSE; // true for \N{pat}, o/w false
				1099	UBool invert = FALSE;
				1100
				1101	if (U_FAILURE(ec)) return *this;
				1102
				1103	// Minimum length is 5 characters, e.g. \p{L}
				1104	if ((pos+5) > pattern.length()) {
				1105	FAIL(ec);
				1106	}
				1107
				1108	// On entry, ppos should point to one of the following locations:
				1109	// Look for an opening [:, [:^, \p, or \P
				1110	if (isPOSIXOpen(pattern, pos)) {
				1111	posix = TRUE;
				1112	pos += 2;
				1113	pos = ICU_Utility::skipWhitespace(pattern, pos);
				1114	if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
				1115	++pos;
				1116	invert = TRUE;
				1117	}
				1118	} else if (isPerlOpen(pattern, pos) \|\| isNameOpen(pattern, pos)) {
				1119	UChar c = pattern.charAt(pos+1);
				1120	invert = (c == UPPER_P);
				1121	isName = (c == UPPER_N);
				1122	pos += 2;
				1123	pos = ICU_Utility::skipWhitespace(pattern, pos);
				1124	if (pos == pattern.length() \|\| pattern.charAt(pos++) != OPEN_BRACE) {
				1125	// Syntax error; "\p" or "\P" not followed by "{"
				1126	FAIL(ec);
				1127	}
				1128	} else {
				1129	// Open delimiter not seen
				1130	FAIL(ec);
				1131	}
				1132
				1133	// Look for the matching close delimiter, either :] or }
				1134	int32_t close;
				1135	if (posix) {
				1136	close = pattern.indexOf(POSIX_CLOSE, 2, pos);
				1137	} else {
				1138	close = pattern.indexOf(CLOSE_BRACE, pos);
				1139	}
				1140	if (close < 0) {
				1141	// Syntax error; close delimiter missing
				1142	FAIL(ec);
				1143	}
				1144
				1145	// Look for an '=' sign. If this is present, we will parse a
				1146	// medium \p{gc=Cf} or long \p{GeneralCategory=Format}
				1147	// pattern.
				1148	int32_t equals = pattern.indexOf(EQUALS, pos);
				1149	UnicodeString propName, valueName;
				1150	if (equals >= 0 && equals < close && !isName) {
				1151	// Equals seen; parse medium/long pattern
				1152	pattern.extractBetween(pos, equals, propName);
				1153	pattern.extractBetween(equals+1, close, valueName);
				1154	}
				1155
				1156	else {
				1157	// Handle case where no '=' is seen, and \N{}
				1158	pattern.extractBetween(pos, close, propName);
				1159
				1160	// Handle \N{name}
				1161	if (isName) {
				1162	// This is a little inefficient since it means we have to
				1163	// parse NAME_PROP back to UCHAR_NAME even though we already
				1164	// know it's UCHAR_NAME. If we refactor the API to
				1165	// support args of (UProperty, char*) then we can remove
				1166	// NAME_PROP and make this a little more efficient.
				1167	valueName = propName;
				1168	propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
				1169	}
				1170	}
				1171
				1172	applyPropertyAlias(propName, valueName, ec);
				1173
				1174	if (U_SUCCESS(ec)) {
				1175	if (invert) {
				1176	complement();
				1177	}
				1178
				1179	// Move to the limit position after the close delimiter if the
				1180	// parse succeeded.
				1181	ppos.setIndex(close + (posix ? 2 : 1));
				1182	}
				1183
				1184	return *this;
				1185	}
				1186
				1187	/**
				1188	* Parse a property pattern.
				1189	* @param chars iterator over the pattern characters. Upon return
				1190	* it will be advanced to the first character after the parsed
				1191	* pattern, or the end of the iteration if all characters are
				1192	* parsed.
				1193	* @param rebuiltPat the pattern that was parsed, rebuilt or
				1194	* copied from the input pattern, as appropriate.
				1195	*/
				1196	void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
				1197	UnicodeString& rebuiltPat,
				1198	UErrorCode& ec) {
				1199	if (U_FAILURE(ec)) return;
				1200	UnicodeString pattern;
				1201	chars.lookahead(pattern);
				1202	ParsePosition pos(0);
				1203	applyPropertyPattern(pattern, pos, ec);
				1204	if (U_FAILURE(ec)) return;
				1205	if (pos.getIndex() == 0) {
				1206	// syntaxError(chars, "Invalid property pattern");
				1207	ec = U_MALFORMED_SET;
				1208	return;
				1209	}
				1210	chars.jumpahead(pos.getIndex());
				1211	rebuiltPat.append(pattern, 0, pos.getIndex());
				1212	}
				1213
				1214	U_NAMESPACE_END