blob: 48c0a26a710304597256adb0a58bd5eb7fe1e205 [file] [log] [blame]
Jungshik Shin87232d82017-05-13 21:10:13 -07001// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07002// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00003/*
4*******************************************************************************
5*
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08006* Copyright (C) 1999-2014, International Business Machines
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00007* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: uniset_props.cpp
Jungshik Shin87232d82017-05-13 21:10:13 -070011* encoding: UTF-8
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000012* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2004aug25
16* created by: Markus W. Scherer
17*
18* Character property dependent functions moved here from uniset.cpp
19*/
20
21#include "unicode/utypes.h"
22#include "unicode/uniset.h"
23#include "unicode/parsepos.h"
24#include "unicode/uchar.h"
25#include "unicode/uscript.h"
26#include "unicode/symtable.h"
27#include "unicode/uset.h"
28#include "unicode/locid.h"
29#include "unicode/brkiter.h"
30#include "uset_imp.h"
31#include "ruleiter.h"
32#include "cmemory.h"
33#include "ucln_cmn.h"
34#include "util.h"
35#include "uvector.h"
36#include "uprops.h"
37#include "propname.h"
38#include "normalizer2impl.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000039#include "uinvchar.h"
40#include "uprops.h"
41#include "charstr.h"
42#include "cstring.h"
43#include "mutex.h"
44#include "umutex.h"
45#include "uassert.h"
46#include "hash.h"
47
48U_NAMESPACE_USE
49
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000050// Special property set IDs
51static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
52static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
53static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
54
55// Unicode name property alias
56#define NAME_PROP "na"
57#define NAME_PROP_LENGTH 2
58
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000059// Cached sets ------------------------------------------------------------- ***
60
61U_CDECL_BEGIN
62static UBool U_CALLCONV uset_cleanup();
63
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000064static UnicodeSet *uni32Singleton;
Frank Tang1c67b4e2022-05-18 10:13:51 -070065static icu::UInitOnce uni32InitOnce {};
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000066
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000067/**
68 * Cleanup function for UnicodeSet
69 */
70static UBool U_CALLCONV uset_cleanup(void) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000071 delete uni32Singleton;
72 uni32Singleton = NULL;
73 uni32InitOnce.reset();
Frank Tang1f164ee2022-11-08 12:31:27 -080074 return true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000075}
76
77U_CDECL_END
78
79U_NAMESPACE_BEGIN
80
Jungshik Shinaff99f52018-04-11 17:29:08 -070081namespace {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000082
83// Cache some sets for other services -------------------------------------- ***
84void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
85 U_ASSERT(uni32Singleton == NULL);
86 uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
87 if(uni32Singleton==NULL) {
88 errorCode=U_MEMORY_ALLOCATION_ERROR;
89 } else {
90 uni32Singleton->freeze();
91 }
92 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
93}
94
95
96U_CFUNC UnicodeSet *
97uniset_getUnicode32Instance(UErrorCode &errorCode) {
98 umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
99 return uni32Singleton;
100}
101
// helper functions for matching of pattern syntax pieces ------------------ ***
// these functions recognize the opening delimiters \p and \P, \N, and [:

// using these functions is not only faster than UnicodeString::compare() and
// caseCompare(), but they also make UnicodeSet work for simple patterns when
// no Unicode properties data is available - when caseCompare() fails
108
109static inline UBool
110isPerlOpen(const UnicodeString &pattern, int32_t pos) {
111 UChar c;
Frank Tang7e7574b2021-04-13 21:19:13 -0700112 return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P');
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000113}
114
115/*static inline UBool
116isPerlClose(const UnicodeString &pattern, int32_t pos) {
Frank Tang7e7574b2021-04-13 21:19:13 -0700117 return pattern.charAt(pos)==u'}';
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000118}*/
119
120static inline UBool
121isNameOpen(const UnicodeString &pattern, int32_t pos) {
Frank Tang7e7574b2021-04-13 21:19:13 -0700122 return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N';
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000123}
124
125static inline UBool
126isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
Frank Tang7e7574b2021-04-13 21:19:13 -0700127 return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':';
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000128}
129
130/*static inline UBool
131isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
Frank Tang7e7574b2021-04-13 21:19:13 -0700132 return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']';
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000133}*/
134
135// TODO memory debugging provided inside uniset.cpp
136// could be made available here but probably obsolete with use of modern
137// memory leak checker tools
138#define _dbgct(me)
139
Jungshik Shinaff99f52018-04-11 17:29:08 -0700140} // namespace
141
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000142//----------------------------------------------------------------
143// Constructors &c
144//----------------------------------------------------------------
145
146/**
147 * Constructs a set from the given pattern, optionally ignoring
148 * white space. See the class description for the syntax of the
149 * pattern language.
150 * @param pattern a string specifying what characters are in the set
151 */
152UnicodeSet::UnicodeSet(const UnicodeString& pattern,
Jungshik Shind13a96f2018-11-14 09:22:09 -0800153 UErrorCode& status) {
154 applyPattern(pattern, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000155 _dbgct(this);
156}
157
158//----------------------------------------------------------------
159// Public API
160//----------------------------------------------------------------
161
162UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
163 UErrorCode& status) {
164 // Equivalent to
165 // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
166 // but without dependency on closeOver().
167 ParsePosition pos(0);
168 applyPatternIgnoreSpace(pattern, pos, NULL, status);
169 if (U_FAILURE(status)) return *this;
170
171 int32_t i = pos.getIndex();
172 // Skip over trailing whitespace
Frank Tang1f164ee2022-11-08 12:31:27 -0800173 ICU_Utility::skipWhitespace(pattern, i, true);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000174 if (i != pattern.length()) {
175 status = U_ILLEGAL_ARGUMENT_ERROR;
176 }
177 return *this;
178}
179
180void
181UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
182 ParsePosition& pos,
183 const SymbolTable* symbols,
184 UErrorCode& status) {
185 if (U_FAILURE(status)) {
186 return;
187 }
188 if (isFrozen()) {
189 status = U_NO_WRITE_PERMISSION;
190 return;
191 }
192 // Need to build the pattern in a temporary string because
193 // _applyPattern calls add() etc., which set pat to empty.
194 UnicodeString rebuiltPat;
195 RuleCharacterIterator chars(pattern, symbols, pos);
Jungshik Shinaff99f52018-04-11 17:29:08 -0700196 applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, 0, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000197 if (U_FAILURE(status)) return;
198 if (chars.inVariable()) {
199 // syntaxError(chars, "Extra chars in variable value");
200 status = U_MALFORMED_SET;
201 return;
202 }
203 setPattern(rebuiltPat);
204}
205
206/**
207 * Return true if the given position, in the given pattern, appears
208 * to be the start of a UnicodeSet pattern.
209 */
210UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
211 return ((pos+1) < pattern.length() &&
212 pattern.charAt(pos) == (UChar)91/*[*/) ||
213 resemblesPropertyPattern(pattern, pos);
214}
215
216//----------------------------------------------------------------
217// Implementation: Pattern parsing
218//----------------------------------------------------------------
219
Jungshik Shinaff99f52018-04-11 17:29:08 -0700220namespace {
221
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000222/**
223 * A small all-inline class to manage a UnicodeSet pointer. Add
224 * operator->() etc. as needed.
225 */
226class UnicodeSetPointer {
227 UnicodeSet* p;
228public:
229 inline UnicodeSetPointer() : p(0) {}
230 inline ~UnicodeSetPointer() { delete p; }
231 inline UnicodeSet* pointer() { return p; }
232 inline UBool allocate() {
233 if (p == 0) {
234 p = new UnicodeSet();
235 }
236 return p != 0;
237 }
238};
239
Jungshik Shinaff99f52018-04-11 17:29:08 -0700240constexpr int32_t MAX_DEPTH = 100;
241
242} // namespace
243
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000244/**
245 * Parse the pattern from the given RuleCharacterIterator. The
246 * iterator is advanced over the parsed pattern.
247 * @param chars iterator over the pattern characters. Upon return
248 * it will be advanced to the first character after the parsed
249 * pattern, or the end of the iteration if all characters are
250 * parsed.
251 * @param symbols symbol table to use to parse and dereference
252 * variables, or null if none.
253 * @param rebuiltPat the pattern that was parsed, rebuilt or
254 * copied from the input pattern, as appropriate.
255 * @param options a bit mask of zero or more of the following:
256 * IGNORE_SPACE, CASE.
257 */
258void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
259 const SymbolTable* symbols,
260 UnicodeString& rebuiltPat,
261 uint32_t options,
262 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
Jungshik Shinaff99f52018-04-11 17:29:08 -0700263 int32_t depth,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000264 UErrorCode& ec) {
265 if (U_FAILURE(ec)) return;
Jungshik Shinaff99f52018-04-11 17:29:08 -0700266 if (depth > MAX_DEPTH) {
267 ec = U_ILLEGAL_ARGUMENT_ERROR;
268 return;
269 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000270
271 // Syntax characters: [ ] ^ - & { }
272
273 // Recognized special forms for chars, sets: c-c s-s s&s
274
275 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
276 RuleCharacterIterator::PARSE_ESCAPES;
277 if ((options & USET_IGNORE_SPACE) != 0) {
278 opts |= RuleCharacterIterator::SKIP_WHITESPACE;
279 }
280
281 UnicodeString patLocal, buf;
Frank Tang1f164ee2022-11-08 12:31:27 -0800282 UBool usePat = false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000283 UnicodeSetPointer scratch;
284 RuleCharacterIterator::Pos backup;
285
286 // mode: 0=before [, 1=between [...], 2=after ]
287 // lastItem: 0=none, 1=char, 2=set
288 int8_t lastItem = 0, mode = 0;
289 UChar32 lastChar = 0;
290 UChar op = 0;
291
Frank Tang1f164ee2022-11-08 12:31:27 -0800292 UBool invert = false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000293
294 clear();
295
296 while (mode != 2 && !chars.atEnd()) {
297 U_ASSERT((lastItem == 0 && op == 0) ||
Frank Tang7e7574b2021-04-13 21:19:13 -0700298 (lastItem == 1 && (op == 0 || op == u'-')) ||
299 (lastItem == 2 && (op == 0 || op == u'-' || op == u'&')));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000300
301 UChar32 c = 0;
Frank Tang1f164ee2022-11-08 12:31:27 -0800302 UBool literal = false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000303 UnicodeSet* nested = 0; // alias - do not delete
304
305 // -------- Check for property pattern
306
307 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
308 int8_t setMode = 0;
309 if (resemblesPropertyPattern(chars, opts)) {
310 setMode = 2;
311 }
312
313 // -------- Parse '[' of opening delimiter OR nested set.
314 // If there is a nested set, use `setMode' to define how
315 // the set should be parsed. If the '[' is part of the
316 // opening delimiter for this pattern, parse special
317 // strings "[", "[^", "[-", and "[^-". Check for stand-in
318 // characters representing a nested set in the symbol
319 // table.
320
321 else {
322 // Prepare to backup if necessary
323 chars.getPos(backup);
324 c = chars.next(opts, literal, ec);
325 if (U_FAILURE(ec)) return;
326
Frank Tang7e7574b2021-04-13 21:19:13 -0700327 if (c == u'[' && !literal) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000328 if (mode == 1) {
329 chars.setPos(backup); // backup
330 setMode = 1;
331 } else {
332 // Handle opening '[' delimiter
333 mode = 1;
Frank Tang7e7574b2021-04-13 21:19:13 -0700334 patLocal.append(u'[');
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000335 chars.getPos(backup); // prepare to backup
336 c = chars.next(opts, literal, ec);
337 if (U_FAILURE(ec)) return;
Frank Tang7e7574b2021-04-13 21:19:13 -0700338 if (c == u'^' && !literal) {
Frank Tang1f164ee2022-11-08 12:31:27 -0800339 invert = true;
Frank Tang7e7574b2021-04-13 21:19:13 -0700340 patLocal.append(u'^');
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000341 chars.getPos(backup); // prepare to backup
342 c = chars.next(opts, literal, ec);
343 if (U_FAILURE(ec)) return;
344 }
345 // Fall through to handle special leading '-';
346 // otherwise restart loop for nested [], \p{}, etc.
Frank Tang7e7574b2021-04-13 21:19:13 -0700347 if (c == u'-') {
Frank Tang1f164ee2022-11-08 12:31:27 -0800348 literal = true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000349 // Fall through to handle literal '-' below
350 } else {
351 chars.setPos(backup); // backup
352 continue;
353 }
354 }
355 } else if (symbols != 0) {
356 const UnicodeFunctor *m = symbols->lookupMatcher(c);
357 if (m != 0) {
358 const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
359 if (ms == NULL) {
360 ec = U_MALFORMED_SET;
361 return;
362 }
363 // casting away const, but `nested' won't be modified
364 // (important not to modify stored set)
365 nested = const_cast<UnicodeSet*>(ms);
366 setMode = 3;
367 }
368 }
369 }
370
371 // -------- Handle a nested set. This either is inline in
372 // the pattern or represented by a stand-in that has
373 // previously been parsed and was looked up in the symbol
374 // table.
375
376 if (setMode != 0) {
377 if (lastItem == 1) {
378 if (op != 0) {
379 // syntaxError(chars, "Char expected after operator");
380 ec = U_MALFORMED_SET;
381 return;
382 }
383 add(lastChar, lastChar);
Frank Tang1f164ee2022-11-08 12:31:27 -0800384 _appendToPat(patLocal, lastChar, false);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000385 lastItem = 0;
386 op = 0;
387 }
388
Frank Tang7e7574b2021-04-13 21:19:13 -0700389 if (op == u'-' || op == u'&') {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000390 patLocal.append(op);
391 }
392
393 if (nested == 0) {
394 // lazy allocation
395 if (!scratch.allocate()) {
396 ec = U_MEMORY_ALLOCATION_ERROR;
397 return;
398 }
399 nested = scratch.pointer();
400 }
401 switch (setMode) {
402 case 1:
Jungshik Shinaff99f52018-04-11 17:29:08 -0700403 nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000404 break;
405 case 2:
406 chars.skipIgnored(opts);
407 nested->applyPropertyPattern(chars, patLocal, ec);
408 if (U_FAILURE(ec)) return;
409 break;
410 case 3: // `nested' already parsed
Frank Tang1f164ee2022-11-08 12:31:27 -0800411 nested->_toPattern(patLocal, false);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000412 break;
413 }
414
Frank Tang1f164ee2022-11-08 12:31:27 -0800415 usePat = true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000416
417 if (mode == 0) {
418 // Entire pattern is a category; leave parse loop
419 *this = *nested;
420 mode = 2;
421 break;
422 }
423
424 switch (op) {
Frank Tang7e7574b2021-04-13 21:19:13 -0700425 case u'-':
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000426 removeAll(*nested);
427 break;
Frank Tang7e7574b2021-04-13 21:19:13 -0700428 case u'&':
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000429 retainAll(*nested);
430 break;
431 case 0:
432 addAll(*nested);
433 break;
434 }
435
436 op = 0;
437 lastItem = 2;
438
439 continue;
440 }
441
442 if (mode == 0) {
443 // syntaxError(chars, "Missing '['");
444 ec = U_MALFORMED_SET;
445 return;
446 }
447
448 // -------- Parse special (syntax) characters. If the
449 // current character is not special, or if it is escaped,
450 // then fall through and handle it below.
451
452 if (!literal) {
453 switch (c) {
Frank Tang7e7574b2021-04-13 21:19:13 -0700454 case u']':
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000455 if (lastItem == 1) {
456 add(lastChar, lastChar);
Frank Tang1f164ee2022-11-08 12:31:27 -0800457 _appendToPat(patLocal, lastChar, false);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000458 }
459 // Treat final trailing '-' as a literal
Frank Tang7e7574b2021-04-13 21:19:13 -0700460 if (op == u'-') {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000461 add(op, op);
462 patLocal.append(op);
Frank Tang7e7574b2021-04-13 21:19:13 -0700463 } else if (op == u'&') {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000464 // syntaxError(chars, "Trailing '&'");
465 ec = U_MALFORMED_SET;
466 return;
467 }
Frank Tang7e7574b2021-04-13 21:19:13 -0700468 patLocal.append(u']');
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000469 mode = 2;
470 continue;
Frank Tang7e7574b2021-04-13 21:19:13 -0700471 case u'-':
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000472 if (op == 0) {
473 if (lastItem != 0) {
474 op = (UChar) c;
475 continue;
476 } else {
477 // Treat final trailing '-' as a literal
478 add(c, c);
479 c = chars.next(opts, literal, ec);
480 if (U_FAILURE(ec)) return;
Frank Tang7e7574b2021-04-13 21:19:13 -0700481 if (c == u']' && !literal) {
482 patLocal.append(u"-]", 2);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000483 mode = 2;
484 continue;
485 }
486 }
487 }
488 // syntaxError(chars, "'-' not after char or set");
489 ec = U_MALFORMED_SET;
490 return;
Frank Tang7e7574b2021-04-13 21:19:13 -0700491 case u'&':
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000492 if (lastItem == 2 && op == 0) {
493 op = (UChar) c;
494 continue;
495 }
496 // syntaxError(chars, "'&' not after set");
497 ec = U_MALFORMED_SET;
498 return;
Frank Tang7e7574b2021-04-13 21:19:13 -0700499 case u'^':
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000500 // syntaxError(chars, "'^' not after '['");
501 ec = U_MALFORMED_SET;
502 return;
Frank Tang7e7574b2021-04-13 21:19:13 -0700503 case u'{':
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000504 if (op != 0) {
505 // syntaxError(chars, "Missing operand after operator");
506 ec = U_MALFORMED_SET;
507 return;
508 }
509 if (lastItem == 1) {
510 add(lastChar, lastChar);
Frank Tang1f164ee2022-11-08 12:31:27 -0800511 _appendToPat(patLocal, lastChar, false);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000512 }
513 lastItem = 0;
514 buf.truncate(0);
515 {
Frank Tang1f164ee2022-11-08 12:31:27 -0800516 UBool ok = false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000517 while (!chars.atEnd()) {
518 c = chars.next(opts, literal, ec);
519 if (U_FAILURE(ec)) return;
Frank Tang7e7574b2021-04-13 21:19:13 -0700520 if (c == u'}' && !literal) {
Frank Tang1f164ee2022-11-08 12:31:27 -0800521 ok = true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000522 break;
523 }
524 buf.append(c);
525 }
Frank Tang7e7574b2021-04-13 21:19:13 -0700526 if (!ok) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000527 // syntaxError(chars, "Invalid multicharacter string");
528 ec = U_MALFORMED_SET;
529 return;
530 }
531 }
532 // We have new string. Add it to set and continue;
533 // we don't need to drop through to the further
534 // processing
535 add(buf);
Frank Tang7e7574b2021-04-13 21:19:13 -0700536 patLocal.append(u'{');
Frank Tang1f164ee2022-11-08 12:31:27 -0800537 _appendToPat(patLocal, buf, false);
Frank Tang7e7574b2021-04-13 21:19:13 -0700538 patLocal.append(u'}');
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000539 continue;
540 case SymbolTable::SYMBOL_REF:
541 // symbols nosymbols
542 // [a-$] error error (ambiguous)
543 // [a$] anchor anchor
544 // [a-$x] var "x"* literal '$'
545 // [a-$.] error literal '$'
546 // *We won't get here in the case of var "x"
547 {
548 chars.getPos(backup);
549 c = chars.next(opts, literal, ec);
550 if (U_FAILURE(ec)) return;
Frank Tang7e7574b2021-04-13 21:19:13 -0700551 UBool anchor = (c == u']' && !literal);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000552 if (symbols == 0 && !anchor) {
553 c = SymbolTable::SYMBOL_REF;
554 chars.setPos(backup);
555 break; // literal '$'
556 }
557 if (anchor && op == 0) {
558 if (lastItem == 1) {
559 add(lastChar, lastChar);
Frank Tang1f164ee2022-11-08 12:31:27 -0800560 _appendToPat(patLocal, lastChar, false);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000561 }
562 add(U_ETHER);
Frank Tang1f164ee2022-11-08 12:31:27 -0800563 usePat = true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000564 patLocal.append((UChar) SymbolTable::SYMBOL_REF);
Frank Tang7e7574b2021-04-13 21:19:13 -0700565 patLocal.append(u']');
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000566 mode = 2;
567 continue;
568 }
569 // syntaxError(chars, "Unquoted '$'");
570 ec = U_MALFORMED_SET;
571 return;
572 }
573 default:
574 break;
575 }
576 }
577
578 // -------- Parse literal characters. This includes both
579 // escaped chars ("\u4E01") and non-syntax characters
580 // ("a").
581
582 switch (lastItem) {
583 case 0:
584 lastItem = 1;
585 lastChar = c;
586 break;
587 case 1:
Frank Tang7e7574b2021-04-13 21:19:13 -0700588 if (op == u'-') {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000589 if (lastChar >= c) {
590 // Don't allow redundant (a-a) or empty (b-a) ranges;
591 // these are most likely typos.
592 // syntaxError(chars, "Invalid range");
593 ec = U_MALFORMED_SET;
594 return;
595 }
596 add(lastChar, c);
Frank Tang1f164ee2022-11-08 12:31:27 -0800597 _appendToPat(patLocal, lastChar, false);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000598 patLocal.append(op);
Frank Tang1f164ee2022-11-08 12:31:27 -0800599 _appendToPat(patLocal, c, false);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000600 lastItem = 0;
601 op = 0;
602 } else {
603 add(lastChar, lastChar);
Frank Tang1f164ee2022-11-08 12:31:27 -0800604 _appendToPat(patLocal, lastChar, false);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000605 lastChar = c;
606 }
607 break;
608 case 2:
609 if (op != 0) {
610 // syntaxError(chars, "Set expected after operator");
611 ec = U_MALFORMED_SET;
612 return;
613 }
614 lastChar = c;
615 lastItem = 1;
616 break;
617 }
618 }
619
620 if (mode != 2) {
621 // syntaxError(chars, "Missing ']'");
622 ec = U_MALFORMED_SET;
623 return;
624 }
625
626 chars.skipIgnored(opts);
627
628 /**
629 * Handle global flags (invert, case insensitivity). If this
630 * pattern should be compiled case-insensitive, then we need
631 * to close over case BEFORE COMPLEMENTING. This makes
632 * patterns like /[^abc]/i work.
633 */
634 if ((options & USET_CASE_INSENSITIVE) != 0) {
635 (this->*caseClosure)(USET_CASE_INSENSITIVE);
636 }
637 else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
638 (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
639 }
640 if (invert) {
Frank Tang3e05d9d2021-11-08 14:04:04 -0800641 complement().removeAllStrings(); // code point complement
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000642 }
643
644 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
645 // generated pattern.
646 if (usePat) {
647 rebuiltPat.append(patLocal);
648 } else {
Frank Tang1f164ee2022-11-08 12:31:27 -0800649 _generatePattern(rebuiltPat, false);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000650 }
651 if (isBogus() && U_SUCCESS(ec)) {
652 // We likely ran out of memory. AHHH!
653 ec = U_MEMORY_ALLOCATION_ERROR;
654 }
655}
656
657//----------------------------------------------------------------
658// Property set implementation
659//----------------------------------------------------------------
660
Jungshik Shinaff99f52018-04-11 17:29:08 -0700661namespace {
662
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000663static UBool numericValueFilter(UChar32 ch, void* context) {
664 return u_getNumericValue(ch) == *(double*)context;
665}
666
Jungshik Shind13a96f2018-11-14 09:22:09 -0800667static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
668 int32_t value = *(int32_t*)context;
669 return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
670}
671
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000672static UBool versionFilter(UChar32 ch, void* context) {
673 static const UVersionInfo none = { 0, 0, 0, 0 };
674 UVersionInfo v;
675 u_charAge(ch, v);
676 UVersionInfo* version = (UVersionInfo*)context;
677 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
678}
679
Jungshik Shind13a96f2018-11-14 09:22:09 -0800680typedef struct {
681 UProperty prop;
682 int32_t value;
683} IntPropertyContext;
684
685static UBool intPropertyFilter(UChar32 ch, void* context) {
686 IntPropertyContext* c = (IntPropertyContext*)context;
687 return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
688}
689
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000690static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
691 return uscript_hasScript(ch, *(UScriptCode*)context);
692}
693
Jungshik Shinaff99f52018-04-11 17:29:08 -0700694} // namespace
695
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000696/**
697 * Generic filter-based scanning code for UCD property UnicodeSets.
698 */
699void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
700 void* context,
Jungshik Shin42d50272018-10-24 01:22:09 -0700701 const UnicodeSet* inclusions,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000702 UErrorCode &status) {
703 if (U_FAILURE(status)) return;
704
705 // Logically, walk through all Unicode characters, noting the start
706 // and end of each range for which filter.contain(c) is
707 // true. Add each range to a set.
708 //
709 // To improve performance, use an inclusions set which
710 // encodes information about character ranges that are known
711 // to have identical properties.
Jungshik Shin42d50272018-10-24 01:22:09 -0700712 // inclusions contains the first characters of
713 // same-value ranges for the given property.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000714
715 clear();
716
717 UChar32 startHasProperty = -1;
718 int32_t limitRange = inclusions->getRangeCount();
719
720 for (int j=0; j<limitRange; ++j) {
721 // get current range
722 UChar32 start = inclusions->getRangeStart(j);
723 UChar32 end = inclusions->getRangeEnd(j);
724
725 // for all the code points in the range, process
726 for (UChar32 ch = start; ch <= end; ++ch) {
727 // only add to this UnicodeSet on inflection points --
728 // where the hasProperty value changes to false
729 if ((*filter)(ch, context)) {
730 if (startHasProperty < 0) {
731 startHasProperty = ch;
732 }
733 } else if (startHasProperty >= 0) {
734 add(startHasProperty, ch-1);
735 startHasProperty = -1;
736 }
737 }
738 }
739 if (startHasProperty >= 0) {
740 add((UChar32)startHasProperty, (UChar32)0x10FFFF);
741 }
742 if (isBogus() && U_SUCCESS(status)) {
743 // We likely ran out of memory. AHHH!
744 status = U_MEMORY_ALLOCATION_ERROR;
745 }
746}
747
Jungshik Shinaff99f52018-04-11 17:29:08 -0700748namespace {
749
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000750static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
751 /* Note: we use ' ' in compiler code page */
752 int32_t j = 0;
753 char ch;
754 --dstCapacity; /* make room for term. zero */
755 while ((ch = *src++) != 0) {
756 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
757 continue;
758 }
Frank Tang1f164ee2022-11-08 12:31:27 -0800759 if (j >= dstCapacity) return false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000760 dst[j++] = ch;
761 }
762 if (j > 0 && dst[j-1] == ' ') --j;
763 dst[j] = 0;
Frank Tang1f164ee2022-11-08 12:31:27 -0800764 return true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000765}
766
Jungshik Shinaff99f52018-04-11 17:29:08 -0700767} // namespace
768
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000769//----------------------------------------------------------------
770// Property set API
771//----------------------------------------------------------------
772
Frank Tangb8696612019-10-25 14:58:21 -0700773#define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
774 ec=U_ILLEGAL_ARGUMENT_ERROR; \
775 return *this; \
776} UPRV_BLOCK_MACRO_END
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000777
778UnicodeSet&
779UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
Jungshik Shind13a96f2018-11-14 09:22:09 -0800780 if (U_FAILURE(ec) || isFrozen()) { return *this; }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000781 if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
Jungshik Shind13a96f2018-11-14 09:22:09 -0800782 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
783 applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000784 } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
Jungshik Shin42d50272018-10-24 01:22:09 -0700785 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000786 UScriptCode script = (UScriptCode)value;
Jungshik Shin42d50272018-10-24 01:22:09 -0700787 applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
788 } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
789 if (value == 0 || value == 1) {
790 const USet *set = u_getBinaryPropertySet(prop, &ec);
791 if (U_FAILURE(ec)) { return *this; }
Frank Tang1f164ee2022-11-08 12:31:27 -0800792 copyFrom(*UnicodeSet::fromUSet(set), true);
Jungshik Shin42d50272018-10-24 01:22:09 -0700793 if (value == 0) {
Frank Tang3e05d9d2021-11-08 14:04:04 -0800794 complement().removeAllStrings(); // code point complement
Jungshik Shin42d50272018-10-24 01:22:09 -0700795 }
796 } else {
797 clear();
798 }
799 } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
Jungshik Shind13a96f2018-11-14 09:22:09 -0800800 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
801 IntPropertyContext c = {prop, value};
802 applyFilter(intPropertyFilter, &c, inclusions, ec);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000803 } else {
Jungshik Shin42d50272018-10-24 01:22:09 -0700804 ec = U_ILLEGAL_ARGUMENT_ERROR;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000805 }
806 return *this;
807}
808
809UnicodeSet&
810UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
811 const UnicodeString& value,
812 UErrorCode& ec) {
813 if (U_FAILURE(ec) || isFrozen()) return *this;
814
815 // prop and value used to be converted to char * using the default
816 // converter instead of the invariant conversion.
817 // This should not be necessary because all Unicode property and value
818 // names use only invariant characters.
819 // If there are any variant characters, then we won't find them anyway.
820 // Checking first avoids assertion failures in the conversion.
821 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
822 !uprv_isInvariantUString(value.getBuffer(), value.length())
823 ) {
824 FAIL(ec);
825 }
826 CharString pname, vname;
827 pname.appendInvariantChars(prop, ec);
828 vname.appendInvariantChars(value, ec);
829 if (U_FAILURE(ec)) return *this;
830
831 UProperty p;
832 int32_t v;
Frank Tang1f164ee2022-11-08 12:31:27 -0800833 UBool invert = false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000834
835 if (value.length() > 0) {
836 p = u_getPropertyEnum(pname.data());
837 if (p == UCHAR_INVALID_CODE) FAIL(ec);
838
839 // Treat gc as gcm
840 if (p == UCHAR_GENERAL_CATEGORY) {
841 p = UCHAR_GENERAL_CATEGORY_MASK;
842 }
843
844 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
845 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
846 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
847 v = u_getPropertyValueEnum(p, vname.data());
848 if (v == UCHAR_INVALID_CODE) {
849 // Handle numeric CCC
850 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
851 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
852 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
853 char* end;
Jungshik Shin42d50272018-10-24 01:22:09 -0700854 double val = uprv_strtod(vname.data(), &end);
Jungshik Shinb3189662017-11-07 11:18:34 -0800855 // Anything between 0 and 255 is valid even if unused.
856 // Cast double->int only after range check.
857 // We catch NaN here because comparing it with both 0 and 255 will be false
858 // (as are all comparisons with NaN).
Jungshik Shin42d50272018-10-24 01:22:09 -0700859 if (*end != 0 || !(0 <= val && val <= 255) ||
860 (v = (int32_t)val) != val) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800861 // non-integral value or outside 0..255, or trailing junk
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000862 FAIL(ec);
863 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000864 } else {
865 FAIL(ec);
866 }
867 }
868 }
869
870 else {
871
872 switch (p) {
873 case UCHAR_NUMERIC_VALUE:
874 {
875 char* end;
Jungshik Shin42d50272018-10-24 01:22:09 -0700876 double val = uprv_strtod(vname.data(), &end);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000877 if (*end != 0) {
878 FAIL(ec);
879 }
Jungshik Shin42d50272018-10-24 01:22:09 -0700880 applyFilter(numericValueFilter, &val,
881 CharacterProperties::getInclusionsForProperty(p, ec), ec);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000882 return *this;
883 }
884 case UCHAR_NAME:
885 {
886 // Must munge name, since u_charFromName() does not do
887 // 'loose' matching.
888 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
889 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
890 UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
891 if (U_SUCCESS(ec)) {
892 clear();
893 add(ch);
894 return *this;
895 } else {
896 FAIL(ec);
897 }
898 }
899 case UCHAR_UNICODE_1_NAME:
900 // ICU 49 deprecates the Unicode_1_Name property APIs.
901 FAIL(ec);
902 case UCHAR_AGE:
903 {
904 // Must munge name, since u_versionFromString() does not do
905 // 'loose' matching.
906 char buf[128];
907 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
908 UVersionInfo version;
909 u_versionFromString(version, buf);
Jungshik Shin42d50272018-10-24 01:22:09 -0700910 applyFilter(versionFilter, &version,
911 CharacterProperties::getInclusionsForProperty(p, ec), ec);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000912 return *this;
913 }
914 case UCHAR_SCRIPT_EXTENSIONS:
915 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
916 if (v == UCHAR_INVALID_CODE) {
917 FAIL(ec);
918 }
919 // fall through to calling applyIntPropertyValue()
920 break;
921 default:
922 // p is a non-binary, non-enumerated property that we
923 // don't support (yet).
924 FAIL(ec);
925 }
926 }
927 }
928
929 else {
930 // value is empty. Interpret as General Category, Script, or
931 // Binary property.
932 p = UCHAR_GENERAL_CATEGORY_MASK;
933 v = u_getPropertyValueEnum(p, pname.data());
934 if (v == UCHAR_INVALID_CODE) {
935 p = UCHAR_SCRIPT;
936 v = u_getPropertyValueEnum(p, pname.data());
937 if (v == UCHAR_INVALID_CODE) {
938 p = u_getPropertyEnum(pname.data());
939 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
940 v = 1;
941 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
942 set(MIN_VALUE, MAX_VALUE);
943 return *this;
944 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
945 set(0, 0x7F);
946 return *this;
947 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
948 // [:Assigned:]=[:^Cn:]
949 p = UCHAR_GENERAL_CATEGORY_MASK;
950 v = U_GC_CN_MASK;
Frank Tang1f164ee2022-11-08 12:31:27 -0800951 invert = true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000952 } else {
953 FAIL(ec);
954 }
955 }
956 }
957 }
958
959 applyIntPropertyValue(p, v, ec);
960 if(invert) {
Frank Tang3e05d9d2021-11-08 14:04:04 -0800961 complement().removeAllStrings(); // code point complement
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000962 }
963
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000964 if (isBogus() && U_SUCCESS(ec)) {
965 // We likely ran out of memory. AHHH!
966 ec = U_MEMORY_ALLOCATION_ERROR;
967 }
968 return *this;
969}
970
//----------------------------------------------------------------
// Property set patterns
//----------------------------------------------------------------
974
975/**
976 * Return true if the given position, in the given pattern, appears
977 * to be the start of a property set pattern.
978 */
979UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
980 int32_t pos) {
981 // Patterns are at least 5 characters long
982 if ((pos+5) > pattern.length()) {
Frank Tang1f164ee2022-11-08 12:31:27 -0800983 return false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000984 }
985
986 // Look for an opening [:, [:^, \p, or \P
987 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
988}
989
990/**
991 * Return true if the given iterator appears to point at a
992 * property pattern. Regardless of the result, return with the
993 * iterator unchanged.
994 * @param chars iterator over the pattern characters. Upon return
995 * it will be unchanged.
996 * @param iterOpts RuleCharacterIterator options
997 */
998UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
999 int32_t iterOpts) {
Frank Tang1f164ee2022-11-08 12:31:27 -08001000 // NOTE: literal will always be false, because we don't parse escapes.
1001 UBool result = false, literal;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001002 UErrorCode ec = U_ZERO_ERROR;
1003 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1004 RuleCharacterIterator::Pos pos;
1005 chars.getPos(pos);
1006 UChar32 c = chars.next(iterOpts, literal, ec);
Frank Tang7e7574b2021-04-13 21:19:13 -07001007 if (c == u'[' || c == u'\\') {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001008 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1009 literal, ec);
Frank Tang7e7574b2021-04-13 21:19:13 -07001010 result = (c == u'[') ? (d == u':') :
1011 (d == u'N' || d == u'p' || d == u'P');
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001012 }
1013 chars.setPos(pos);
1014 return result && U_SUCCESS(ec);
1015}
1016
1017/**
1018 * Parse the given property pattern at the given parse position.
1019 */
1020UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1021 ParsePosition& ppos,
1022 UErrorCode &ec) {
1023 int32_t pos = ppos.getIndex();
1024
Frank Tang1f164ee2022-11-08 12:31:27 -08001025 UBool posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1026 UBool isName = false; // true for \N{pat}, o/w false
1027 UBool invert = false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001028
1029 if (U_FAILURE(ec)) return *this;
1030
1031 // Minimum length is 5 characters, e.g. \p{L}
1032 if ((pos+5) > pattern.length()) {
1033 FAIL(ec);
1034 }
1035
1036 // On entry, ppos should point to one of the following locations:
1037 // Look for an opening [:, [:^, \p, or \P
1038 if (isPOSIXOpen(pattern, pos)) {
Frank Tang1f164ee2022-11-08 12:31:27 -08001039 posix = true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001040 pos += 2;
1041 pos = ICU_Utility::skipWhitespace(pattern, pos);
Frank Tang7e7574b2021-04-13 21:19:13 -07001042 if (pos < pattern.length() && pattern.charAt(pos) == u'^') {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001043 ++pos;
Frank Tang1f164ee2022-11-08 12:31:27 -08001044 invert = true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001045 }
1046 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1047 UChar c = pattern.charAt(pos+1);
Frank Tang7e7574b2021-04-13 21:19:13 -07001048 invert = (c == u'P');
1049 isName = (c == u'N');
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001050 pos += 2;
1051 pos = ICU_Utility::skipWhitespace(pattern, pos);
Frank Tang7e7574b2021-04-13 21:19:13 -07001052 if (pos == pattern.length() || pattern.charAt(pos++) != u'{') {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001053 // Syntax error; "\p" or "\P" not followed by "{"
1054 FAIL(ec);
1055 }
1056 } else {
1057 // Open delimiter not seen
1058 FAIL(ec);
1059 }
1060
1061 // Look for the matching close delimiter, either :] or }
1062 int32_t close;
1063 if (posix) {
Frank Tang7e7574b2021-04-13 21:19:13 -07001064 close = pattern.indexOf(u":]", 2, pos);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001065 } else {
Frank Tang7e7574b2021-04-13 21:19:13 -07001066 close = pattern.indexOf(u'}', pos);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001067 }
1068 if (close < 0) {
1069 // Syntax error; close delimiter missing
1070 FAIL(ec);
1071 }
1072
1073 // Look for an '=' sign. If this is present, we will parse a
1074 // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1075 // pattern.
Frank Tang7e7574b2021-04-13 21:19:13 -07001076 int32_t equals = pattern.indexOf(u'=', pos);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001077 UnicodeString propName, valueName;
1078 if (equals >= 0 && equals < close && !isName) {
1079 // Equals seen; parse medium/long pattern
1080 pattern.extractBetween(pos, equals, propName);
1081 pattern.extractBetween(equals+1, close, valueName);
1082 }
1083
1084 else {
1085 // Handle case where no '=' is seen, and \N{}
1086 pattern.extractBetween(pos, close, propName);
1087
1088 // Handle \N{name}
1089 if (isName) {
1090 // This is a little inefficient since it means we have to
1091 // parse NAME_PROP back to UCHAR_NAME even though we already
1092 // know it's UCHAR_NAME. If we refactor the API to
1093 // support args of (UProperty, char*) then we can remove
1094 // NAME_PROP and make this a little more efficient.
1095 valueName = propName;
1096 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1097 }
1098 }
1099
1100 applyPropertyAlias(propName, valueName, ec);
1101
1102 if (U_SUCCESS(ec)) {
1103 if (invert) {
Frank Tang3e05d9d2021-11-08 14:04:04 -08001104 complement().removeAllStrings(); // code point complement
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001105 }
Frank Tang3e05d9d2021-11-08 14:04:04 -08001106
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001107 // Move to the limit position after the close delimiter if the
1108 // parse succeeded.
1109 ppos.setIndex(close + (posix ? 2 : 1));
1110 }
1111
1112 return *this;
1113}
1114
1115/**
1116 * Parse a property pattern.
1117 * @param chars iterator over the pattern characters. Upon return
1118 * it will be advanced to the first character after the parsed
1119 * pattern, or the end of the iteration if all characters are
1120 * parsed.
1121 * @param rebuiltPat the pattern that was parsed, rebuilt or
1122 * copied from the input pattern, as appropriate.
1123 */
1124void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1125 UnicodeString& rebuiltPat,
1126 UErrorCode& ec) {
1127 if (U_FAILURE(ec)) return;
1128 UnicodeString pattern;
1129 chars.lookahead(pattern);
1130 ParsePosition pos(0);
1131 applyPropertyPattern(pattern, pos, ec);
1132 if (U_FAILURE(ec)) return;
1133 if (pos.getIndex() == 0) {
1134 // syntaxError(chars, "Invalid property pattern");
1135 ec = U_MALFORMED_SET;
1136 return;
1137 }
1138 chars.jumpahead(pos.getIndex());
1139 rebuiltPat.append(pattern, 0, pos.getIndex());
1140}
1141
1142U_NAMESPACE_END