blob: 6cfd80a705b8fcb4911b94ab29c44d6c315e7653 [file] [log] [blame]
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 1999-2014, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uniset_props.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2004aug25
*   created by: Markus W. Scherer
*
*   Character property dependent functions moved here from uniset.cpp
*/
20
21#include "unicode/utypes.h"
22#include "unicode/uniset.h"
23#include "unicode/parsepos.h"
24#include "unicode/uchar.h"
25#include "unicode/uscript.h"
26#include "unicode/symtable.h"
27#include "unicode/uset.h"
28#include "unicode/locid.h"
29#include "unicode/brkiter.h"
30#include "uset_imp.h"
31#include "ruleiter.h"
32#include "cmemory.h"
33#include "ucln_cmn.h"
34#include "util.h"
35#include "uvector.h"
36#include "uprops.h"
37#include "propname.h"
38#include "normalizer2impl.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000039#include "uinvchar.h"
40#include "uprops.h"
41#include "charstr.h"
42#include "cstring.h"
43#include "mutex.h"
44#include "umutex.h"
45#include "uassert.h"
46#include "hash.h"
47
48U_NAMESPACE_USE
49
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000050// initial storage. Must be >= 0
51// *** same as in uniset.cpp ! ***
52#define START_EXTRA 16
53
54// Define UChar constants using hex for EBCDIC compatibility
55// Used #define to reduce private static exports and memory access time.
56#define SET_OPEN ((UChar)0x005B) /*[*/
57#define SET_CLOSE ((UChar)0x005D) /*]*/
58#define HYPHEN ((UChar)0x002D) /*-*/
59#define COMPLEMENT ((UChar)0x005E) /*^*/
60#define COLON ((UChar)0x003A) /*:*/
61#define BACKSLASH ((UChar)0x005C) /*\*/
62#define INTERSECTION ((UChar)0x0026) /*&*/
63#define UPPER_U ((UChar)0x0055) /*U*/
64#define LOWER_U ((UChar)0x0075) /*u*/
65#define OPEN_BRACE ((UChar)123) /*{*/
66#define CLOSE_BRACE ((UChar)125) /*}*/
67#define UPPER_P ((UChar)0x0050) /*P*/
68#define LOWER_P ((UChar)0x0070) /*p*/
69#define UPPER_N ((UChar)78) /*N*/
70#define EQUALS ((UChar)0x003D) /*=*/
71
72//static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:"
73static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]"
74//static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p"
75//static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}"
76//static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N"
77static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
78
79// Special property set IDs
80static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
81static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
82static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
83
84// Unicode name property alias
85#define NAME_PROP "na"
86#define NAME_PROP_LENGTH 2
87
88/**
89 * Delimiter string used in patterns to close a category reference:
90 * ":]". Example: "[:Lu:]".
91 */
92//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
93
94// Cached sets ------------------------------------------------------------- ***
95
96U_CDECL_BEGIN
97static UBool U_CALLCONV uset_cleanup();
98
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000099static UnicodeSet *uni32Singleton;
100static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER;
101
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000102/**
103 * Cleanup function for UnicodeSet
104 */
105static UBool U_CALLCONV uset_cleanup(void) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000106 delete uni32Singleton;
107 uni32Singleton = NULL;
108 uni32InitOnce.reset();
109 return TRUE;
110}
111
112U_CDECL_END
113
114U_NAMESPACE_BEGIN
115
Jungshik Shinaff99f52018-04-11 17:29:08 -0700116namespace {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000117
118// Cache some sets for other services -------------------------------------- ***
119void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
120 U_ASSERT(uni32Singleton == NULL);
121 uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
122 if(uni32Singleton==NULL) {
123 errorCode=U_MEMORY_ALLOCATION_ERROR;
124 } else {
125 uni32Singleton->freeze();
126 }
127 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
128}
129
130
131U_CFUNC UnicodeSet *
132uniset_getUnicode32Instance(UErrorCode &errorCode) {
133 umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
134 return uni32Singleton;
135}
136
137// helper functions for matching of pattern syntax pieces ------------------ ***
138// these functions are parallel to the PERL_OPEN etc. strings above
139
140// using these functions is not only faster than UnicodeString::compare() and
141// caseCompare(), but they also make UnicodeSet work for simple patterns when
142// no Unicode properties data is available - when caseCompare() fails
143
144static inline UBool
145isPerlOpen(const UnicodeString &pattern, int32_t pos) {
146 UChar c;
147 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
148}
149
150/*static inline UBool
151isPerlClose(const UnicodeString &pattern, int32_t pos) {
152 return pattern.charAt(pos)==CLOSE_BRACE;
153}*/
154
155static inline UBool
156isNameOpen(const UnicodeString &pattern, int32_t pos) {
157 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
158}
159
160static inline UBool
161isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
162 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
163}
164
165/*static inline UBool
166isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
167 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
168}*/
169
170// TODO memory debugging provided inside uniset.cpp
171// could be made available here but probably obsolete with use of modern
172// memory leak checker tools
173#define _dbgct(me)
174
Jungshik Shinaff99f52018-04-11 17:29:08 -0700175} // namespace
176
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000177//----------------------------------------------------------------
178// Constructors &c
179//----------------------------------------------------------------
180
181/**
182 * Constructs a set from the given pattern, optionally ignoring
183 * white space. See the class description for the syntax of the
184 * pattern language.
185 * @param pattern a string specifying what characters are in the set
186 */
187UnicodeSet::UnicodeSet(const UnicodeString& pattern,
188 UErrorCode& status) :
189 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
190 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
191 fFlags(0)
192{
193 if(U_SUCCESS(status)){
194 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
195 /* test for NULL */
196 if(list == NULL) {
197 status = U_MEMORY_ALLOCATION_ERROR;
198 }else{
199 allocateStrings(status);
200 applyPattern(pattern, status);
201 }
202 }
203 _dbgct(this);
204}
205
206//----------------------------------------------------------------
207// Public API
208//----------------------------------------------------------------
209
210UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
211 UErrorCode& status) {
212 // Equivalent to
213 // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
214 // but without dependency on closeOver().
215 ParsePosition pos(0);
216 applyPatternIgnoreSpace(pattern, pos, NULL, status);
217 if (U_FAILURE(status)) return *this;
218
219 int32_t i = pos.getIndex();
220 // Skip over trailing whitespace
221 ICU_Utility::skipWhitespace(pattern, i, TRUE);
222 if (i != pattern.length()) {
223 status = U_ILLEGAL_ARGUMENT_ERROR;
224 }
225 return *this;
226}
227
228void
229UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
230 ParsePosition& pos,
231 const SymbolTable* symbols,
232 UErrorCode& status) {
233 if (U_FAILURE(status)) {
234 return;
235 }
236 if (isFrozen()) {
237 status = U_NO_WRITE_PERMISSION;
238 return;
239 }
240 // Need to build the pattern in a temporary string because
241 // _applyPattern calls add() etc., which set pat to empty.
242 UnicodeString rebuiltPat;
243 RuleCharacterIterator chars(pattern, symbols, pos);
Jungshik Shinaff99f52018-04-11 17:29:08 -0700244 applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, 0, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000245 if (U_FAILURE(status)) return;
246 if (chars.inVariable()) {
247 // syntaxError(chars, "Extra chars in variable value");
248 status = U_MALFORMED_SET;
249 return;
250 }
251 setPattern(rebuiltPat);
252}
253
254/**
255 * Return true if the given position, in the given pattern, appears
256 * to be the start of a UnicodeSet pattern.
257 */
258UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
259 return ((pos+1) < pattern.length() &&
260 pattern.charAt(pos) == (UChar)91/*[*/) ||
261 resemblesPropertyPattern(pattern, pos);
262}
263
264//----------------------------------------------------------------
265// Implementation: Pattern parsing
266//----------------------------------------------------------------
267
Jungshik Shinaff99f52018-04-11 17:29:08 -0700268namespace {
269
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000270/**
271 * A small all-inline class to manage a UnicodeSet pointer. Add
272 * operator->() etc. as needed.
273 */
274class UnicodeSetPointer {
275 UnicodeSet* p;
276public:
277 inline UnicodeSetPointer() : p(0) {}
278 inline ~UnicodeSetPointer() { delete p; }
279 inline UnicodeSet* pointer() { return p; }
280 inline UBool allocate() {
281 if (p == 0) {
282 p = new UnicodeSet();
283 }
284 return p != 0;
285 }
286};
287
Jungshik Shinaff99f52018-04-11 17:29:08 -0700288constexpr int32_t MAX_DEPTH = 100;
289
290} // namespace
291
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000292/**
293 * Parse the pattern from the given RuleCharacterIterator. The
294 * iterator is advanced over the parsed pattern.
295 * @param chars iterator over the pattern characters. Upon return
296 * it will be advanced to the first character after the parsed
297 * pattern, or the end of the iteration if all characters are
298 * parsed.
299 * @param symbols symbol table to use to parse and dereference
300 * variables, or null if none.
301 * @param rebuiltPat the pattern that was parsed, rebuilt or
302 * copied from the input pattern, as appropriate.
303 * @param options a bit mask of zero or more of the following:
304 * IGNORE_SPACE, CASE.
305 */
306void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
307 const SymbolTable* symbols,
308 UnicodeString& rebuiltPat,
309 uint32_t options,
310 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
Jungshik Shinaff99f52018-04-11 17:29:08 -0700311 int32_t depth,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000312 UErrorCode& ec) {
313 if (U_FAILURE(ec)) return;
Jungshik Shinaff99f52018-04-11 17:29:08 -0700314 if (depth > MAX_DEPTH) {
315 ec = U_ILLEGAL_ARGUMENT_ERROR;
316 return;
317 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000318
319 // Syntax characters: [ ] ^ - & { }
320
321 // Recognized special forms for chars, sets: c-c s-s s&s
322
323 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
324 RuleCharacterIterator::PARSE_ESCAPES;
325 if ((options & USET_IGNORE_SPACE) != 0) {
326 opts |= RuleCharacterIterator::SKIP_WHITESPACE;
327 }
328
329 UnicodeString patLocal, buf;
330 UBool usePat = FALSE;
331 UnicodeSetPointer scratch;
332 RuleCharacterIterator::Pos backup;
333
334 // mode: 0=before [, 1=between [...], 2=after ]
335 // lastItem: 0=none, 1=char, 2=set
336 int8_t lastItem = 0, mode = 0;
337 UChar32 lastChar = 0;
338 UChar op = 0;
339
340 UBool invert = FALSE;
341
342 clear();
343
344 while (mode != 2 && !chars.atEnd()) {
345 U_ASSERT((lastItem == 0 && op == 0) ||
346 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
347 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
348 op == INTERSECTION /*'&'*/)));
349
350 UChar32 c = 0;
351 UBool literal = FALSE;
352 UnicodeSet* nested = 0; // alias - do not delete
353
354 // -------- Check for property pattern
355
356 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
357 int8_t setMode = 0;
358 if (resemblesPropertyPattern(chars, opts)) {
359 setMode = 2;
360 }
361
362 // -------- Parse '[' of opening delimiter OR nested set.
363 // If there is a nested set, use `setMode' to define how
364 // the set should be parsed. If the '[' is part of the
365 // opening delimiter for this pattern, parse special
366 // strings "[", "[^", "[-", and "[^-". Check for stand-in
367 // characters representing a nested set in the symbol
368 // table.
369
370 else {
371 // Prepare to backup if necessary
372 chars.getPos(backup);
373 c = chars.next(opts, literal, ec);
374 if (U_FAILURE(ec)) return;
375
376 if (c == 0x5B /*'['*/ && !literal) {
377 if (mode == 1) {
378 chars.setPos(backup); // backup
379 setMode = 1;
380 } else {
381 // Handle opening '[' delimiter
382 mode = 1;
383 patLocal.append((UChar) 0x5B /*'['*/);
384 chars.getPos(backup); // prepare to backup
385 c = chars.next(opts, literal, ec);
386 if (U_FAILURE(ec)) return;
387 if (c == 0x5E /*'^'*/ && !literal) {
388 invert = TRUE;
389 patLocal.append((UChar) 0x5E /*'^'*/);
390 chars.getPos(backup); // prepare to backup
391 c = chars.next(opts, literal, ec);
392 if (U_FAILURE(ec)) return;
393 }
394 // Fall through to handle special leading '-';
395 // otherwise restart loop for nested [], \p{}, etc.
396 if (c == HYPHEN /*'-'*/) {
397 literal = TRUE;
398 // Fall through to handle literal '-' below
399 } else {
400 chars.setPos(backup); // backup
401 continue;
402 }
403 }
404 } else if (symbols != 0) {
405 const UnicodeFunctor *m = symbols->lookupMatcher(c);
406 if (m != 0) {
407 const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
408 if (ms == NULL) {
409 ec = U_MALFORMED_SET;
410 return;
411 }
412 // casting away const, but `nested' won't be modified
413 // (important not to modify stored set)
414 nested = const_cast<UnicodeSet*>(ms);
415 setMode = 3;
416 }
417 }
418 }
419
420 // -------- Handle a nested set. This either is inline in
421 // the pattern or represented by a stand-in that has
422 // previously been parsed and was looked up in the symbol
423 // table.
424
425 if (setMode != 0) {
426 if (lastItem == 1) {
427 if (op != 0) {
428 // syntaxError(chars, "Char expected after operator");
429 ec = U_MALFORMED_SET;
430 return;
431 }
432 add(lastChar, lastChar);
433 _appendToPat(patLocal, lastChar, FALSE);
434 lastItem = 0;
435 op = 0;
436 }
437
438 if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
439 patLocal.append(op);
440 }
441
442 if (nested == 0) {
443 // lazy allocation
444 if (!scratch.allocate()) {
445 ec = U_MEMORY_ALLOCATION_ERROR;
446 return;
447 }
448 nested = scratch.pointer();
449 }
450 switch (setMode) {
451 case 1:
Jungshik Shinaff99f52018-04-11 17:29:08 -0700452 nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000453 break;
454 case 2:
455 chars.skipIgnored(opts);
456 nested->applyPropertyPattern(chars, patLocal, ec);
457 if (U_FAILURE(ec)) return;
458 break;
459 case 3: // `nested' already parsed
460 nested->_toPattern(patLocal, FALSE);
461 break;
462 }
463
464 usePat = TRUE;
465
466 if (mode == 0) {
467 // Entire pattern is a category; leave parse loop
468 *this = *nested;
469 mode = 2;
470 break;
471 }
472
473 switch (op) {
474 case HYPHEN: /*'-'*/
475 removeAll(*nested);
476 break;
477 case INTERSECTION: /*'&'*/
478 retainAll(*nested);
479 break;
480 case 0:
481 addAll(*nested);
482 break;
483 }
484
485 op = 0;
486 lastItem = 2;
487
488 continue;
489 }
490
491 if (mode == 0) {
492 // syntaxError(chars, "Missing '['");
493 ec = U_MALFORMED_SET;
494 return;
495 }
496
497 // -------- Parse special (syntax) characters. If the
498 // current character is not special, or if it is escaped,
499 // then fall through and handle it below.
500
501 if (!literal) {
502 switch (c) {
503 case 0x5D /*']'*/:
504 if (lastItem == 1) {
505 add(lastChar, lastChar);
506 _appendToPat(patLocal, lastChar, FALSE);
507 }
508 // Treat final trailing '-' as a literal
509 if (op == HYPHEN /*'-'*/) {
510 add(op, op);
511 patLocal.append(op);
512 } else if (op == INTERSECTION /*'&'*/) {
513 // syntaxError(chars, "Trailing '&'");
514 ec = U_MALFORMED_SET;
515 return;
516 }
517 patLocal.append((UChar) 0x5D /*']'*/);
518 mode = 2;
519 continue;
520 case HYPHEN /*'-'*/:
521 if (op == 0) {
522 if (lastItem != 0) {
523 op = (UChar) c;
524 continue;
525 } else {
526 // Treat final trailing '-' as a literal
527 add(c, c);
528 c = chars.next(opts, literal, ec);
529 if (U_FAILURE(ec)) return;
530 if (c == 0x5D /*']'*/ && !literal) {
531 patLocal.append(HYPHEN_RIGHT_BRACE, 2);
532 mode = 2;
533 continue;
534 }
535 }
536 }
537 // syntaxError(chars, "'-' not after char or set");
538 ec = U_MALFORMED_SET;
539 return;
540 case INTERSECTION /*'&'*/:
541 if (lastItem == 2 && op == 0) {
542 op = (UChar) c;
543 continue;
544 }
545 // syntaxError(chars, "'&' not after set");
546 ec = U_MALFORMED_SET;
547 return;
548 case 0x5E /*'^'*/:
549 // syntaxError(chars, "'^' not after '['");
550 ec = U_MALFORMED_SET;
551 return;
552 case 0x7B /*'{'*/:
553 if (op != 0) {
554 // syntaxError(chars, "Missing operand after operator");
555 ec = U_MALFORMED_SET;
556 return;
557 }
558 if (lastItem == 1) {
559 add(lastChar, lastChar);
560 _appendToPat(patLocal, lastChar, FALSE);
561 }
562 lastItem = 0;
563 buf.truncate(0);
564 {
565 UBool ok = FALSE;
566 while (!chars.atEnd()) {
567 c = chars.next(opts, literal, ec);
568 if (U_FAILURE(ec)) return;
569 if (c == 0x7D /*'}'*/ && !literal) {
570 ok = TRUE;
571 break;
572 }
573 buf.append(c);
574 }
575 if (buf.length() < 1 || !ok) {
576 // syntaxError(chars, "Invalid multicharacter string");
577 ec = U_MALFORMED_SET;
578 return;
579 }
580 }
581 // We have new string. Add it to set and continue;
582 // we don't need to drop through to the further
583 // processing
584 add(buf);
585 patLocal.append((UChar) 0x7B /*'{'*/);
586 _appendToPat(patLocal, buf, FALSE);
587 patLocal.append((UChar) 0x7D /*'}'*/);
588 continue;
589 case SymbolTable::SYMBOL_REF:
590 // symbols nosymbols
591 // [a-$] error error (ambiguous)
592 // [a$] anchor anchor
593 // [a-$x] var "x"* literal '$'
594 // [a-$.] error literal '$'
595 // *We won't get here in the case of var "x"
596 {
597 chars.getPos(backup);
598 c = chars.next(opts, literal, ec);
599 if (U_FAILURE(ec)) return;
600 UBool anchor = (c == 0x5D /*']'*/ && !literal);
601 if (symbols == 0 && !anchor) {
602 c = SymbolTable::SYMBOL_REF;
603 chars.setPos(backup);
604 break; // literal '$'
605 }
606 if (anchor && op == 0) {
607 if (lastItem == 1) {
608 add(lastChar, lastChar);
609 _appendToPat(patLocal, lastChar, FALSE);
610 }
611 add(U_ETHER);
612 usePat = TRUE;
613 patLocal.append((UChar) SymbolTable::SYMBOL_REF);
614 patLocal.append((UChar) 0x5D /*']'*/);
615 mode = 2;
616 continue;
617 }
618 // syntaxError(chars, "Unquoted '$'");
619 ec = U_MALFORMED_SET;
620 return;
621 }
622 default:
623 break;
624 }
625 }
626
627 // -------- Parse literal characters. This includes both
628 // escaped chars ("\u4E01") and non-syntax characters
629 // ("a").
630
631 switch (lastItem) {
632 case 0:
633 lastItem = 1;
634 lastChar = c;
635 break;
636 case 1:
637 if (op == HYPHEN /*'-'*/) {
638 if (lastChar >= c) {
639 // Don't allow redundant (a-a) or empty (b-a) ranges;
640 // these are most likely typos.
641 // syntaxError(chars, "Invalid range");
642 ec = U_MALFORMED_SET;
643 return;
644 }
645 add(lastChar, c);
646 _appendToPat(patLocal, lastChar, FALSE);
647 patLocal.append(op);
648 _appendToPat(patLocal, c, FALSE);
649 lastItem = 0;
650 op = 0;
651 } else {
652 add(lastChar, lastChar);
653 _appendToPat(patLocal, lastChar, FALSE);
654 lastChar = c;
655 }
656 break;
657 case 2:
658 if (op != 0) {
659 // syntaxError(chars, "Set expected after operator");
660 ec = U_MALFORMED_SET;
661 return;
662 }
663 lastChar = c;
664 lastItem = 1;
665 break;
666 }
667 }
668
669 if (mode != 2) {
670 // syntaxError(chars, "Missing ']'");
671 ec = U_MALFORMED_SET;
672 return;
673 }
674
675 chars.skipIgnored(opts);
676
677 /**
678 * Handle global flags (invert, case insensitivity). If this
679 * pattern should be compiled case-insensitive, then we need
680 * to close over case BEFORE COMPLEMENTING. This makes
681 * patterns like /[^abc]/i work.
682 */
683 if ((options & USET_CASE_INSENSITIVE) != 0) {
684 (this->*caseClosure)(USET_CASE_INSENSITIVE);
685 }
686 else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
687 (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
688 }
689 if (invert) {
690 complement();
691 }
692
693 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
694 // generated pattern.
695 if (usePat) {
696 rebuiltPat.append(patLocal);
697 } else {
698 _generatePattern(rebuiltPat, FALSE);
699 }
700 if (isBogus() && U_SUCCESS(ec)) {
701 // We likely ran out of memory. AHHH!
702 ec = U_MEMORY_ALLOCATION_ERROR;
703 }
704}
705
706//----------------------------------------------------------------
707// Property set implementation
708//----------------------------------------------------------------
709
Jungshik Shinaff99f52018-04-11 17:29:08 -0700710namespace {
711
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000712static UBool numericValueFilter(UChar32 ch, void* context) {
713 return u_getNumericValue(ch) == *(double*)context;
714}
715
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000716static UBool versionFilter(UChar32 ch, void* context) {
717 static const UVersionInfo none = { 0, 0, 0, 0 };
718 UVersionInfo v;
719 u_charAge(ch, v);
720 UVersionInfo* version = (UVersionInfo*)context;
721 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
722}
723
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000724static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
725 return uscript_hasScript(ch, *(UScriptCode*)context);
726}
727
Jungshik Shinaff99f52018-04-11 17:29:08 -0700728} // namespace
729
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000730/**
731 * Generic filter-based scanning code for UCD property UnicodeSets.
732 */
733void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
734 void* context,
Jungshik Shin42d50272018-10-24 01:22:09 -0700735 const UnicodeSet* inclusions,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000736 UErrorCode &status) {
737 if (U_FAILURE(status)) return;
738
739 // Logically, walk through all Unicode characters, noting the start
740 // and end of each range for which filter.contain(c) is
741 // true. Add each range to a set.
742 //
743 // To improve performance, use an inclusions set which
744 // encodes information about character ranges that are known
745 // to have identical properties.
Jungshik Shin42d50272018-10-24 01:22:09 -0700746 // inclusions contains the first characters of
747 // same-value ranges for the given property.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000748
749 clear();
750
751 UChar32 startHasProperty = -1;
752 int32_t limitRange = inclusions->getRangeCount();
753
754 for (int j=0; j<limitRange; ++j) {
755 // get current range
756 UChar32 start = inclusions->getRangeStart(j);
757 UChar32 end = inclusions->getRangeEnd(j);
758
759 // for all the code points in the range, process
760 for (UChar32 ch = start; ch <= end; ++ch) {
761 // only add to this UnicodeSet on inflection points --
762 // where the hasProperty value changes to false
763 if ((*filter)(ch, context)) {
764 if (startHasProperty < 0) {
765 startHasProperty = ch;
766 }
767 } else if (startHasProperty >= 0) {
768 add(startHasProperty, ch-1);
769 startHasProperty = -1;
770 }
771 }
772 }
773 if (startHasProperty >= 0) {
774 add((UChar32)startHasProperty, (UChar32)0x10FFFF);
775 }
776 if (isBogus() && U_SUCCESS(status)) {
777 // We likely ran out of memory. AHHH!
778 status = U_MEMORY_ALLOCATION_ERROR;
779 }
780}
781
Jungshik Shinaff99f52018-04-11 17:29:08 -0700782namespace {
783
Jungshik Shin42d50272018-10-24 01:22:09 -0700784/** Maps map values to 1 if the mask contains their value'th bit, all others to 0. */
785uint32_t U_CALLCONV generalCategoryMaskFilter(const void *context, uint32_t value) {
786 uint32_t mask = *(const uint32_t *)context;
787 value = U_MASK(value) & mask;
788 if (value != 0) { value = 1; }
789 return value;
790}
791
792/** Maps one map value to 1, all others to 0. */
793uint32_t U_CALLCONV intValueFilter(const void *context, uint32_t value) {
794 uint32_t v = *(const uint32_t *)context;
795 return value == v ? 1 : 0;
796}
797
798} // namespace
799
800void UnicodeSet::applyIntPropertyValue(const UCPMap *map,
801 UCPMapValueFilter *filter, const void *context,
802 UErrorCode &errorCode) {
803 if (U_FAILURE(errorCode)) { return; }
804 clear();
805 UChar32 start = 0, end;
806 uint32_t value;
807 while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
808 filter, context, &value)) >= 0) {
809 if (value != 0) {
810 add(start, end);
811 }
812 start = end + 1;
813 }
814 if (isBogus()) {
815 errorCode = U_MEMORY_ALLOCATION_ERROR;
816 }
817}
818
819namespace {
820
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000821static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
822 /* Note: we use ' ' in compiler code page */
823 int32_t j = 0;
824 char ch;
825 --dstCapacity; /* make room for term. zero */
826 while ((ch = *src++) != 0) {
827 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
828 continue;
829 }
830 if (j >= dstCapacity) return FALSE;
831 dst[j++] = ch;
832 }
833 if (j > 0 && dst[j-1] == ' ') --j;
834 dst[j] = 0;
835 return TRUE;
836}
837
Jungshik Shinaff99f52018-04-11 17:29:08 -0700838} // namespace
839
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000840//----------------------------------------------------------------
841// Property set API
842//----------------------------------------------------------------
843
844#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}
845
846UnicodeSet&
847UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
Jungshik Shin42d50272018-10-24 01:22:09 -0700848 if (U_FAILURE(ec)) { return *this; }
849 // All of the following check isFrozen() before modifying this set.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000850 if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
Jungshik Shin42d50272018-10-24 01:22:09 -0700851 const UCPMap *map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &ec);
852 applyIntPropertyValue(map, generalCategoryMaskFilter, &value, ec);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000853 } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
Jungshik Shin42d50272018-10-24 01:22:09 -0700854 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000855 UScriptCode script = (UScriptCode)value;
Jungshik Shin42d50272018-10-24 01:22:09 -0700856 applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
857 } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
858 if (value == 0 || value == 1) {
859 const USet *set = u_getBinaryPropertySet(prop, &ec);
860 if (U_FAILURE(ec)) { return *this; }
861 copyFrom(*UnicodeSet::fromUSet(set), TRUE);
862 if (value == 0) {
863 complement();
864 }
865 } else {
866 clear();
867 }
868 } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
869 const UCPMap *map = u_getIntPropertyMap(prop, &ec);
870 applyIntPropertyValue(map, intValueFilter, &value, ec);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000871 } else {
Jungshik Shin42d50272018-10-24 01:22:09 -0700872 // This code used to always call getInclusions(property source)
873 // which sets an error for an unsupported property.
874 ec = U_ILLEGAL_ARGUMENT_ERROR;
875 // Otherwise we would just clear() this set because
876 // getIntPropertyValue(c, prop) returns 0 for all code points.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000877 }
878 return *this;
879}
880
881UnicodeSet&
882UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
883 const UnicodeString& value,
884 UErrorCode& ec) {
885 if (U_FAILURE(ec) || isFrozen()) return *this;
886
887 // prop and value used to be converted to char * using the default
888 // converter instead of the invariant conversion.
889 // This should not be necessary because all Unicode property and value
890 // names use only invariant characters.
891 // If there are any variant characters, then we won't find them anyway.
892 // Checking first avoids assertion failures in the conversion.
893 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
894 !uprv_isInvariantUString(value.getBuffer(), value.length())
895 ) {
896 FAIL(ec);
897 }
898 CharString pname, vname;
899 pname.appendInvariantChars(prop, ec);
900 vname.appendInvariantChars(value, ec);
901 if (U_FAILURE(ec)) return *this;
902
903 UProperty p;
904 int32_t v;
Jungshik Shinb3189662017-11-07 11:18:34 -0800905 UBool invert = FALSE;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000906
907 if (value.length() > 0) {
908 p = u_getPropertyEnum(pname.data());
909 if (p == UCHAR_INVALID_CODE) FAIL(ec);
910
911 // Treat gc as gcm
912 if (p == UCHAR_GENERAL_CATEGORY) {
913 p = UCHAR_GENERAL_CATEGORY_MASK;
914 }
915
916 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
917 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
918 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
919 v = u_getPropertyValueEnum(p, vname.data());
920 if (v == UCHAR_INVALID_CODE) {
921 // Handle numeric CCC
922 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
923 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
924 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
925 char* end;
Jungshik Shin42d50272018-10-24 01:22:09 -0700926 double val = uprv_strtod(vname.data(), &end);
Jungshik Shinb3189662017-11-07 11:18:34 -0800927 // Anything between 0 and 255 is valid even if unused.
928 // Cast double->int only after range check.
929 // We catch NaN here because comparing it with both 0 and 255 will be false
930 // (as are all comparisons with NaN).
Jungshik Shin42d50272018-10-24 01:22:09 -0700931 if (*end != 0 || !(0 <= val && val <= 255) ||
932 (v = (int32_t)val) != val) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800933 // non-integral value or outside 0..255, or trailing junk
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000934 FAIL(ec);
935 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000936 } else {
937 FAIL(ec);
938 }
939 }
940 }
941
942 else {
943
944 switch (p) {
945 case UCHAR_NUMERIC_VALUE:
946 {
947 char* end;
Jungshik Shin42d50272018-10-24 01:22:09 -0700948 double val = uprv_strtod(vname.data(), &end);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000949 if (*end != 0) {
950 FAIL(ec);
951 }
Jungshik Shin42d50272018-10-24 01:22:09 -0700952 applyFilter(numericValueFilter, &val,
953 CharacterProperties::getInclusionsForProperty(p, ec), ec);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000954 return *this;
955 }
956 case UCHAR_NAME:
957 {
958 // Must munge name, since u_charFromName() does not do
959 // 'loose' matching.
960 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
961 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
962 UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
963 if (U_SUCCESS(ec)) {
964 clear();
965 add(ch);
966 return *this;
967 } else {
968 FAIL(ec);
969 }
970 }
971 case UCHAR_UNICODE_1_NAME:
972 // ICU 49 deprecates the Unicode_1_Name property APIs.
973 FAIL(ec);
974 case UCHAR_AGE:
975 {
976 // Must munge name, since u_versionFromString() does not do
977 // 'loose' matching.
978 char buf[128];
979 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
980 UVersionInfo version;
981 u_versionFromString(version, buf);
Jungshik Shin42d50272018-10-24 01:22:09 -0700982 applyFilter(versionFilter, &version,
983 CharacterProperties::getInclusionsForProperty(p, ec), ec);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000984 return *this;
985 }
986 case UCHAR_SCRIPT_EXTENSIONS:
987 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
988 if (v == UCHAR_INVALID_CODE) {
989 FAIL(ec);
990 }
991 // fall through to calling applyIntPropertyValue()
992 break;
993 default:
994 // p is a non-binary, non-enumerated property that we
995 // don't support (yet).
996 FAIL(ec);
997 }
998 }
999 }
1000
1001 else {
1002 // value is empty. Interpret as General Category, Script, or
1003 // Binary property.
1004 p = UCHAR_GENERAL_CATEGORY_MASK;
1005 v = u_getPropertyValueEnum(p, pname.data());
1006 if (v == UCHAR_INVALID_CODE) {
1007 p = UCHAR_SCRIPT;
1008 v = u_getPropertyValueEnum(p, pname.data());
1009 if (v == UCHAR_INVALID_CODE) {
1010 p = u_getPropertyEnum(pname.data());
1011 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
1012 v = 1;
1013 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
1014 set(MIN_VALUE, MAX_VALUE);
1015 return *this;
1016 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
1017 set(0, 0x7F);
1018 return *this;
1019 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
1020 // [:Assigned:]=[:^Cn:]
1021 p = UCHAR_GENERAL_CATEGORY_MASK;
1022 v = U_GC_CN_MASK;
1023 invert = TRUE;
1024 } else {
1025 FAIL(ec);
1026 }
1027 }
1028 }
1029 }
1030
1031 applyIntPropertyValue(p, v, ec);
1032 if(invert) {
1033 complement();
1034 }
1035
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001036 if (isBogus() && U_SUCCESS(ec)) {
1037 // We likely ran out of memory. AHHH!
1038 ec = U_MEMORY_ALLOCATION_ERROR;
1039 }
1040 return *this;
1041}
1042
1043//----------------------------------------------------------------
1044// Property set patterns
1045//----------------------------------------------------------------
1046
1047/**
1048 * Return true if the given position, in the given pattern, appears
1049 * to be the start of a property set pattern.
1050 */
1051UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
1052 int32_t pos) {
1053 // Patterns are at least 5 characters long
1054 if ((pos+5) > pattern.length()) {
1055 return FALSE;
1056 }
1057
1058 // Look for an opening [:, [:^, \p, or \P
1059 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
1060}
1061
1062/**
1063 * Return true if the given iterator appears to point at a
1064 * property pattern. Regardless of the result, return with the
1065 * iterator unchanged.
1066 * @param chars iterator over the pattern characters. Upon return
1067 * it will be unchanged.
1068 * @param iterOpts RuleCharacterIterator options
1069 */
1070UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
1071 int32_t iterOpts) {
1072 // NOTE: literal will always be FALSE, because we don't parse escapes.
1073 UBool result = FALSE, literal;
1074 UErrorCode ec = U_ZERO_ERROR;
1075 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1076 RuleCharacterIterator::Pos pos;
1077 chars.getPos(pos);
1078 UChar32 c = chars.next(iterOpts, literal, ec);
1079 if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
1080 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1081 literal, ec);
1082 result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
1083 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
1084 }
1085 chars.setPos(pos);
1086 return result && U_SUCCESS(ec);
1087}
1088
1089/**
1090 * Parse the given property pattern at the given parse position.
1091 */
1092UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1093 ParsePosition& ppos,
1094 UErrorCode &ec) {
1095 int32_t pos = ppos.getIndex();
1096
1097 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1098 UBool isName = FALSE; // true for \N{pat}, o/w false
1099 UBool invert = FALSE;
1100
1101 if (U_FAILURE(ec)) return *this;
1102
1103 // Minimum length is 5 characters, e.g. \p{L}
1104 if ((pos+5) > pattern.length()) {
1105 FAIL(ec);
1106 }
1107
1108 // On entry, ppos should point to one of the following locations:
1109 // Look for an opening [:, [:^, \p, or \P
1110 if (isPOSIXOpen(pattern, pos)) {
1111 posix = TRUE;
1112 pos += 2;
1113 pos = ICU_Utility::skipWhitespace(pattern, pos);
1114 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
1115 ++pos;
1116 invert = TRUE;
1117 }
1118 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1119 UChar c = pattern.charAt(pos+1);
1120 invert = (c == UPPER_P);
1121 isName = (c == UPPER_N);
1122 pos += 2;
1123 pos = ICU_Utility::skipWhitespace(pattern, pos);
1124 if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
1125 // Syntax error; "\p" or "\P" not followed by "{"
1126 FAIL(ec);
1127 }
1128 } else {
1129 // Open delimiter not seen
1130 FAIL(ec);
1131 }
1132
1133 // Look for the matching close delimiter, either :] or }
1134 int32_t close;
1135 if (posix) {
1136 close = pattern.indexOf(POSIX_CLOSE, 2, pos);
1137 } else {
1138 close = pattern.indexOf(CLOSE_BRACE, pos);
1139 }
1140 if (close < 0) {
1141 // Syntax error; close delimiter missing
1142 FAIL(ec);
1143 }
1144
1145 // Look for an '=' sign. If this is present, we will parse a
1146 // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1147 // pattern.
1148 int32_t equals = pattern.indexOf(EQUALS, pos);
1149 UnicodeString propName, valueName;
1150 if (equals >= 0 && equals < close && !isName) {
1151 // Equals seen; parse medium/long pattern
1152 pattern.extractBetween(pos, equals, propName);
1153 pattern.extractBetween(equals+1, close, valueName);
1154 }
1155
1156 else {
1157 // Handle case where no '=' is seen, and \N{}
1158 pattern.extractBetween(pos, close, propName);
1159
1160 // Handle \N{name}
1161 if (isName) {
1162 // This is a little inefficient since it means we have to
1163 // parse NAME_PROP back to UCHAR_NAME even though we already
1164 // know it's UCHAR_NAME. If we refactor the API to
1165 // support args of (UProperty, char*) then we can remove
1166 // NAME_PROP and make this a little more efficient.
1167 valueName = propName;
1168 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1169 }
1170 }
1171
1172 applyPropertyAlias(propName, valueName, ec);
1173
1174 if (U_SUCCESS(ec)) {
1175 if (invert) {
1176 complement();
1177 }
1178
1179 // Move to the limit position after the close delimiter if the
1180 // parse succeeded.
1181 ppos.setIndex(close + (posix ? 2 : 1));
1182 }
1183
1184 return *this;
1185}
1186
1187/**
1188 * Parse a property pattern.
1189 * @param chars iterator over the pattern characters. Upon return
1190 * it will be advanced to the first character after the parsed
1191 * pattern, or the end of the iteration if all characters are
1192 * parsed.
1193 * @param rebuiltPat the pattern that was parsed, rebuilt or
1194 * copied from the input pattern, as appropriate.
1195 */
1196void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1197 UnicodeString& rebuiltPat,
1198 UErrorCode& ec) {
1199 if (U_FAILURE(ec)) return;
1200 UnicodeString pattern;
1201 chars.lookahead(pattern);
1202 ParsePosition pos(0);
1203 applyPropertyPattern(pattern, pos, ec);
1204 if (U_FAILURE(ec)) return;
1205 if (pos.getIndex() == 0) {
1206 // syntaxError(chars, "Invalid property pattern");
1207 ec = U_MALFORMED_SET;
1208 return;
1209 }
1210 chars.jumpahead(pos.getIndex());
1211 rebuiltPat.append(pattern, 0, pos.getIndex());
1212}
1213
1214U_NAMESPACE_END