// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
// regexst.h
//
// Copyright (C) 2004-2015, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains class RegexStaticSets
//
// This class is internal to the regular expression implementation.
// For the public Regular Expression API, see the file "unicode/regex.h"
//
// RegexStaticSets groups together the common UnicodeSets that are needed
// for compiling or executing RegularExpressions. This grouping simplifies
// the thread safe lazy creation and sharing of these sets across
// all instances of regular expressions.
//
| 19 | #include "unicode/utypes.h" |
| 20 | |
| 21 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 22 | |
| 23 | #include "unicode/unistr.h" |
| 24 | #include "unicode/uniset.h" |
| 25 | #include "unicode/uchar.h" |
| 26 | #include "unicode/regex.h" |
| 27 | #include "uprops.h" |
| 28 | #include "cmemory.h" |
| 29 | #include "cstring.h" |
| 30 | #include "uassert.h" |
| 31 | #include "ucln_in.h" |
| 32 | #include "umutex.h" |
| 33 | |
| 34 | #include "regexcst.h" // Contains state table for the regex pattern parser. |
| 35 | // generated by a Perl script. |
| 36 | #include "regexst.h" |
| 37 | |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 38 | U_NAMESPACE_BEGIN |
| 39 | |
// Characters that carry special meaning in a pattern ("rule chars"); each one
// has to be escaped in order to appear as a literal in a regexp.
constexpr const char16_t *gRuleSet_rule_chars = u"*?+[(){}^$|\\.";

// The letters of the backslash escapes that ICU's unescape() function will handle.
constexpr const char16_t *gUnescapeChars = u"acefnrtuUx";

// UnicodeSet pattern implementing the regular expression \w
constexpr const char16_t *gIsWordPattern = u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]";

// UnicodeSet pattern implementing the regular expression \s
constexpr const char16_t *gIsSpacePattern = u"[\\p{WhiteSpace}]";

// UnicodeSet patterns used by the implementation of grapheme cluster
// detection, \X : control characters, Grapheme_Extend, and one pattern per
// Hangul syllable type (L, V, T, LV, LVT).
constexpr const char16_t *gGC_ControlPattern = u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]";
constexpr const char16_t *gGC_ExtendPattern  = u"[\\p{Grapheme_Extend}]";
constexpr const char16_t *gGC_LPattern       = u"[\\p{Hangul_Syllable_Type=L}]";
constexpr const char16_t *gGC_VPattern       = u"[\\p{Hangul_Syllable_Type=V}]";
constexpr const char16_t *gGC_TPattern       = u"[\\p{Hangul_Syllable_Type=T}]";
constexpr const char16_t *gGC_LVPattern      = u"[\\p{Hangul_Syllable_Type=LV}]";
constexpr const char16_t *gGC_LVTPattern     = u"[\\p{Hangul_Syllable_Type=LVT}]";
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 69 | |
| 70 | |
Frank Tang | f222396 | 2020-04-27 18:25:29 -0700 | [diff] [blame] | 71 | RegexStaticSets *RegexStaticSets::gStaticSets = nullptr; |
Frank Tang | 1c67b4e | 2022-05-18 10:13:51 -0700 | [diff] [blame] | 72 | UInitOnce gStaticSetsInitOnce {}; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 73 | |
Frank Tang | f222396 | 2020-04-27 18:25:29 -0700 | [diff] [blame] | 74 | |
| 75 | RegexStaticSets::RegexStaticSets(UErrorCode *status) { |
| 76 | // Initialize the shared static sets to their correct values. |
| 77 | fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze(); |
| 78 | fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze(); |
| 79 | fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze(); |
| 80 | fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(TRUE, gGC_ExtendPattern, -1), *status).freeze(); |
| 81 | fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(TRUE, gGC_ControlPattern, -1), *status).freeze(); |
| 82 | fPropSets[URX_GC_L].applyPattern(UnicodeString(TRUE, gGC_LPattern, -1), *status).freeze(); |
| 83 | fPropSets[URX_GC_V].applyPattern(UnicodeString(TRUE, gGC_VPattern, -1), *status).freeze(); |
| 84 | fPropSets[URX_GC_T].applyPattern(UnicodeString(TRUE, gGC_TPattern, -1), *status).freeze(); |
| 85 | fPropSets[URX_GC_LV].applyPattern(UnicodeString(TRUE, gGC_LVPattern, -1), *status).freeze(); |
| 86 | fPropSets[URX_GC_LVT].applyPattern(UnicodeString(TRUE, gGC_LVTPattern, -1), *status).freeze(); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 87 | |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 88 | |
| 89 | // |
| 90 | // "Normal" is the set of characters that don't need special handling |
| 91 | // when finding grapheme cluster boundaries. |
| 92 | // |
Frank Tang | f222396 | 2020-04-27 18:25:29 -0700 | [diff] [blame] | 93 | fPropSets[URX_GC_NORMAL].complement(); |
| 94 | fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4); |
| 95 | fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]); |
| 96 | fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]); |
| 97 | fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]); |
| 98 | fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]); |
| 99 | fPropSets[URX_GC_NORMAL].freeze(); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 100 | |
| 101 | // Initialize the 8-bit fast bit sets from the parallel full |
| 102 | // UnicodeSets. |
Frank Tang | f222396 | 2020-04-27 18:25:29 -0700 | [diff] [blame] | 103 | // |
| 104 | // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping? |
| 105 | // Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x" |
| 106 | // This runs in exponential time, making it easy to adjust the time for |
| 107 | // convenient measuring. |
| 108 | // |
| 109 | // This 8 bit optimization dates from the early days of ICU, |
| 110 | // with a less optimized UnicodeSet. At the time, the difference |
| 111 | // was substantial. |
| 112 | |
| 113 | for (int32_t i=0; i<URX_LAST_SET; i++) { |
| 114 | fPropSets8[i].init(&fPropSets[i]); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 115 | } |
| 116 | |
| 117 | // Sets used while parsing rules, but not referenced from the parse state table |
Frank Tang | f222396 | 2020-04-27 18:25:29 -0700 | [diff] [blame] | 118 | fRuleSets[kRuleSet_rule_char-128] |
| 119 | .addAll(UnicodeString(gRuleSet_rule_chars)).complement().freeze(); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 120 | |
Frank Tang | f222396 | 2020-04-27 18:25:29 -0700 | [diff] [blame] | 121 | fRuleSets[kRuleSet_digit_char-128].add(u'0', u'9').freeze(); |
| 122 | fRuleSets[kRuleSet_ascii_letter-128].add(u'A', u'Z').add(u'a', u'z').freeze(); |
| 123 | fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128]; |
| 124 | |
| 125 | // Finally, initialize an empty UText string for utility purposes |
| 126 | fEmptyText = utext_openUChars(nullptr, nullptr, 0, status); |
| 127 | |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 128 | } |
| 129 | |
| 130 | |
| 131 | RegexStaticSets::~RegexStaticSets() { |
Frank Tang | f222396 | 2020-04-27 18:25:29 -0700 | [diff] [blame] | 132 | fRuleDigitsAlias = nullptr; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 133 | utext_close(fEmptyText); |
| 134 | } |
| 135 | |
| 136 | |
//------------------------------------------------------------------------------
//
// regex_cleanup Memory cleanup function, free/delete all
// cached memory. Called by ICU's u_cleanup() function.
//
//------------------------------------------------------------------------------
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 143 | |
| 144 | U_CDECL_BEGIN |
| 145 | static UBool U_CALLCONV |
| 146 | regex_cleanup(void) { |
Frank Tang | f222396 | 2020-04-27 18:25:29 -0700 | [diff] [blame] | 147 | delete RegexStaticSets::gStaticSets; |
| 148 | RegexStaticSets::gStaticSets = nullptr; |
| 149 | gStaticSetsInitOnce.reset(); |
| 150 | return TRUE; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 151 | } |
| 152 | |
| 153 | static void U_CALLCONV initStaticSets(UErrorCode &status) { |
Frank Tang | f222396 | 2020-04-27 18:25:29 -0700 | [diff] [blame] | 154 | U_ASSERT(RegexStaticSets::gStaticSets == nullptr); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 155 | ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup); |
| 156 | RegexStaticSets::gStaticSets = new RegexStaticSets(&status); |
| 157 | if (U_FAILURE(status)) { |
| 158 | delete RegexStaticSets::gStaticSets; |
Frank Tang | f222396 | 2020-04-27 18:25:29 -0700 | [diff] [blame] | 159 | RegexStaticSets::gStaticSets = nullptr; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 160 | } |
Frank Tang | f222396 | 2020-04-27 18:25:29 -0700 | [diff] [blame] | 161 | if (RegexStaticSets::gStaticSets == nullptr && U_SUCCESS(status)) { |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 162 | status = U_MEMORY_ALLOCATION_ERROR; |
| 163 | } |
| 164 | } |
| 165 | U_CDECL_END |
| 166 | |
| 167 | void RegexStaticSets::initGlobals(UErrorCode *status) { |
| 168 | umtx_initOnce(gStaticSetsInitOnce, &initStaticSets, *status); |
| 169 | } |
| 170 | |
| 171 | U_NAMESPACE_END |
| 172 | #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |