Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 1 | // © 2016 and later: Unicode, Inc. and others. |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 2 | // License & terms of use: http://www.unicode.org/copyright.html |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3 | // |
| 4 | // file: rbbistbl.cpp Implementation of the ICU RBBISymbolTable class |
| 5 | // |
| 6 | /* |
| 7 | *************************************************************************** |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 8 | * Copyright (C) 2002-2014 International Business Machines Corporation |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 9 | * and others. All rights reserved. |
| 10 | *************************************************************************** |
| 11 | */ |
| 12 | |
| 13 | #include "unicode/utypes.h" |
| 14 | |
| 15 | #if !UCONFIG_NO_BREAK_ITERATION |
| 16 | |
| 17 | #include "unicode/unistr.h" |
| 18 | #include "unicode/uniset.h" |
| 19 | #include "unicode/uchar.h" |
| 20 | #include "unicode/parsepos.h" |
| 21 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 22 | #include "cstr.h" |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 23 | #include "rbbinode.h" |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 24 | #include "rbbirb.h" |
| 25 | #include "umutex.h" |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 26 | |
| 27 | |
| 28 | // |
| 29 | // RBBISymbolTableEntry_deleter Used by the UHashTable to delete the contents |
| 30 | // when the hash table is deleted. |
| 31 | // |
| 32 | U_CDECL_BEGIN |
| 33 | static void U_CALLCONV RBBISymbolTableEntry_deleter(void *p) { |
| 34 | icu::RBBISymbolTableEntry *px = (icu::RBBISymbolTableEntry *)p; |
| 35 | delete px; |
| 36 | } |
| 37 | U_CDECL_END |
| 38 | |
| 39 | |
| 40 | |
| 41 | U_NAMESPACE_BEGIN |
| 42 | |
| 43 | RBBISymbolTable::RBBISymbolTable(RBBIRuleScanner *rs, const UnicodeString &rules, UErrorCode &status) |
| 44 | :fRules(rules), fRuleScanner(rs), ffffString(UChar(0xffff)) |
| 45 | { |
| 46 | fHashTable = NULL; |
| 47 | fCachedSetLookup = NULL; |
| 48 | |
| 49 | fHashTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, &status); |
| 50 | // uhash_open checks status |
| 51 | if (U_FAILURE(status)) { |
| 52 | return; |
| 53 | } |
| 54 | uhash_setValueDeleter(fHashTable, RBBISymbolTableEntry_deleter); |
| 55 | } |
| 56 | |
| 57 | |
| 58 | |
| 59 | RBBISymbolTable::~RBBISymbolTable() |
| 60 | { |
| 61 | uhash_close(fHashTable); |
| 62 | } |
| 63 | |
| 64 | |
| 65 | // |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 66 | // RBBISymbolTable::lookup This function from the abstract symbol table interface |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 67 | // looks up a variable name and returns a UnicodeString |
| 68 | // containing the substitution text. |
| 69 | // |
| 70 | // The variable name does NOT include the leading $. |
| 71 | // |
| 72 | const UnicodeString *RBBISymbolTable::lookup(const UnicodeString& s) const |
| 73 | { |
| 74 | RBBISymbolTableEntry *el; |
| 75 | RBBINode *varRefNode; |
| 76 | RBBINode *exprNode; |
| 77 | RBBINode *usetNode; |
| 78 | const UnicodeString *retString; |
| 79 | RBBISymbolTable *This = (RBBISymbolTable *)this; // cast off const |
| 80 | |
| 81 | el = (RBBISymbolTableEntry *)uhash_get(fHashTable, &s); |
| 82 | if (el == NULL) { |
| 83 | return NULL; |
| 84 | } |
| 85 | |
| 86 | varRefNode = el->val; |
| 87 | exprNode = varRefNode->fLeftChild; // Root node of expression for variable |
| 88 | if (exprNode->fType == RBBINode::setRef) { |
| 89 | // The $variable refers to a single UnicodeSet |
| 90 | // return the ffffString, which will subsequently be interpreted as a |
| 91 | // stand-in character for the set by RBBISymbolTable::lookupMatcher() |
| 92 | usetNode = exprNode->fLeftChild; |
| 93 | This->fCachedSetLookup = usetNode->fInputSet; |
| 94 | retString = &ffffString; |
| 95 | } |
| 96 | else |
| 97 | { |
| 98 | // The variable refers to something other than just a set. |
| 99 | // return the original source string for the expression |
| 100 | retString = &exprNode->fText; |
| 101 | This->fCachedSetLookup = NULL; |
| 102 | } |
| 103 | return retString; |
| 104 | } |
| 105 | |
| 106 | |
| 107 | |
| 108 | // |
| 109 | // RBBISymbolTable::lookupMatcher This function from the abstract symbol table |
| 110 | // interface maps a single stand-in character to a |
| 111 | // pointer to a Unicode Set. The Unicode Set code uses this |
| 112 | // mechanism to get all references to the same $variable |
| 113 | // name to refer to a single common Unicode Set instance. |
| 114 | // |
| 115 | // This implementation cheats a little, and does not maintain a map of stand-in chars |
| 116 | // to sets. Instead, it takes advantage of the fact that the UnicodeSet |
| 117 | // constructor will always call this function right after calling lookup(), |
| 118 | // and we just need to remember what set to return between these two calls. |
| 119 | const UnicodeFunctor *RBBISymbolTable::lookupMatcher(UChar32 ch) const |
| 120 | { |
| 121 | UnicodeSet *retVal = NULL; |
| 122 | RBBISymbolTable *This = (RBBISymbolTable *)this; // cast off const |
| 123 | if (ch == 0xffff) { |
| 124 | retVal = fCachedSetLookup; |
| 125 | This->fCachedSetLookup = 0; |
| 126 | } |
| 127 | return retVal; |
| 128 | } |
| 129 | |
| 130 | // |
| 131 | // RBBISymbolTable::parseReference This function from the abstract symbol table interface |
| 132 | // looks for a $variable name in the source text. |
| 133 | // It does not look it up, only scans for it. |
| 134 | // It is used by the UnicodeSet parser. |
| 135 | // |
| 136 | // This implementation is lifted pretty much verbatim |
| 137 | // from the rules based transliterator implementation. |
| 138 | // I didn't see an obvious way of sharing it. |
| 139 | // |
| 140 | UnicodeString RBBISymbolTable::parseReference(const UnicodeString& text, |
| 141 | ParsePosition& pos, int32_t limit) const |
| 142 | { |
| 143 | int32_t start = pos.getIndex(); |
| 144 | int32_t i = start; |
| 145 | UnicodeString result; |
| 146 | while (i < limit) { |
| 147 | UChar c = text.charAt(i); |
| 148 | if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { |
| 149 | break; |
| 150 | } |
| 151 | ++i; |
| 152 | } |
| 153 | if (i == start) { // No valid name chars |
| 154 | return result; // Indicate failure with empty string |
| 155 | } |
| 156 | pos.setIndex(i); |
| 157 | text.extractBetween(start, i, result); |
| 158 | return result; |
| 159 | } |
| 160 | |
| 161 | |
| 162 | |
| 163 | // |
| 164 | // RBBISymbolTable::lookupNode Given a key (a variable name), return the |
| 165 | // corresponding RBBI Node. If there is no entry |
| 166 | // in the table for this name, return NULL. |
| 167 | // |
| 168 | RBBINode *RBBISymbolTable::lookupNode(const UnicodeString &key) const{ |
| 169 | |
| 170 | RBBINode *retNode = NULL; |
| 171 | RBBISymbolTableEntry *el; |
| 172 | |
| 173 | el = (RBBISymbolTableEntry *)uhash_get(fHashTable, &key); |
| 174 | if (el != NULL) { |
| 175 | retNode = el->val; |
| 176 | } |
| 177 | return retNode; |
| 178 | } |
| 179 | |
| 180 | |
| 181 | // |
| 182 | // RBBISymbolTable::addEntry Add a new entry to the symbol table. |
| 183 | // Indicate an error if the name already exists - |
| 184 | // this will only occur in the case of duplicate |
| 185 | // variable assignments. |
| 186 | // |
| 187 | void RBBISymbolTable::addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err) { |
| 188 | RBBISymbolTableEntry *e; |
| 189 | /* test for buffer overflows */ |
| 190 | if (U_FAILURE(err)) { |
| 191 | return; |
| 192 | } |
| 193 | e = (RBBISymbolTableEntry *)uhash_get(fHashTable, &key); |
| 194 | if (e != NULL) { |
| 195 | err = U_BRK_VARIABLE_REDFINITION; |
| 196 | return; |
| 197 | } |
| 198 | |
| 199 | e = new RBBISymbolTableEntry; |
| 200 | if (e == NULL) { |
| 201 | err = U_MEMORY_ALLOCATION_ERROR; |
| 202 | return; |
| 203 | } |
| 204 | e->key = key; |
| 205 | e->val = val; |
| 206 | uhash_put( fHashTable, &e->key, e, &err); |
| 207 | } |
| 208 | |
| 209 | |
| 210 | RBBISymbolTableEntry::RBBISymbolTableEntry() : UMemory(), key(), val(NULL) {} |
| 211 | |
| 212 | RBBISymbolTableEntry::~RBBISymbolTableEntry() { |
| 213 | // The "val" of a symbol table entry is a variable reference node. |
| 214 | // The l. child of the val is the rhs expression from the assignment. |
| 215 | // Unlike other node types, children of variable reference nodes are not |
| 216 | // automatically recursively deleted. We do it manually here. |
| 217 | delete val->fLeftChild; |
| 218 | val->fLeftChild = NULL; |
| 219 | |
| 220 | delete val; |
| 221 | |
| 222 | // Note: the key UnicodeString is destructed by virtue of being in the object by value. |
| 223 | } |
| 224 | |
| 225 | |
| 226 | // |
//  RBBISymbolTable::rbbiSymtablePrint    Debugging function, dump out the symbol table contents.
| 228 | // |
| 229 | #ifdef RBBI_DEBUG |
| 230 | void RBBISymbolTable::rbbiSymtablePrint() const { |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 231 | RBBIDebugPrintf("Variable Definitions Symbol Table\n" |
| 232 | "Name Node serial String Val\n" |
| 233 | "-------------------------------------------------------------------\n"); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 234 | |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 235 | int32_t pos = UHASH_FIRST; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 236 | const UHashElement *e = NULL; |
| 237 | for (;;) { |
| 238 | e = uhash_nextElement(fHashTable, &pos); |
| 239 | if (e == NULL ) { |
| 240 | break; |
| 241 | } |
| 242 | RBBISymbolTableEntry *s = (RBBISymbolTableEntry *)e->value.pointer; |
| 243 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 244 | RBBIDebugPrintf("%-19s %8p %7d ", CStr(s->key)(), (void *)s->val, s->val->fSerialNum); |
| 245 | RBBIDebugPrintf(" %s\n", CStr(s->val->fLeftChild->fText)()); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 246 | } |
| 247 | |
| 248 | RBBIDebugPrintf("\nParsed Variable Definitions\n"); |
| 249 | pos = -1; |
| 250 | for (;;) { |
| 251 | e = uhash_nextElement(fHashTable, &pos); |
| 252 | if (e == NULL ) { |
| 253 | break; |
| 254 | } |
| 255 | RBBISymbolTableEntry *s = (RBBISymbolTableEntry *)e->value.pointer; |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 256 | RBBIDebugPrintf("%s\n", CStr(s->key)()); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 257 | RBBINode::printTree(s->val, true); |
| 258 | RBBINode::printTree(s->val->fLeftChild, false); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 259 | RBBIDebugPrintf("\n"); |
| 260 | } |
| 261 | } |
| 262 | #endif |
| 263 | |
| 264 | |
| 265 | |
| 266 | |
| 267 | |
| 268 | U_NAMESPACE_END |
| 269 | |
| 270 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |