Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 1 | // © 2016 and later: Unicode, Inc. and others. |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 2 | // License & terms of use: http://www.unicode.org/copyright.html |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3 | /* |
| 4 | ******************************************************************************* |
| 5 | * Copyright (C) 2010-2011, International Business Machines |
| 6 | * Corporation and others. All Rights Reserved. |
| 7 | ******************************************************************************* |
| 8 | * file name: ucharstrieiterator.h |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 9 | * encoding: UTF-8 |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 10 | * tab size: 8 (not used) |
| 11 | * indentation:4 |
| 12 | * |
| 13 | * created on: 2010nov15 |
| 14 | * created by: Markus W. Scherer |
| 15 | */ |
| 16 | |
| 17 | #include "unicode/utypes.h" |
| 18 | #include "unicode/ucharstrie.h" |
| 19 | #include "unicode/unistr.h" |
| 20 | #include "uvectr32.h" |
| 21 | |
| 22 | U_NAMESPACE_BEGIN |
| 23 | |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 24 | UCharsTrie::Iterator::Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 25 | UErrorCode &errorCode) |
| 26 | : uchars_(trieUChars), |
| 27 | pos_(uchars_), initialPos_(uchars_), |
| 28 | remainingMatchLength_(-1), initialRemainingMatchLength_(-1), |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 29 | skipValue_(false), |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 30 | maxLength_(maxStringLength), value_(0), stack_(NULL) { |
| 31 | if(U_FAILURE(errorCode)) { |
| 32 | return; |
| 33 | } |
| 34 | // stack_ is a pointer so that it's easy to turn ucharstrie.h into |
| 35 | // a public API header for which we would want it to depend only on |
| 36 | // other public headers. |
| 37 | // Unlike UCharsTrie itself, its Iterator performs memory allocations anyway |
| 38 | // via the UnicodeString and UVector32 implementations, so this additional |
| 39 | // cost is minimal. |
| 40 | stack_=new UVector32(errorCode); |
| 41 | if(stack_==NULL) { |
| 42 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 43 | } |
| 44 | } |
| 45 | |
| 46 | UCharsTrie::Iterator::Iterator(const UCharsTrie &trie, int32_t maxStringLength, |
| 47 | UErrorCode &errorCode) |
| 48 | : uchars_(trie.uchars_), pos_(trie.pos_), initialPos_(trie.pos_), |
| 49 | remainingMatchLength_(trie.remainingMatchLength_), |
| 50 | initialRemainingMatchLength_(trie.remainingMatchLength_), |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 51 | skipValue_(false), |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 52 | maxLength_(maxStringLength), value_(0), stack_(NULL) { |
| 53 | if(U_FAILURE(errorCode)) { |
| 54 | return; |
| 55 | } |
| 56 | stack_=new UVector32(errorCode); |
| 57 | if(U_FAILURE(errorCode)) { |
| 58 | return; |
| 59 | } |
| 60 | if(stack_==NULL) { |
| 61 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 62 | return; |
| 63 | } |
| 64 | int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. |
| 65 | if(length>=0) { |
| 66 | // Pending linear-match node, append remaining UChars to str_. |
| 67 | ++length; |
| 68 | if(maxLength_>0 && length>maxLength_) { |
| 69 | length=maxLength_; // This will leave remainingMatchLength>=0 as a signal. |
| 70 | } |
| 71 | str_.append(pos_, length); |
| 72 | pos_+=length; |
| 73 | remainingMatchLength_-=length; |
| 74 | } |
| 75 | } |
| 76 | |
| 77 | UCharsTrie::Iterator::~Iterator() { |
| 78 | delete stack_; |
| 79 | } |
| 80 | |
| 81 | UCharsTrie::Iterator & |
| 82 | UCharsTrie::Iterator::reset() { |
| 83 | pos_=initialPos_; |
| 84 | remainingMatchLength_=initialRemainingMatchLength_; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 85 | skipValue_=false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 86 | int32_t length=remainingMatchLength_+1; // Remaining match length. |
| 87 | if(maxLength_>0 && length>maxLength_) { |
| 88 | length=maxLength_; |
| 89 | } |
| 90 | str_.truncate(length); |
| 91 | pos_+=length; |
| 92 | remainingMatchLength_-=length; |
| 93 | stack_->setSize(0); |
| 94 | return *this; |
| 95 | } |
| 96 | |
| 97 | UBool |
| 98 | UCharsTrie::Iterator::hasNext() const { return pos_!=NULL || !stack_->isEmpty(); } |
| 99 | |
| 100 | UBool |
| 101 | UCharsTrie::Iterator::next(UErrorCode &errorCode) { |
| 102 | if(U_FAILURE(errorCode)) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 103 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 104 | } |
| 105 | const UChar *pos=pos_; |
| 106 | if(pos==NULL) { |
| 107 | if(stack_->isEmpty()) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 108 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 109 | } |
| 110 | // Pop the state off the stack and continue with the next outbound edge of |
| 111 | // the branch node. |
| 112 | int32_t stackSize=stack_->size(); |
| 113 | int32_t length=stack_->elementAti(stackSize-1); |
| 114 | pos=uchars_+stack_->elementAti(stackSize-2); |
| 115 | stack_->setSize(stackSize-2); |
| 116 | str_.truncate(length&0xffff); |
| 117 | length=(int32_t)((uint32_t)length>>16); |
| 118 | if(length>1) { |
| 119 | pos=branchNext(pos, length, errorCode); |
| 120 | if(pos==NULL) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 121 | return true; // Reached a final value. |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 122 | } |
| 123 | } else { |
| 124 | str_.append(*pos++); |
| 125 | } |
| 126 | } |
| 127 | if(remainingMatchLength_>=0) { |
| 128 | // We only get here if we started in a pending linear-match node |
| 129 | // with more than maxLength remaining units. |
| 130 | return truncateAndStop(); |
| 131 | } |
| 132 | for(;;) { |
| 133 | int32_t node=*pos++; |
| 134 | if(node>=kMinValueLead) { |
| 135 | if(skipValue_) { |
| 136 | pos=skipNodeValue(pos, node); |
| 137 | node&=kNodeTypeMask; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 138 | skipValue_=false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 139 | } else { |
| 140 | // Deliver value for the string so far. |
| 141 | UBool isFinal=(UBool)(node>>15); |
| 142 | if(isFinal) { |
| 143 | value_=readValue(pos, node&0x7fff); |
| 144 | } else { |
| 145 | value_=readNodeValue(pos, node); |
| 146 | } |
| 147 | if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) { |
| 148 | pos_=NULL; |
| 149 | } else { |
| 150 | // We cannot skip the value right here because it shares its |
| 151 | // lead unit with a match node which we have to evaluate |
| 152 | // next time. |
| 153 | // Instead, keep pos_ on the node lead unit itself. |
| 154 | pos_=pos-1; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 155 | skipValue_=true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 156 | } |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 157 | return true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 158 | } |
| 159 | } |
| 160 | if(maxLength_>0 && str_.length()==maxLength_) { |
| 161 | return truncateAndStop(); |
| 162 | } |
| 163 | if(node<kMinLinearMatch) { |
| 164 | if(node==0) { |
| 165 | node=*pos++; |
| 166 | } |
| 167 | pos=branchNext(pos, node+1, errorCode); |
| 168 | if(pos==NULL) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 169 | return true; // Reached a final value. |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 170 | } |
| 171 | } else { |
| 172 | // Linear-match node, append length units to str_. |
| 173 | int32_t length=node-kMinLinearMatch+1; |
| 174 | if(maxLength_>0 && str_.length()+length>maxLength_) { |
| 175 | str_.append(pos, maxLength_-str_.length()); |
| 176 | return truncateAndStop(); |
| 177 | } |
| 178 | str_.append(pos, length); |
| 179 | pos+=length; |
| 180 | } |
| 181 | } |
| 182 | } |
| 183 | |
| 184 | // Branch node, needs to take the first outbound edge and push state for the rest. |
| 185 | const UChar * |
| 186 | UCharsTrie::Iterator::branchNext(const UChar *pos, int32_t length, UErrorCode &errorCode) { |
| 187 | while(length>kMaxBranchLinearSubNodeLength) { |
| 188 | ++pos; // ignore the comparison unit |
| 189 | // Push state for the greater-or-equal edge. |
| 190 | stack_->addElement((int32_t)(skipDelta(pos)-uchars_), errorCode); |
| 191 | stack_->addElement(((length-(length>>1))<<16)|str_.length(), errorCode); |
| 192 | // Follow the less-than edge. |
| 193 | length>>=1; |
| 194 | pos=jumpByDelta(pos); |
| 195 | } |
| 196 | // List of key-value pairs where values are either final values or jump deltas. |
| 197 | // Read the first (key, value) pair. |
| 198 | UChar trieUnit=*pos++; |
| 199 | int32_t node=*pos++; |
| 200 | UBool isFinal=(UBool)(node>>15); |
| 201 | int32_t value=readValue(pos, node&=0x7fff); |
| 202 | pos=skipValue(pos, node); |
| 203 | stack_->addElement((int32_t)(pos-uchars_), errorCode); |
| 204 | stack_->addElement(((length-1)<<16)|str_.length(), errorCode); |
| 205 | str_.append(trieUnit); |
| 206 | if(isFinal) { |
| 207 | pos_=NULL; |
| 208 | value_=value; |
| 209 | return NULL; |
| 210 | } else { |
| 211 | return pos+value; |
| 212 | } |
| 213 | } |
| 214 | |
| 215 | U_NAMESPACE_END |