Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 1 | // Copyright (C) 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | |
| 4 | // file: rbbi_cache.cpp |
| 5 | |
| 6 | #include "unicode/utypes.h" |
| 7 | |
| 8 | #if !UCONFIG_NO_BREAK_ITERATION |
| 9 | |
| 10 | #include "unicode/ubrk.h" |
| 11 | #include "unicode/rbbi.h" |
| 12 | |
| 13 | #include "rbbi_cache.h" |
| 14 | |
| 15 | #include "brkeng.h" |
| 16 | #include "cmemory.h" |
| 17 | #include "rbbidata.h" |
| 18 | #include "rbbirb.h" |
| 19 | #include "uassert.h" |
| 20 | #include "uvectr32.h" |
| 21 | |
| 22 | U_NAMESPACE_BEGIN |
| 23 | |
| 24 | /* |
| 25 | * DictionaryCache implementation |
| 26 | */ |
| 27 | |
| 28 | RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) : |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 29 | fBI(bi), fBreaks(status), fPositionInCache(-1), |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 30 | fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) { |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 31 | } |
| 32 | |
| 33 | RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() { |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 34 | } |
| 35 | |
| 36 | void RuleBasedBreakIterator::DictionaryCache::reset() { |
| 37 | fPositionInCache = -1; |
| 38 | fStart = 0; |
| 39 | fLimit = 0; |
| 40 | fFirstRuleStatusIndex = 0; |
| 41 | fOtherRuleStatusIndex = 0; |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 42 | fBreaks.removeAllElements(); |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 43 | } |
| 44 | |
| 45 | UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) { |
| 46 | if (fromPos >= fLimit || fromPos < fStart) { |
| 47 | fPositionInCache = -1; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 48 | return false; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 49 | } |
| 50 | |
| 51 | // Sequential iteration, move from previous boundary to the following |
| 52 | |
| 53 | int32_t r = 0; |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 54 | if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) { |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 55 | ++fPositionInCache; |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 56 | if (fPositionInCache >= fBreaks.size()) { |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 57 | fPositionInCache = -1; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 58 | return false; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 59 | } |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 60 | r = fBreaks.elementAti(fPositionInCache); |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 61 | U_ASSERT(r > fromPos); |
| 62 | *result = r; |
| 63 | *statusIndex = fOtherRuleStatusIndex; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 64 | return true; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 65 | } |
| 66 | |
| 67 | // Random indexing. Linear search for the boundary following the given position. |
| 68 | |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 69 | for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { |
| 70 | r= fBreaks.elementAti(fPositionInCache); |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 71 | if (r > fromPos) { |
| 72 | *result = r; |
| 73 | *statusIndex = fOtherRuleStatusIndex; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 74 | return true; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 75 | } |
| 76 | } |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 77 | UPRV_UNREACHABLE_EXIT; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 78 | } |
| 79 | |
| 80 | |
| 81 | UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) { |
| 82 | if (fromPos <= fStart || fromPos > fLimit) { |
| 83 | fPositionInCache = -1; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 84 | return false; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 85 | } |
| 86 | |
| 87 | if (fromPos == fLimit) { |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 88 | fPositionInCache = fBreaks.size() - 1; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 89 | if (fPositionInCache >= 0) { |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 90 | U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos); |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 91 | } |
| 92 | } |
| 93 | |
| 94 | int32_t r; |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 95 | if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) { |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 96 | --fPositionInCache; |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 97 | r = fBreaks.elementAti(fPositionInCache); |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 98 | U_ASSERT(r < fromPos); |
| 99 | *result = r; |
| 100 | *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 101 | return true; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 102 | } |
| 103 | |
| 104 | if (fPositionInCache == 0) { |
| 105 | fPositionInCache = -1; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 106 | return false; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 107 | } |
| 108 | |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 109 | for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { |
| 110 | r = fBreaks.elementAti(fPositionInCache); |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 111 | if (r < fromPos) { |
| 112 | *result = r; |
| 113 | *statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 114 | return true; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 115 | } |
| 116 | } |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 117 | UPRV_UNREACHABLE_EXIT; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 118 | } |
| 119 | |
| 120 | void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos, |
| 121 | int32_t firstRuleStatus, int32_t otherRuleStatus) { |
| 122 | if ((endPos - startPos) <= 1) { |
| 123 | return; |
| 124 | } |
| 125 | |
| 126 | reset(); |
| 127 | fFirstRuleStatusIndex = firstRuleStatus; |
| 128 | fOtherRuleStatusIndex = otherRuleStatus; |
| 129 | |
| 130 | int32_t rangeStart = startPos; |
| 131 | int32_t rangeEnd = endPos; |
| 132 | |
| 133 | uint16_t category; |
| 134 | int32_t current; |
| 135 | UErrorCode status = U_ZERO_ERROR; |
| 136 | int32_t foundBreakCount = 0; |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 137 | UText *text = &fBI->fText; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 138 | |
| 139 | // Loop through the text, looking for ranges of dictionary characters. |
| 140 | // For each span, find the appropriate break engine, and ask it to find |
| 141 | // any breaks within the span. |
| 142 | |
| 143 | utext_setNativeIndex(text, rangeStart); |
| 144 | UChar32 c = utext_current32(text); |
Frank Tang | f90543d | 2020-10-30 19:02:04 -0700 | [diff] [blame] | 145 | category = ucptrie_get(fBI->fData->fTrie, c); |
| 146 | uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 147 | |
| 148 | while(U_SUCCESS(status)) { |
Frank Tang | f90543d | 2020-10-30 19:02:04 -0700 | [diff] [blame] | 149 | while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd |
| 150 | && (category < dictStart)) { |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 151 | utext_next32(text); // TODO: cleaner loop structure. |
| 152 | c = utext_current32(text); |
Frank Tang | f90543d | 2020-10-30 19:02:04 -0700 | [diff] [blame] | 153 | category = ucptrie_get(fBI->fData->fTrie, c); |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 154 | } |
| 155 | if (current >= rangeEnd) { |
| 156 | break; |
| 157 | } |
| 158 | |
| 159 | // We now have a dictionary character. Get the appropriate language object |
| 160 | // to deal with it. |
| 161 | const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c); |
| 162 | |
| 163 | // Ask the language object if there are any breaks. It will add them to the cache and |
| 164 | // leave the text pointer on the other side of its range, ready to search for the next one. |
| 165 | if (lbe != NULL) { |
Frank Tang | d2858cb | 2022-04-08 20:34:12 -0700 | [diff] [blame] | 166 | foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status); |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 167 | } |
| 168 | |
| 169 | // Reload the loop variables for the next go-round |
| 170 | c = utext_current32(text); |
Frank Tang | f90543d | 2020-10-30 19:02:04 -0700 | [diff] [blame] | 171 | category = ucptrie_get(fBI->fData->fTrie, c); |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 172 | } |
| 173 | |
| 174 | // If we found breaks, ensure that the first and last entries are |
| 175 | // the original starting and ending position. And initialize the |
| 176 | // cache iteration position to the first entry. |
| 177 | |
| 178 | // printf("foundBreakCount = %d\n", foundBreakCount); |
| 179 | if (foundBreakCount > 0) { |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 180 | U_ASSERT(foundBreakCount == fBreaks.size()); |
| 181 | if (startPos < fBreaks.elementAti(0)) { |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 182 | // The dictionary did not place a boundary at the start of the segment of text. |
| 183 | // Add one now. This should not commonly happen, but it would be easy for interactions |
| 184 | // of the rules for dictionary segments and the break engine implementations to |
| 185 | // inadvertently cause it. Cover it here, just in case. |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 186 | fBreaks.insertElementAt(startPos, 0, status); |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 187 | } |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 188 | if (endPos > fBreaks.peeki()) { |
| 189 | fBreaks.push(endPos, status); |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 190 | } |
| 191 | fPositionInCache = 0; |
| 192 | // Note: Dictionary matching may extend beyond the original limit. |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 193 | fStart = fBreaks.elementAti(0); |
| 194 | fLimit = fBreaks.peeki(); |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 195 | } else { |
| 196 | // there were no language-based breaks, even though the segment contained |
| 197 | // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache |
| 198 | // for this range will fail, and the calling code will fall back to the rule based boundaries. |
| 199 | } |
| 200 | } |
| 201 | |
| 202 | |
| 203 | /* |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 204 | * BreakCache implementation |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 205 | */ |
| 206 | |
Jungshik Shin | a9a2bd3 | 2018-07-07 03:36:01 -0700 | [diff] [blame] | 207 | RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) : |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 208 | fBI(bi), fSideBuffer(status) { |
| 209 | reset(); |
| 210 | } |
| 211 | |
| 212 | |
| 213 | RuleBasedBreakIterator::BreakCache::~BreakCache() { |
| 214 | } |
| 215 | |
| 216 | |
| 217 | void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) { |
| 218 | fStartBufIdx = 0; |
| 219 | fEndBufIdx = 0; |
| 220 | fTextIdx = pos; |
| 221 | fBufIdx = 0; |
| 222 | fBoundaries[0] = pos; |
| 223 | fStatuses[0] = (uint16_t)ruleStatus; |
| 224 | } |
| 225 | |
| 226 | |
| 227 | int32_t RuleBasedBreakIterator::BreakCache::current() { |
| 228 | fBI->fPosition = fTextIdx; |
| 229 | fBI->fRuleStatusIndex = fStatuses[fBufIdx]; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 230 | fBI->fDone = false; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 231 | return fTextIdx; |
| 232 | } |
| 233 | |
| 234 | |
| 235 | void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) { |
| 236 | if (U_FAILURE(status)) { |
| 237 | return; |
| 238 | } |
| 239 | if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { |
| 240 | // startPos is in the cache. Do a next() from that position. |
| 241 | // TODO: an awkward set of interactions with bi->fDone |
| 242 | // seek() does not clear it; it can't because of interactions with populateNear(). |
| 243 | // next() does not clear it in the fast-path case, where everything matters. Maybe it should. |
| 244 | // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. |
| 245 | fBI->fDone = false; |
| 246 | next(); |
| 247 | } |
| 248 | return; |
| 249 | } |
| 250 | |
| 251 | |
| 252 | void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) { |
| 253 | if (U_FAILURE(status)) { |
| 254 | return; |
| 255 | } |
| 256 | if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { |
| 257 | if (startPos == fTextIdx) { |
| 258 | previous(status); |
| 259 | } else { |
| 260 | // seek() leaves the BreakCache positioned at the preceding boundary |
Frank Tang | 7e7574b | 2021-04-13 21:19:13 -0700 | [diff] [blame] | 261 | // if the requested position is between two boundaries. |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 262 | // current() pushes the BreakCache position out to the BreakIterator itself. |
| 263 | U_ASSERT(startPos > fTextIdx); |
| 264 | current(); |
| 265 | } |
| 266 | } |
| 267 | return; |
| 268 | } |
| 269 | |
| 270 | |
| 271 | /* |
| 272 | * Out-of-line code for BreakCache::next(). |
| 273 | * Cache does not already contain the boundary |
| 274 | */ |
| 275 | void RuleBasedBreakIterator::BreakCache::nextOL() { |
| 276 | fBI->fDone = !populateFollowing(); |
| 277 | fBI->fPosition = fTextIdx; |
| 278 | fBI->fRuleStatusIndex = fStatuses[fBufIdx]; |
| 279 | return; |
| 280 | } |
| 281 | |
| 282 | |
| 283 | void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) { |
| 284 | if (U_FAILURE(status)) { |
| 285 | return; |
| 286 | } |
| 287 | int32_t initialBufIdx = fBufIdx; |
| 288 | if (fBufIdx == fStartBufIdx) { |
| 289 | // At start of cache. Prepend to it. |
| 290 | populatePreceding(status); |
| 291 | } else { |
| 292 | // Cache already holds the next boundary |
| 293 | fBufIdx = modChunkSize(fBufIdx - 1); |
| 294 | fTextIdx = fBoundaries[fBufIdx]; |
| 295 | } |
| 296 | fBI->fDone = (fBufIdx == initialBufIdx); |
| 297 | fBI->fPosition = fTextIdx; |
| 298 | fBI->fRuleStatusIndex = fStatuses[fBufIdx]; |
| 299 | return; |
Jungshik Shin | a9a2bd3 | 2018-07-07 03:36:01 -0700 | [diff] [blame] | 300 | } |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 301 | |
| 302 | |
| 303 | UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) { |
| 304 | if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 305 | return false; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 306 | } |
| 307 | if (pos == fBoundaries[fStartBufIdx]) { |
| 308 | // Common case: seek(0), from BreakIterator::first() |
| 309 | fBufIdx = fStartBufIdx; |
| 310 | fTextIdx = fBoundaries[fBufIdx]; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 311 | return true; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 312 | } |
| 313 | if (pos == fBoundaries[fEndBufIdx]) { |
| 314 | fBufIdx = fEndBufIdx; |
| 315 | fTextIdx = fBoundaries[fBufIdx]; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 316 | return true; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 317 | } |
Jungshik Shin | a9a2bd3 | 2018-07-07 03:36:01 -0700 | [diff] [blame] | 318 | |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 319 | int32_t min = fStartBufIdx; |
| 320 | int32_t max = fEndBufIdx; |
| 321 | while (min != max) { |
| 322 | int32_t probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; |
| 323 | probe = modChunkSize(probe); |
| 324 | if (fBoundaries[probe] > pos) { |
| 325 | max = probe; |
| 326 | } else { |
| 327 | min = modChunkSize(probe + 1); |
| 328 | } |
| 329 | } |
| 330 | U_ASSERT(fBoundaries[max] > pos); |
| 331 | fBufIdx = modChunkSize(max - 1); |
| 332 | fTextIdx = fBoundaries[fBufIdx]; |
| 333 | U_ASSERT(fTextIdx <= pos); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 334 | return true; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 335 | } |
| 336 | |
| 337 | |
| 338 | UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) { |
| 339 | if (U_FAILURE(status)) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 340 | return false; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 341 | } |
| 342 | U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); |
| 343 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 344 | // Add boundaries to the cache near the specified position. |
| 345 | // The given position need not be a boundary itself. |
| 346 | // The input position must be within the range of the text, and |
| 347 | // on a code point boundary. |
| 348 | // If the requested position is a break boundary, leave the iteration |
| 349 | // position on it. |
| 350 | // If the requested position is not a boundary, leave the iteration |
| 351 | // position on the preceding boundary and include both the |
| 352 | // preceding and following boundaries in the cache. |
| 353 | // Additional boundaries, either preceding or following, may be added |
| 354 | // to the cache as a side effect. |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 355 | |
| 356 | // If the requested position is not near already cached positions, clear the existing cache, |
| 357 | // find a near-by boundary and begin new cache contents there. |
| 358 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 359 | // Threshold for a text position to be considered near to existing cache contents. |
| 360 | // TODO: See issue ICU-22024 "perf tuning of Cache needed." |
| 361 | // This value is subject to change. See the ticket for more details. |
| 362 | static constexpr int32_t CACHE_NEAR = 15; |
Jungshik Shin | a9a2bd3 | 2018-07-07 03:36:01 -0700 | [diff] [blame] | 363 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 364 | int32_t aBoundary = -1; |
| 365 | int32_t ruleStatusIndex = 0; |
| 366 | bool retainCache = false; |
| 367 | if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) { |
| 368 | // Requested position is near the existing cache. Retain it. |
| 369 | retainCache = true; |
| 370 | } else if (position <= CACHE_NEAR) { |
| 371 | // Requested position is near the start of the text. Fill cache from start, skipping |
| 372 | // the need to find a safe point. |
| 373 | retainCache = false; |
| 374 | aBoundary = 0; |
| 375 | } else { |
| 376 | // Requested position is not near the existing cache. |
| 377 | // Find a safe point to refill the cache from. |
| 378 | int32_t backupPos = fBI->handleSafePrevious(position); |
| 379 | |
| 380 | if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) { |
| 381 | // The requested position is beyond the end of the existing cache, but the |
| 382 | // reverse rules produced a position near or before the cached region. |
| 383 | // Retain the existing cache, and fill from the end of it. |
| 384 | retainCache = true; |
| 385 | } else if (backupPos < CACHE_NEAR) { |
| 386 | // The safe reverse rules moved us to near the start of text. |
| 387 | // Take that (index 0) as the backup boundary, avoiding the complication |
| 388 | // (in the following block) of moving forward from the safe point to a known boundary. |
| 389 | // |
| 390 | // Retain the cache if it begins not too far from the requested position. |
| 391 | aBoundary = 0; |
| 392 | retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR)); |
| 393 | } else { |
| 394 | // The safe reverse rules produced a position that is neither near the existing |
| 395 | // cache, nor near the start of text. |
| 396 | // Advance to the boundary following. |
| 397 | // There is a complication: the safe reverse rules identify pairs of code points |
| 398 | // that are safe. If advancing from the safe point moves forwards by less than |
| 399 | // two code points, we need to advance one more time to ensure that the boundary |
| 400 | // is good, including a correct rules status value. |
| 401 | retainCache = false; |
| 402 | fBI->fPosition = backupPos; |
| 403 | aBoundary = fBI->handleNext(); |
| 404 | if (aBoundary != UBRK_DONE && aBoundary <= backupPos + 4) { |
| 405 | // +4 is a quick test for possibly having advanced only one codepoint. |
| 406 | // Four being the length of the longest potential code point, a supplementary in UTF-8 |
| 407 | utext_setNativeIndex(&fBI->fText, aBoundary); |
| 408 | if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) { |
| 409 | // The initial handleNext() only advanced by a single code point. Go again. |
| 410 | aBoundary = fBI->handleNext(); // Safe rules identify safe pairs. |
Jungshik Shin | a9a2bd3 | 2018-07-07 03:36:01 -0700 | [diff] [blame] | 411 | } |
Jungshik Shin | a9a2bd3 | 2018-07-07 03:36:01 -0700 | [diff] [blame] | 412 | } |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 413 | if (aBoundary == UBRK_DONE) { |
| 414 | // Note (Andy Heninger): I don't think this condition can occur, but it's hard |
| 415 | // to prove that it can't. We ran off the end of the string looking a boundary |
| 416 | // following a safe point; choose the end of the string as that boundary. |
| 417 | aBoundary = utext_nativeLength(&fBI->fText); |
| 418 | } |
| 419 | ruleStatusIndex = fBI->fRuleStatusIndex; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 420 | } |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 421 | } |
| 422 | |
| 423 | if (!retainCache) { |
| 424 | U_ASSERT(aBoundary != -1); |
Jungshik Shin | a9a2bd3 | 2018-07-07 03:36:01 -0700 | [diff] [blame] | 425 | reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 426 | } |
Jungshik Shin | a9a2bd3 | 2018-07-07 03:36:01 -0700 | [diff] [blame] | 427 | |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 428 | // Fill in boundaries between existing cache content and the new requested position. |
| 429 | |
| 430 | if (fBoundaries[fEndBufIdx] < position) { |
| 431 | // The last position in the cache precedes the requested position. |
| 432 | // Add following position(s) to the cache. |
| 433 | while (fBoundaries[fEndBufIdx] < position) { |
| 434 | if (!populateFollowing()) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 435 | UPRV_UNREACHABLE_EXIT; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 436 | } |
| 437 | } |
| 438 | fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. |
| 439 | fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. |
| 440 | while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. |
| 441 | previous(status); |
| 442 | } |
| 443 | return true; |
| 444 | } |
| 445 | |
| 446 | if (fBoundaries[fStartBufIdx] > position) { |
| 447 | // The first position in the cache is beyond the requested position. |
| 448 | // back up more until we get a boundary <= the requested position. |
| 449 | while (fBoundaries[fStartBufIdx] > position) { |
| 450 | populatePreceding(status); |
| 451 | } |
| 452 | fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. |
| 453 | fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. |
| 454 | while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. |
| 455 | next(); |
| 456 | } |
| 457 | if (fTextIdx > position) { |
| 458 | // If position is not itself a boundary, the next() loop above will overshoot. |
| 459 | // Back up one, leaving cache position at the boundary preceding the requested position. |
| 460 | previous(status); |
| 461 | } |
| 462 | return true; |
| 463 | } |
| 464 | |
| 465 | U_ASSERT(fTextIdx == position); |
| 466 | return true; |
| 467 | } |
| 468 | |
| 469 | |
| 470 | |
| 471 | UBool RuleBasedBreakIterator::BreakCache::populateFollowing() { |
| 472 | int32_t fromPosition = fBoundaries[fEndBufIdx]; |
| 473 | int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx]; |
| 474 | int32_t pos = 0; |
| 475 | int32_t ruleStatusIdx = 0; |
| 476 | |
| 477 | if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { |
| 478 | addFollowing(pos, ruleStatusIdx, UpdateCachePosition); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 479 | return true; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 480 | } |
| 481 | |
| 482 | fBI->fPosition = fromPosition; |
| 483 | pos = fBI->handleNext(); |
| 484 | if (pos == UBRK_DONE) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 485 | return false; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 486 | } |
| 487 | |
| 488 | ruleStatusIdx = fBI->fRuleStatusIndex; |
| 489 | if (fBI->fDictionaryCharCount > 0) { |
| 490 | // The text segment obtained from the rules includes dictionary characters. |
| 491 | // Subdivide it, with subdivided results going into the dictionary cache. |
| 492 | fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); |
| 493 | if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { |
| 494 | addFollowing(pos, ruleStatusIdx, UpdateCachePosition); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 495 | return true; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 496 | // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point. |
| 497 | // But be careful with interactions with populateNear(). |
| 498 | } |
| 499 | } |
| 500 | |
| 501 | // Rule based segment did not include dictionary characters. |
| 502 | // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, |
| 503 | // meaning that we didn't take the return, above. |
| 504 | // Add its end point to the cache. |
| 505 | addFollowing(pos, ruleStatusIdx, UpdateCachePosition); |
| 506 | |
| 507 | // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. |
| 508 | // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. |
| 509 | // |
| 510 | for (int count=0; count<6; ++count) { |
| 511 | pos = fBI->handleNext(); |
| 512 | if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) { |
| 513 | break; |
| 514 | } |
| 515 | addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition); |
| 516 | } |
| 517 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 518 | return true; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 519 | } |
| 520 | |
| 521 | |
| 522 | UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) { |
| 523 | if (U_FAILURE(status)) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 524 | return false; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 525 | } |
| 526 | |
| 527 | int32_t fromPosition = fBoundaries[fStartBufIdx]; |
| 528 | if (fromPosition == 0) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 529 | return false; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 530 | } |
| 531 | |
| 532 | int32_t position = 0; |
| 533 | int32_t positionStatusIdx = 0; |
| 534 | |
| 535 | if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) { |
| 536 | addPreceding(position, positionStatusIdx, UpdateCachePosition); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 537 | return true; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 538 | } |
| 539 | |
| 540 | int32_t backupPosition = fromPosition; |
| 541 | |
| 542 | // Find a boundary somewhere preceding the first already-cached boundary |
| 543 | do { |
| 544 | backupPosition = backupPosition - 30; |
| 545 | if (backupPosition <= 0) { |
| 546 | backupPosition = 0; |
| 547 | } else { |
Jungshik Shin | a9a2bd3 | 2018-07-07 03:36:01 -0700 | [diff] [blame] | 548 | backupPosition = fBI->handleSafePrevious(backupPosition); |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 549 | } |
| 550 | if (backupPosition == UBRK_DONE || backupPosition == 0) { |
| 551 | position = 0; |
| 552 | positionStatusIdx = 0; |
| 553 | } else { |
Jungshik Shin | a9a2bd3 | 2018-07-07 03:36:01 -0700 | [diff] [blame] | 554 | // Advance to the boundary following the backup position. |
| 555 | // There is a complication: the safe reverse rules identify pairs of code points |
| 556 | // that are safe. If advancing from the safe point moves forwards by less than |
| 557 | // two code points, we need to advance one more time to ensure that the boundary |
| 558 | // is good, including a correct rules status value. |
| 559 | // |
| 560 | fBI->fPosition = backupPosition; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 561 | position = fBI->handleNext(); |
Jungshik Shin | a9a2bd3 | 2018-07-07 03:36:01 -0700 | [diff] [blame] | 562 | if (position <= backupPosition + 4) { |
| 563 | // +4 is a quick test for possibly having advanced only one codepoint. |
| 564 | // Four being the length of the longest potential code point, a supplementary in UTF-8 |
| 565 | utext_setNativeIndex(&fBI->fText, position); |
| 566 | if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) { |
| 567 | // The initial handleNext() only advanced by a single code point. Go again. |
| 568 | position = fBI->handleNext(); // Safe rules identify safe pairs. |
| 569 | } |
Frank Tang | b869661 | 2019-10-25 14:58:21 -0700 | [diff] [blame] | 570 | } |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 571 | positionStatusIdx = fBI->fRuleStatusIndex; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 572 | } |
| 573 | } while (position >= fromPosition); |
| 574 | |
| 575 | // Find boundaries between the one we just located and the first already-cached boundary |
| 576 | // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.. |
| 577 | |
| 578 | fSideBuffer.removeAllElements(); |
| 579 | fSideBuffer.addElement(position, status); |
| 580 | fSideBuffer.addElement(positionStatusIdx, status); |
| 581 | |
| 582 | do { |
| 583 | int32_t prevPosition = fBI->fPosition = position; |
| 584 | int32_t prevStatusIdx = positionStatusIdx; |
| 585 | position = fBI->handleNext(); |
| 586 | positionStatusIdx = fBI->fRuleStatusIndex; |
| 587 | if (position == UBRK_DONE) { |
| 588 | break; |
| 589 | } |
| 590 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 591 | UBool segmentHandledByDictionary = false; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 592 | if (fBI->fDictionaryCharCount != 0) { |
| 593 | // Segment from the rules includes dictionary characters. |
| 594 | // Subdivide it, with subdivided results going into the dictionary cache. |
| 595 | int32_t dictSegEndPosition = position; |
| 596 | fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); |
| 597 | while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) { |
| 598 | segmentHandledByDictionary = true; |
| 599 | U_ASSERT(position > prevPosition); |
| 600 | if (position >= fromPosition) { |
| 601 | break; |
| 602 | } |
| 603 | U_ASSERT(position <= dictSegEndPosition); |
| 604 | fSideBuffer.addElement(position, status); |
| 605 | fSideBuffer.addElement(positionStatusIdx, status); |
| 606 | prevPosition = position; |
| 607 | } |
| 608 | U_ASSERT(position==dictSegEndPosition || position>=fromPosition); |
| 609 | } |
Jungshik Shin | a9a2bd3 | 2018-07-07 03:36:01 -0700 | [diff] [blame] | 610 | |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 611 | if (!segmentHandledByDictionary && position < fromPosition) { |
| 612 | fSideBuffer.addElement(position, status); |
| 613 | fSideBuffer.addElement(positionStatusIdx, status); |
| 614 | } |
| 615 | } while (position < fromPosition); |
| 616 | |
| 617 | // Move boundaries from the side buffer to the main circular buffer. |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 618 | UBool success = false; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 619 | if (!fSideBuffer.isEmpty()) { |
| 620 | positionStatusIdx = fSideBuffer.popi(); |
| 621 | position = fSideBuffer.popi(); |
| 622 | addPreceding(position, positionStatusIdx, UpdateCachePosition); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 623 | success = true; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 624 | } |
| 625 | |
| 626 | while (!fSideBuffer.isEmpty()) { |
| 627 | positionStatusIdx = fSideBuffer.popi(); |
| 628 | position = fSideBuffer.popi(); |
| 629 | if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { |
| 630 | // No space in circular buffer to hold a new preceding result while |
| 631 | // also retaining the current cache (iteration) position. |
| 632 | // Bailing out is safe; the cache will refill again if needed. |
| 633 | break; |
| 634 | } |
| 635 | } |
Jungshik Shin | a9a2bd3 | 2018-07-07 03:36:01 -0700 | [diff] [blame] | 636 | |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 637 | return success; |
| 638 | } |
| 639 | |
| 640 | |
| 641 | void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { |
| 642 | U_ASSERT(position > fBoundaries[fEndBufIdx]); |
| 643 | U_ASSERT(ruleStatusIdx <= UINT16_MAX); |
| 644 | int32_t nextIdx = modChunkSize(fEndBufIdx + 1); |
| 645 | if (nextIdx == fStartBufIdx) { |
| 646 | fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. |
| 647 | } |
| 648 | fBoundaries[nextIdx] = position; |
Jungshik Shin | 42d5027 | 2018-10-24 01:22:09 -0700 | [diff] [blame] | 649 | fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx); |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 650 | fEndBufIdx = nextIdx; |
| 651 | if (update == UpdateCachePosition) { |
| 652 | // Set current position to the newly added boundary. |
| 653 | fBufIdx = nextIdx; |
| 654 | fTextIdx = position; |
| 655 | } else { |
| 656 | // Retaining the original cache position. |
| 657 | // Check if the added boundary wraps around the buffer, and would over-write the original position. |
| 658 | // It's the responsibility of callers of this function to not add too many. |
| 659 | U_ASSERT(nextIdx != fBufIdx); |
| 660 | } |
| 661 | } |
| 662 | |
| 663 | bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { |
| 664 | U_ASSERT(position < fBoundaries[fStartBufIdx]); |
| 665 | U_ASSERT(ruleStatusIdx <= UINT16_MAX); |
| 666 | int32_t nextIdx = modChunkSize(fStartBufIdx - 1); |
| 667 | if (nextIdx == fEndBufIdx) { |
| 668 | if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { |
| 669 | // Failure. The insertion of the new boundary would claim the buffer position that is the |
| 670 | // current iteration position. And we also want to retain the current iteration position. |
| 671 | // (The buffer is already completely full of entries that precede the iteration position.) |
| 672 | return false; |
| 673 | } |
| 674 | fEndBufIdx = modChunkSize(fEndBufIdx - 1); |
| 675 | } |
| 676 | fBoundaries[nextIdx] = position; |
Jungshik Shin | 42d5027 | 2018-10-24 01:22:09 -0700 | [diff] [blame] | 677 | fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx); |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 678 | fStartBufIdx = nextIdx; |
| 679 | if (update == UpdateCachePosition) { |
| 680 | fBufIdx = nextIdx; |
| 681 | fTextIdx = position; |
| 682 | } |
| 683 | return true; |
| 684 | } |
| 685 | |
| 686 | |
| 687 | void RuleBasedBreakIterator::BreakCache::dumpCache() { |
| 688 | #ifdef RBBI_DEBUG |
| 689 | RBBIDebugPrintf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx); |
| 690 | for (int32_t i=fStartBufIdx; ; i=modChunkSize(i+1)) { |
| 691 | RBBIDebugPrintf("%d %d\n", i, fBoundaries[i]); |
| 692 | if (i == fEndBufIdx) { |
| 693 | break; |
| 694 | } |
| 695 | } |
| 696 | #endif |
| 697 | } |
| 698 | |
| 699 | U_NAMESPACE_END |
| 700 | |
| 701 | #endif // #if !UCONFIG_NO_BREAK_ITERATION |