blob: b3c3a38a0deced6a9fbfba6d3fe0bf8525baed7d [file] [log] [blame]
Jungshik Shinb3189662017-11-07 11:18:34 -08001// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3
4// file: rbbi_cache.cpp
5
6#include "unicode/utypes.h"
7
8#if !UCONFIG_NO_BREAK_ITERATION
9
10#include "unicode/ubrk.h"
11#include "unicode/rbbi.h"
12
13#include "rbbi_cache.h"
14
15#include "brkeng.h"
16#include "cmemory.h"
17#include "rbbidata.h"
18#include "rbbirb.h"
19#include "uassert.h"
20#include "uvectr32.h"
21
22U_NAMESPACE_BEGIN
23
24/*
25 * DictionaryCache implementation
26 */
27
28RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
29 fBI(bi), fBreaks(NULL), fPositionInCache(-1),
30 fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) {
31 fBreaks = new UVector32(status);
32}
33
34RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
35 delete fBreaks;
36 fBreaks = NULL;
37}
38
39void RuleBasedBreakIterator::DictionaryCache::reset() {
40 fPositionInCache = -1;
41 fStart = 0;
42 fLimit = 0;
43 fFirstRuleStatusIndex = 0;
44 fOtherRuleStatusIndex = 0;
45 fBreaks->removeAllElements();
46}
47
48UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
49 if (fromPos >= fLimit || fromPos < fStart) {
50 fPositionInCache = -1;
51 return FALSE;
52 }
53
54 // Sequential iteration, move from previous boundary to the following
55
56 int32_t r = 0;
57 if (fPositionInCache >= 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) {
58 ++fPositionInCache;
59 if (fPositionInCache >= fBreaks->size()) {
60 fPositionInCache = -1;
61 return FALSE;
62 }
63 r = fBreaks->elementAti(fPositionInCache);
64 U_ASSERT(r > fromPos);
65 *result = r;
66 *statusIndex = fOtherRuleStatusIndex;
67 return TRUE;
68 }
69
70 // Random indexing. Linear search for the boundary following the given position.
71
72 for (fPositionInCache = 0; fPositionInCache < fBreaks->size(); ++fPositionInCache) {
73 r= fBreaks->elementAti(fPositionInCache);
74 if (r > fromPos) {
75 *result = r;
76 *statusIndex = fOtherRuleStatusIndex;
77 return TRUE;
78 }
79 }
80 U_ASSERT(FALSE);
81 fPositionInCache = -1;
82 return FALSE;
83}
84
85
86UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
87 if (fromPos <= fStart || fromPos > fLimit) {
88 fPositionInCache = -1;
89 return FALSE;
90 }
91
92 if (fromPos == fLimit) {
93 fPositionInCache = fBreaks->size() - 1;
94 if (fPositionInCache >= 0) {
95 U_ASSERT(fBreaks->elementAti(fPositionInCache) == fromPos);
96 }
97 }
98
99 int32_t r;
100 if (fPositionInCache > 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) {
101 --fPositionInCache;
102 r = fBreaks->elementAti(fPositionInCache);
103 U_ASSERT(r < fromPos);
104 *result = r;
105 *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
106 return TRUE;
107 }
108
109 if (fPositionInCache == 0) {
110 fPositionInCache = -1;
111 return FALSE;
112 }
113
114 for (fPositionInCache = fBreaks->size()-1; fPositionInCache >= 0; --fPositionInCache) {
115 r = fBreaks->elementAti(fPositionInCache);
116 if (r < fromPos) {
117 *result = r;
118 *statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
119 return TRUE;
120 }
121 }
122 U_ASSERT(FALSE);
123 fPositionInCache = -1;
124 return FALSE;
125}
126
127void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
128 int32_t firstRuleStatus, int32_t otherRuleStatus) {
129 if ((endPos - startPos) <= 1) {
130 return;
131 }
132
133 reset();
134 fFirstRuleStatusIndex = firstRuleStatus;
135 fOtherRuleStatusIndex = otherRuleStatus;
136
137 int32_t rangeStart = startPos;
138 int32_t rangeEnd = endPos;
139
140 uint16_t category;
141 int32_t current;
142 UErrorCode status = U_ZERO_ERROR;
143 int32_t foundBreakCount = 0;
144 UText *text = fBI->fText;
145
146 // Loop through the text, looking for ranges of dictionary characters.
147 // For each span, find the appropriate break engine, and ask it to find
148 // any breaks within the span.
149
150 utext_setNativeIndex(text, rangeStart);
151 UChar32 c = utext_current32(text);
152 category = UTRIE2_GET16(fBI->fData->fTrie, c);
153
154 while(U_SUCCESS(status)) {
155 while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & 0x4000) == 0) {
156 utext_next32(text); // TODO: cleaner loop structure.
157 c = utext_current32(text);
158 category = UTRIE2_GET16(fBI->fData->fTrie, c);
159 }
160 if (current >= rangeEnd) {
161 break;
162 }
163
164 // We now have a dictionary character. Get the appropriate language object
165 // to deal with it.
166 const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c);
167
168 // Ask the language object if there are any breaks. It will add them to the cache and
169 // leave the text pointer on the other side of its range, ready to search for the next one.
170 if (lbe != NULL) {
171 foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, *fBreaks);
172 }
173
174 // Reload the loop variables for the next go-round
175 c = utext_current32(text);
176 category = UTRIE2_GET16(fBI->fData->fTrie, c);
177 }
178
179 // If we found breaks, ensure that the first and last entries are
180 // the original starting and ending position. And initialize the
181 // cache iteration position to the first entry.
182
183 // printf("foundBreakCount = %d\n", foundBreakCount);
184 if (foundBreakCount > 0) {
185 U_ASSERT(foundBreakCount == fBreaks->size());
186 if (startPos < fBreaks->elementAti(0)) {
187 // The dictionary did not place a boundary at the start of the segment of text.
188 // Add one now. This should not commonly happen, but it would be easy for interactions
189 // of the rules for dictionary segments and the break engine implementations to
190 // inadvertently cause it. Cover it here, just in case.
191 fBreaks->insertElementAt(startPos, 0, status);
192 }
193 if (endPos > fBreaks->peeki()) {
194 fBreaks->push(endPos, status);
195 }
196 fPositionInCache = 0;
197 // Note: Dictionary matching may extend beyond the original limit.
198 fStart = fBreaks->elementAti(0);
199 fLimit = fBreaks->peeki();
200 } else {
201 // there were no language-based breaks, even though the segment contained
202 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
203 // for this range will fail, and the calling code will fall back to the rule based boundaries.
204 }
205}
206
207
208/*
209 * BreakCache implemetation
210 */
211
212RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
213 fBI(bi), fSideBuffer(status) {
214 reset();
215}
216
217
218RuleBasedBreakIterator::BreakCache::~BreakCache() {
219}
220
221
222void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) {
223 fStartBufIdx = 0;
224 fEndBufIdx = 0;
225 fTextIdx = pos;
226 fBufIdx = 0;
227 fBoundaries[0] = pos;
228 fStatuses[0] = (uint16_t)ruleStatus;
229}
230
231
232int32_t RuleBasedBreakIterator::BreakCache::current() {
233 fBI->fPosition = fTextIdx;
234 fBI->fRuleStatusIndex = fStatuses[fBufIdx];
235 fBI->fDone = FALSE;
236 return fTextIdx;
237}
238
239
240void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) {
241 if (U_FAILURE(status)) {
242 return;
243 }
244 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
245 // startPos is in the cache. Do a next() from that position.
246 // TODO: an awkward set of interactions with bi->fDone
247 // seek() does not clear it; it can't because of interactions with populateNear().
248 // next() does not clear it in the fast-path case, where everything matters. Maybe it should.
249 // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
250 fBI->fDone = false;
251 next();
252 }
253 return;
254}
255
256
257void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) {
258 if (U_FAILURE(status)) {
259 return;
260 }
261 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
262 if (startPos == fTextIdx) {
263 previous(status);
264 } else {
265 // seek() leaves the BreakCache positioned at the preceding boundary
266 // if the requested position is between two bounaries.
267 // current() pushes the BreakCache position out to the BreakIterator itself.
268 U_ASSERT(startPos > fTextIdx);
269 current();
270 }
271 }
272 return;
273}
274
275
276/*
277 * Out-of-line code for BreakCache::next().
278 * Cache does not already contain the boundary
279 */
280void RuleBasedBreakIterator::BreakCache::nextOL() {
281 fBI->fDone = !populateFollowing();
282 fBI->fPosition = fTextIdx;
283 fBI->fRuleStatusIndex = fStatuses[fBufIdx];
284 return;
285}
286
287
288void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
289 if (U_FAILURE(status)) {
290 return;
291 }
292 int32_t initialBufIdx = fBufIdx;
293 if (fBufIdx == fStartBufIdx) {
294 // At start of cache. Prepend to it.
295 populatePreceding(status);
296 } else {
297 // Cache already holds the next boundary
298 fBufIdx = modChunkSize(fBufIdx - 1);
299 fTextIdx = fBoundaries[fBufIdx];
300 }
301 fBI->fDone = (fBufIdx == initialBufIdx);
302 fBI->fPosition = fTextIdx;
303 fBI->fRuleStatusIndex = fStatuses[fBufIdx];
304 return;
305}
306
307
308UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
309 if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
310 return FALSE;
311 }
312 if (pos == fBoundaries[fStartBufIdx]) {
313 // Common case: seek(0), from BreakIterator::first()
314 fBufIdx = fStartBufIdx;
315 fTextIdx = fBoundaries[fBufIdx];
316 return TRUE;
317 }
318 if (pos == fBoundaries[fEndBufIdx]) {
319 fBufIdx = fEndBufIdx;
320 fTextIdx = fBoundaries[fBufIdx];
321 return TRUE;
322 }
323
324 int32_t min = fStartBufIdx;
325 int32_t max = fEndBufIdx;
326 while (min != max) {
327 int32_t probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
328 probe = modChunkSize(probe);
329 if (fBoundaries[probe] > pos) {
330 max = probe;
331 } else {
332 min = modChunkSize(probe + 1);
333 }
334 }
335 U_ASSERT(fBoundaries[max] > pos);
336 fBufIdx = modChunkSize(max - 1);
337 fTextIdx = fBoundaries[fBufIdx];
338 U_ASSERT(fTextIdx <= pos);
339 return TRUE;
340}
341
342
343UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) {
344 if (U_FAILURE(status)) {
345 return FALSE;
346 }
347 U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
348
349 // Find a boundary somewhere in the vicinity of the requested position.
350 // Depending on the safe rules and the text data, it could be either before, at, or after
351 // the requested position.
352
353
354 // If the requested position is not near already cached positions, clear the existing cache,
355 // find a near-by boundary and begin new cache contents there.
356
357 if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
358 int32_t aBoundary = 0;
359 int32_t ruleStatusIndex = 0;
360 // TODO: check for position == length of text. Although may still need to back up to get rule status.
361 if (position > 20) {
362 int32_t backupPos = fBI->handlePrevious(position);
363 fBI->fPosition = backupPos;
364 aBoundary = fBI->handleNext(); // Ignore dictionary, just finding a rule based boundary.
365 ruleStatusIndex = fBI->fRuleStatusIndex;
366 }
367 reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
368 }
369
370 // Fill in boundaries between existing cache content and the new requested position.
371
372 if (fBoundaries[fEndBufIdx] < position) {
373 // The last position in the cache precedes the requested position.
374 // Add following position(s) to the cache.
375 while (fBoundaries[fEndBufIdx] < position) {
376 if (!populateFollowing()) {
377 U_ASSERT(false);
378 return false;
379 }
380 }
381 fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer.
382 fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries.
383 while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos.
384 previous(status);
385 }
386 return true;
387 }
388
389 if (fBoundaries[fStartBufIdx] > position) {
390 // The first position in the cache is beyond the requested position.
391 // back up more until we get a boundary <= the requested position.
392 while (fBoundaries[fStartBufIdx] > position) {
393 populatePreceding(status);
394 }
395 fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer.
396 fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries.
397 while (fTextIdx < position) { // Move forwards to a position at or following the requested pos.
398 next();
399 }
400 if (fTextIdx > position) {
401 // If position is not itself a boundary, the next() loop above will overshoot.
402 // Back up one, leaving cache position at the boundary preceding the requested position.
403 previous(status);
404 }
405 return true;
406 }
407
408 U_ASSERT(fTextIdx == position);
409 return true;
410}
411
412
413
414UBool RuleBasedBreakIterator::BreakCache::populateFollowing() {
415 int32_t fromPosition = fBoundaries[fEndBufIdx];
416 int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx];
417 int32_t pos = 0;
418 int32_t ruleStatusIdx = 0;
419
420 if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
421 addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
422 return TRUE;
423 }
424
425 fBI->fPosition = fromPosition;
426 pos = fBI->handleNext();
427 if (pos == UBRK_DONE) {
428 return FALSE;
429 }
430
431 ruleStatusIdx = fBI->fRuleStatusIndex;
432 if (fBI->fDictionaryCharCount > 0) {
433 // The text segment obtained from the rules includes dictionary characters.
434 // Subdivide it, with subdivided results going into the dictionary cache.
435 fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
436 if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
437 addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
438 return TRUE;
439 // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point.
440 // But be careful with interactions with populateNear().
441 }
442 }
443
444 // Rule based segment did not include dictionary characters.
445 // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
446 // meaning that we didn't take the return, above.
447 // Add its end point to the cache.
448 addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
449
450 // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
451 // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
452 //
453 for (int count=0; count<6; ++count) {
454 pos = fBI->handleNext();
455 if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) {
456 break;
457 }
458 addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition);
459 }
460
461 return TRUE;
462}
463
464
465UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) {
466 if (U_FAILURE(status)) {
467 return FALSE;
468 }
469
470 int32_t fromPosition = fBoundaries[fStartBufIdx];
471 if (fromPosition == 0) {
472 return FALSE;
473 }
474
475 int32_t position = 0;
476 int32_t positionStatusIdx = 0;
477
478 if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) {
479 addPreceding(position, positionStatusIdx, UpdateCachePosition);
480 return TRUE;
481 }
482
483 int32_t backupPosition = fromPosition;
484
485 // Find a boundary somewhere preceding the first already-cached boundary
486 do {
487 backupPosition = backupPosition - 30;
488 if (backupPosition <= 0) {
489 backupPosition = 0;
490 } else {
491 backupPosition = fBI->handlePrevious(backupPosition);
492 }
493 if (backupPosition == UBRK_DONE || backupPosition == 0) {
494 position = 0;
495 positionStatusIdx = 0;
496 } else {
497 fBI->fPosition = backupPosition; // TODO: pass starting position in a clearer way.
498 position = fBI->handleNext();
499 positionStatusIdx = fBI->fRuleStatusIndex;
500
501 }
502 } while (position >= fromPosition);
503
504 // Find boundaries between the one we just located and the first already-cached boundary
505 // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer..
506
507 fSideBuffer.removeAllElements();
508 fSideBuffer.addElement(position, status);
509 fSideBuffer.addElement(positionStatusIdx, status);
510
511 do {
512 int32_t prevPosition = fBI->fPosition = position;
513 int32_t prevStatusIdx = positionStatusIdx;
514 position = fBI->handleNext();
515 positionStatusIdx = fBI->fRuleStatusIndex;
516 if (position == UBRK_DONE) {
517 break;
518 }
519
520 UBool segmentHandledByDictionary = FALSE;
521 if (fBI->fDictionaryCharCount != 0) {
522 // Segment from the rules includes dictionary characters.
523 // Subdivide it, with subdivided results going into the dictionary cache.
524 int32_t dictSegEndPosition = position;
525 fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
526 while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) {
527 segmentHandledByDictionary = true;
528 U_ASSERT(position > prevPosition);
529 if (position >= fromPosition) {
530 break;
531 }
532 U_ASSERT(position <= dictSegEndPosition);
533 fSideBuffer.addElement(position, status);
534 fSideBuffer.addElement(positionStatusIdx, status);
535 prevPosition = position;
536 }
537 U_ASSERT(position==dictSegEndPosition || position>=fromPosition);
538 }
539
540 if (!segmentHandledByDictionary && position < fromPosition) {
541 fSideBuffer.addElement(position, status);
542 fSideBuffer.addElement(positionStatusIdx, status);
543 }
544 } while (position < fromPosition);
545
546 // Move boundaries from the side buffer to the main circular buffer.
547 UBool success = FALSE;
548 if (!fSideBuffer.isEmpty()) {
549 positionStatusIdx = fSideBuffer.popi();
550 position = fSideBuffer.popi();
551 addPreceding(position, positionStatusIdx, UpdateCachePosition);
552 success = TRUE;
553 }
554
555 while (!fSideBuffer.isEmpty()) {
556 positionStatusIdx = fSideBuffer.popi();
557 position = fSideBuffer.popi();
558 if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
559 // No space in circular buffer to hold a new preceding result while
560 // also retaining the current cache (iteration) position.
561 // Bailing out is safe; the cache will refill again if needed.
562 break;
563 }
564 }
565
566 return success;
567}
568
569
570void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
571 U_ASSERT(position > fBoundaries[fEndBufIdx]);
572 U_ASSERT(ruleStatusIdx <= UINT16_MAX);
573 int32_t nextIdx = modChunkSize(fEndBufIdx + 1);
574 if (nextIdx == fStartBufIdx) {
575 fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1.
576 }
577 fBoundaries[nextIdx] = position;
578 fStatuses[nextIdx] = ruleStatusIdx;
579 fEndBufIdx = nextIdx;
580 if (update == UpdateCachePosition) {
581 // Set current position to the newly added boundary.
582 fBufIdx = nextIdx;
583 fTextIdx = position;
584 } else {
585 // Retaining the original cache position.
586 // Check if the added boundary wraps around the buffer, and would over-write the original position.
587 // It's the responsibility of callers of this function to not add too many.
588 U_ASSERT(nextIdx != fBufIdx);
589 }
590}
591
592bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
593 U_ASSERT(position < fBoundaries[fStartBufIdx]);
594 U_ASSERT(ruleStatusIdx <= UINT16_MAX);
595 int32_t nextIdx = modChunkSize(fStartBufIdx - 1);
596 if (nextIdx == fEndBufIdx) {
597 if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
598 // Failure. The insertion of the new boundary would claim the buffer position that is the
599 // current iteration position. And we also want to retain the current iteration position.
600 // (The buffer is already completely full of entries that precede the iteration position.)
601 return false;
602 }
603 fEndBufIdx = modChunkSize(fEndBufIdx - 1);
604 }
605 fBoundaries[nextIdx] = position;
606 fStatuses[nextIdx] = ruleStatusIdx;
607 fStartBufIdx = nextIdx;
608 if (update == UpdateCachePosition) {
609 fBufIdx = nextIdx;
610 fTextIdx = position;
611 }
612 return true;
613}
614
615
616void RuleBasedBreakIterator::BreakCache::dumpCache() {
617#ifdef RBBI_DEBUG
618 RBBIDebugPrintf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx);
619 for (int32_t i=fStartBufIdx; ; i=modChunkSize(i+1)) {
620 RBBIDebugPrintf("%d %d\n", i, fBoundaries[i]);
621 if (i == fEndBufIdx) {
622 break;
623 }
624 }
625#endif
626}
627
628U_NAMESPACE_END
629
630#endif // #if !UCONFIG_NO_BREAK_ITERATION