Blame - source/common/rbbi_cache.cpp - chromium.googlesource.com/chromium/deps/icu

2017-11-07 11:18:34 -0800

[diff] [blame]

1

2

// License & terms of use: http://www.unicode.org/copyright.html

3

4

// file: rbbi_cache.cpp

5

6

#include "unicode/utypes.h"

7

8

#if !UCONFIG_NO_BREAK_ITERATION

9

10

#include "unicode/ubrk.h"

11

#include "unicode/rbbi.h"

12

13

#include "rbbi_cache.h"

#include "brkeng.h"

#include "cmemory.h"

#include "rbbidata.h"

18

#include "rbbirb.h"

19

#include "uassert.h"

20

#include "uvectr32.h"

U_NAMESPACE_BEGIN

/*
 * DictionaryCache implementation
 */

27

28

// Construct an empty dictionary cache attached to the break iterator `bi`.
// `status` is handed to the constructor of the fBreaks vector; the cache
// starts with no current position (-1) and an empty [fStart, fLimit) range.
RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status)
    : fBI(bi),
      fBreaks(status),
      fPositionInCache(-1),
      fStart(0),
      fLimit(0),
      fFirstRuleStatusIndex(0),
      fOtherRuleStatusIndex(0) {
}

32

33

// Nothing to release explicitly; the members clean up after themselves.
RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
}

35

36

void RuleBasedBreakIterator::DictionaryCache::reset() {

37

fPositionInCache = -1;

38

fStart = 0;

39

fLimit = 0;

40

fFirstRuleStatusIndex = 0;

41

fOtherRuleStatusIndex = 0;

Jungshik Shin

2018-05-04 13:00:45 -0700

[diff] [blame]

42

fBreaks.removeAllElements();

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

43

}

44

45

// Look up the cached boundary that follows fromPos.
//
// On success, stores the boundary in *result, stores fOtherRuleStatusIndex in
// *statusIndex, leaves fPositionInCache on the returned entry and returns true.
// Returns false (with fPositionInCache set to -1) when fromPos lies outside
// [fStart, fLimit) or when fromPos is the last cached entry; the output
// parameters are left untouched in that case.
UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
    const bool inRange = (fromPos >= fStart) && (fromPos < fLimit);
    if (!inRange) {
        fPositionInCache = -1;
        return false;
    }

    // Sequential iteration: the cache position is already sitting on fromPos,
    // so the answer is simply the next entry.
    const bool onCachedBoundary = fPositionInCache >= 0
            && fPositionInCache < fBreaks.size()
            && fBreaks.elementAti(fPositionInCache) == fromPos;
    if (onCachedBoundary) {
        ++fPositionInCache;
        if (fPositionInCache >= fBreaks.size()) {
            fPositionInCache = -1;
            return false;
        }
        const int32_t nextBoundary = fBreaks.elementAti(fPositionInCache);
        U_ASSERT(nextBoundary > fromPos);
        *result = nextBoundary;
        *statusIndex = fOtherRuleStatusIndex;
        return true;
    }

    // Random access: scan from the front for the first entry beyond fromPos.
    for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
        const int32_t candidate = fBreaks.elementAti(fPositionInCache);
        if (candidate <= fromPos) {
            continue;
        }
        *result = candidate;
        *statusIndex = fOtherRuleStatusIndex;
        return true;
    }
    UPRV_UNREACHABLE_EXIT;
}

// Look up the cached boundary that precedes fromPos.
//
// On success, stores the boundary in *result and its rule status index in
// *statusIndex (fFirstRuleStatusIndex when the boundary equals fStart,
// fOtherRuleStatusIndex otherwise), leaves fPositionInCache on the returned
// entry and returns true. Returns false (with fPositionInCache set to -1)
// when fromPos lies outside (fStart, fLimit] or when the cache position is
// already on the first entry.
UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
    const bool inRange = (fromPos > fStart) && (fromPos <= fLimit);
    if (!inRange) {
        fPositionInCache = -1;
        return false;
    }

    if (fromPos == fLimit) {
        // Starting from the limit: jump straight to the last cached entry.
        fPositionInCache = fBreaks.size() - 1;
        U_ASSERT(fPositionInCache < 0 || fBreaks.elementAti(fPositionInCache) == fromPos);
    }

    // Sequential iteration: the cache position is on fromPos, step back one entry.
    const bool onCachedBoundary = fPositionInCache > 0
            && fPositionInCache < fBreaks.size()
            && fBreaks.elementAti(fPositionInCache) == fromPos;
    if (onCachedBoundary) {
        --fPositionInCache;
        const int32_t prevBoundary = fBreaks.elementAti(fPositionInCache);
        U_ASSERT(prevBoundary < fromPos);
        *result = prevBoundary;
        if (prevBoundary == fStart) {
            *statusIndex = fFirstRuleStatusIndex;
        } else {
            *statusIndex = fOtherRuleStatusIndex;
        }
        return true;
    }

    if (fPositionInCache == 0) {
        fPositionInCache = -1;
        return false;
    }

    // Random access: scan from the back for the last entry before fromPos.
    for (fPositionInCache = fBreaks.size() - 1; fPositionInCache >= 0; --fPositionInCache) {
        const int32_t candidate = fBreaks.elementAti(fPositionInCache);
        if (candidate >= fromPos) {
            continue;
        }
        *result = candidate;
        if (candidate == fStart) {
            *statusIndex = fFirstRuleStatusIndex;
        } else {
            *statusIndex = fOtherRuleStatusIndex;
        }
        return true;
    }
    UPRV_UNREACHABLE_EXIT;
}

119

120

void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,

121

int32_t firstRuleStatus, int32_t otherRuleStatus) {

122

if ((endPos - startPos) <= 1) {

return;

}

reset();

fFirstRuleStatusIndex = firstRuleStatus;

128

fOtherRuleStatusIndex = otherRuleStatus;

129

130

int32_t rangeStart = startPos;

131

int32_t rangeEnd = endPos;

uint16_t category;

int32_t current;

UErrorCode status = U_ZERO_ERROR;

136

int32_t foundBreakCount = 0;

Jungshik Shin

2018-05-04 13:00:45 -0700

[diff] [blame]

137

UText *text = &fBI->fText;

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

138

139

// Loop through the text, looking for ranges of dictionary characters.

140

// For each span, find the appropriate break engine, and ask it to find

141

// any breaks within the span.

142

143

utext_setNativeIndex(text, rangeStart);

144

UChar32 c = utext_current32(text);

Frank Tang

2020-10-30 19:02:04 -0700

[diff] [blame]

145

category = ucptrie_get(fBI->fData->fTrie, c);

146

uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart;

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

147

148

while(U_SUCCESS(status)) {

Frank Tang

2020-10-30 19:02:04 -0700

[diff] [blame]

149

while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd

150

&& (category < dictStart)) {

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

151

utext_next32(text); // TODO: cleaner loop structure.

152

c = utext_current32(text);

Frank Tang

2020-10-30 19:02:04 -0700

[diff] [blame]

153

category = ucptrie_get(fBI->fData->fTrie, c);

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

154

}

155

if (current >= rangeEnd) {

break;

}

// We now have a dictionary character. Get the appropriate language object

160

// to deal with it.

161

const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c);

162

163

// Ask the language object if there are any breaks. It will add them to the cache and

164

// leave the text pointer on the other side of its range, ready to search for the next one.

165

if (lbe != NULL) {

Frank Tang

d2858cb

2022-04-08 20:34:12 -0700

[diff] [blame]

166

foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

167

}

168

169

// Reload the loop variables for the next go-round

170

c = utext_current32(text);

Frank Tang

2020-10-30 19:02:04 -0700

[diff] [blame]

171

category = ucptrie_get(fBI->fData->fTrie, c);

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

172

}

173

174

// If we found breaks, ensure that the first and last entries are

175

// the original starting and ending position. And initialize the

176

// cache iteration position to the first entry.

177

178

// printf("foundBreakCount = %d\n", foundBreakCount);

179

if (foundBreakCount > 0) {

Jungshik Shin

2018-05-04 13:00:45 -0700

[diff] [blame]

180

U_ASSERT(foundBreakCount == fBreaks.size());

181

if (startPos < fBreaks.elementAti(0)) {

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

182

// The dictionary did not place a boundary at the start of the segment of text.

183

// Add one now. This should not commonly happen, but it would be easy for interactions

184

// of the rules for dictionary segments and the break engine implementations to

185

// inadvertently cause it. Cover it here, just in case.

Jungshik Shin

2018-05-04 13:00:45 -0700

[diff] [blame]

186

fBreaks.insertElementAt(startPos, 0, status);

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

187

}

Jungshik Shin

2018-05-04 13:00:45 -0700

[diff] [blame]

188

if (endPos > fBreaks.peeki()) {

189

fBreaks.push(endPos, status);

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

190

}

191

fPositionInCache = 0;

192

// Note: Dictionary matching may extend beyond the original limit.

Jungshik Shin

2018-05-04 13:00:45 -0700

[diff] [blame]

193

fStart = fBreaks.elementAti(0);

194

fLimit = fBreaks.peeki();

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

195

} else {

196

// there were no language-based breaks, even though the segment contained

197

// dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache

198

// for this range will fail, and the calling code will fall back to the rule based boundaries.

}

}

/*
 * BreakCache implementation
 */

206

Jungshik Shin

2018-07-07 03:36:01 -0700

[diff] [blame]

207

// Construct a break cache attached to the break iterator `bi`. `status` is
// handed to the constructor of the side buffer; the circular buffer is then
// initialized through reset() called with its default arguments.
RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status)
    : fBI(bi),
      fSideBuffer(status) {
    reset();
}

// Nothing to release explicitly; the members clean up after themselves.
RuleBasedBreakIterator::BreakCache::~BreakCache() {
}

void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) {

fStartBufIdx = 0;

fEndBufIdx = 0;

fTextIdx = pos;

fBufIdx = 0;

fBoundaries[0] = pos;

223

fStatuses[0] = (uint16_t)ruleStatus;

}

int32_t RuleBasedBreakIterator::BreakCache::current() {

228

fBI->fPosition = fTextIdx;

229

fBI->fRuleStatusIndex = fStatuses[fBufIdx];

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

230

fBI->fDone = false;

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

return fTextIdx;

}

void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) {

236

if (U_FAILURE(status)) {

237

return;

238

}

239

if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {

240

// startPos is in the cache. Do a next() from that position.

241

// TODO: an awkward set of interactions with bi->fDone

242

// seek() does not clear it; it can't because of interactions with populateNear().

243

// next() does not clear it in the fast-path case, where everything matters. Maybe it should.

244

// So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.

fBI->fDone = false;

next();

}

return;

}

void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) {

253

if (U_FAILURE(status)) {

254

return;

255

}

256

if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {

257

if (startPos == fTextIdx) {

258

previous(status);

259

} else {

260

// seek() leaves the BreakCache positioned at the preceding boundary

Frank Tang

7e7574b

2021-04-13 21:19:13 -0700

[diff] [blame]

261

// if the requested position is between two boundaries.

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

262

// current() pushes the BreakCache position out to the BreakIterator itself.

263

U_ASSERT(startPos > fTextIdx);

current();

}

}

return;

}

/*
 * Out-of-line code for BreakCache::next().
 * Cache does not already contain the boundary
 */

275

void RuleBasedBreakIterator::BreakCache::nextOL() {

276

fBI->fDone = !populateFollowing();

277

fBI->fPosition = fTextIdx;

278

fBI->fRuleStatusIndex = fStatuses[fBufIdx];

return;

}

// Step the cache position back by one boundary and publish the result to the
// break iterator. When already at the start of the cache, populatePreceding()
// is asked to prepend earlier boundaries. The iterator's done flag is set when
// the buffer index did not move. Does nothing if status is already a failure.
void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }

    const int32_t startingBufIdx = fBufIdx;
    if (fBufIdx != fStartBufIdx) {
        // Cache already holds the preceding boundary.
        fBufIdx = modChunkSize(fBufIdx - 1);
        fTextIdx = fBoundaries[fBufIdx];
    } else {
        // At start of cache. Prepend to it.
        populatePreceding(status);
    }

    fBI->fDone = (fBufIdx == startingBufIdx);
    fBI->fPosition = fTextIdx;
    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
}

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

301

302

303

// Position the cache at `pos` if it falls inside the cached range.
//
// Returns false, leaving the cache position alone, when pos is before the
// first or after the last cached boundary. Otherwise returns true with
// fBufIdx / fTextIdx on the boundary equal to pos, or, when pos is not itself
// a cached boundary, on the closest boundary preceding it.
UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
    const int32_t firstBoundary = fBoundaries[fStartBufIdx];
    const int32_t lastBoundary = fBoundaries[fEndBufIdx];
    if (pos < firstBoundary || pos > lastBoundary) {
        return false;
    }

    if (pos == firstBoundary) {
        // Common case: seek(0), from BreakIterator::first()
        fBufIdx = fStartBufIdx;
        fTextIdx = fBoundaries[fBufIdx];
        return true;
    }
    if (pos == lastBoundary) {
        fBufIdx = fEndBufIdx;
        fTextIdx = fBoundaries[fBufIdx];
        return true;
    }

    // Binary search over the circular buffer for the first boundary > pos.
    // When the live region wraps (lo > hi), the midpoint is computed as if the
    // upper index were unwrapped, then folded back into the buffer.
    int32_t lo = fStartBufIdx;
    int32_t hi = fEndBufIdx;
    while (lo != hi) {
        int32_t sum = lo + hi;
        if (lo > hi) {
            sum += CACHE_SIZE;
        }
        const int32_t mid = modChunkSize(sum / 2);
        if (fBoundaries[mid] > pos) {
            hi = mid;
        } else {
            lo = modChunkSize(mid + 1);
        }
    }
    U_ASSERT(fBoundaries[hi] > pos);

    // The entry just before the first boundary > pos is the answer.
    fBufIdx = modChunkSize(hi - 1);
    fTextIdx = fBoundaries[fBufIdx];
    U_ASSERT(fTextIdx <= pos);
    return true;
}

// Add boundaries to the cache near the specified position.
//
// The given position need not be a boundary itself. It must be within the
// range of the text, on a code point boundary, and outside the range that is
// currently cached (asserted below).
//
// If the requested position is a break boundary, the iteration position is
// left on it. If it is not a boundary, the iteration position is left on the
// preceding boundary, and both the preceding and following boundaries are
// included in the cache. Additional boundaries, either preceding or following,
// may be added to the cache as a side effect.
//
// If the requested position is not near already cached positions, the existing
// cache is cleared, a near-by boundary is found, and new cache contents begin
// there.
//
// Returns true when the cache has been positioned. Returns false when status
// is a failure on entry, or when a failure while extending the cache prevents
// it from reaching the requested position.
UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }
    U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);

    // Threshold for a text position to be considered near to existing cache contents.
    // TODO: See issue ICU-22024 "perf tuning of Cache needed."
    //       This value is subject to change. See the ticket for more details.
    static constexpr int32_t CACHE_NEAR = 15;

    int32_t aBoundary = -1;
    int32_t ruleStatusIndex = 0;
    bool retainCache = false;
    if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
        // Requested position is near the existing cache. Retain it.
        retainCache = true;
    } else if (position <= CACHE_NEAR) {
        // Requested position is near the start of the text. Fill cache from start, skipping
        // the need to find a safe point.
        retainCache = false;
        aBoundary = 0;
    } else {
        // Requested position is not near the existing cache.
        // Find a safe point to refill the cache from.
        int32_t backupPos = fBI->handleSafePrevious(position);

        if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
            // The requested position is beyond the end of the existing cache, but the
            // reverse rules produced a position near or before the cached region.
            // Retain the existing cache, and fill from the end of it.
            retainCache = true;
        } else if (backupPos < CACHE_NEAR) {
            // The safe reverse rules moved us to near the start of text.
            // Take that (index 0) as the backup boundary, avoiding the complication
            // (in the following block) of moving forward from the safe point to a known boundary.
            //
            // Retain the cache if it begins not too far from the requested position.
            aBoundary = 0;
            retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
        } else {
            // The safe reverse rules produced a position that is neither near the existing
            // cache, nor near the start of text.
            // Advance to the boundary following.
            // There is a complication: the safe reverse rules identify pairs of code points
            // that are safe. If advancing from the safe point moves forwards by less than
            // two code points, we need to advance one more time to ensure that the boundary
            // is good, including a correct rules status value.
            retainCache = false;
            fBI->fPosition = backupPos;
            aBoundary = fBI->handleNext();
            if (aBoundary != UBRK_DONE && aBoundary <= backupPos + 4) {
                // +4 is a quick test for possibly having advanced only one codepoint.
                // Four being the length of the longest potential code point, a supplementary in UTF-8
                utext_setNativeIndex(&fBI->fText, aBoundary);
                if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
                    // The initial handleNext() only advanced by a single code point. Go again.
                    aBoundary = fBI->handleNext();   // Safe rules identify safe pairs.
                }
            }
            if (aBoundary == UBRK_DONE) {
                // Note (Andy Heninger): I don't think this condition can occur, but it's hard
                // to prove that it can't. We ran off the end of the string looking a boundary
                // following a safe point; choose the end of the string as that boundary.
                aBoundary = static_cast<int32_t>(utext_nativeLength(&fBI->fText));
            }
            ruleStatusIndex = fBI->fRuleStatusIndex;
        }
    }

    if (!retainCache) {
        U_ASSERT(aBoundary != -1);
        reset(aBoundary, ruleStatusIndex);   // Reset cache to hold aBoundary as a single starting point.
    }

    // Fill in boundaries between existing cache content and the new requested position.

    if (fBoundaries[fEndBufIdx] < position) {
        // The last position in the cache precedes the requested position.
        // Add following position(s) to the cache.
        while (fBoundaries[fEndBufIdx] < position) {
            if (!populateFollowing()) {
                UPRV_UNREACHABLE_EXIT;
            }
        }
        fBufIdx = fEndBufIdx;                // Set iterator position to the end of the buffer.
        fTextIdx = fBoundaries[fBufIdx];     // Required because populateFollowing may add extra boundaries.
        while (fTextIdx > position) {        // Move backwards to a position at or preceding the requested pos.
            previous(status);
            if (U_FAILURE(status)) {
                // previous() is a no-op once status has failed, so fTextIdx would
                // never change again; give up rather than loop forever.
                return false;
            }
        }
        return true;
    }

    if (fBoundaries[fStartBufIdx] > position) {
        // The first position in the cache is beyond the requested position.
        // back up more until we get a boundary <= the requested position.
        while (fBoundaries[fStartBufIdx] > position) {
            if (!populatePreceding(status)) {
                // Nothing was prepended (populatePreceding() returns false without
                // adding anything, e.g. once status has failed), so the loop
                // condition could never change; give up rather than loop forever.
                return false;
            }
        }
        fBufIdx = fStartBufIdx;              // Set iterator position to the start of the buffer.
        fTextIdx = fBoundaries[fBufIdx];     // Required because populatePreceding may add extra boundaries.
        while (fTextIdx < position) {        // Move forwards to a position at or following the requested pos.
            next();
        }
        if (fTextIdx > position) {
            // If position is not itself a boundary, the next() loop above will overshoot.
            // Back up one, leaving cache position at the boundary preceding the requested position.
            previous(status);
        }
        return true;
    }

    U_ASSERT(fTextIdx == position);
    return true;
}

// Append the boundary following the last cached one, taken from the dictionary
// cache when it has an answer and from the break rules otherwise, and move the
// cache position onto it. Returns false only when the rules report UBRK_DONE
// from the last cached boundary.
//
// When the new boundary came straight from the rules (no dictionary
// subdivision), up to six further rule-based boundaries are appended without
// moving the cache position, so that subsequent next() calls hit the cache.
UBool RuleBasedBreakIterator::BreakCache::populateFollowing() {
    const int32_t fromPosition = fBoundaries[fEndBufIdx];
    const int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx];
    int32_t pos = 0;
    int32_t ruleStatusIdx = 0;

    // First choice: the dictionary cache may already know the next boundary.
    if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
        addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
        return true;
    }

    // Otherwise run the rules forward from the last cached boundary.
    fBI->fPosition = fromPosition;
    pos = fBI->handleNext();
    if (pos == UBRK_DONE) {
        return false;
    }
    ruleStatusIdx = fBI->fRuleStatusIndex;

    bool subdividedByDictionary = false;
    if (fBI->fDictionaryCharCount > 0) {
        // The text segment obtained from the rules includes dictionary characters.
        // Subdivide it, with subdivided results going into the dictionary cache.
        // If the dictionary lookup fails, pos and ruleStatusIdx keep the values
        // obtained from the rules.
        fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
        subdividedByDictionary = fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx);
        // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point.
        //       But be careful with interactions with populateNear().
    }

    // Add the boundary (dictionary-provided, or the end point of the rule based segment).
    addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
    if (subdividedByDictionary) {
        return true;
    }

    // Rule based segment did not include dictionary characters, or it did but the
    // dictionary segmenter didn't handle them.
    // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
    // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.)
    for (int remaining = 6; remaining > 0; --remaining) {
        pos = fBI->handleNext();
        if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) {
            break;
        }
        addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition);
    }

    return true;
}

// Prepend one or more boundaries preceding the first cached boundary, and move
// the cache position onto the one immediately preceding it.
//
// Returns false when status is already a failure, when the first cached
// boundary is 0, or when no boundary could be collected in the side buffer;
// returns true otherwise.
UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }

    const int32_t fromPosition = fBoundaries[fStartBufIdx];
    if (fromPosition == 0) {
        return false;
    }

    int32_t position = 0;
    int32_t positionStatusIdx = 0;

    // First choice: the dictionary cache may already know the preceding boundary.
    if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) {
        addPreceding(position, positionStatusIdx, UpdateCachePosition);
        return true;
    }

    // Find a boundary somewhere preceding the first already-cached boundary,
    // backing up in steps of 30 until one is found before fromPosition.
    int32_t backupPosition = fromPosition;
    do {
        backupPosition -= 30;
        if (backupPosition > 0) {
            backupPosition = fBI->handleSafePrevious(backupPosition);
        } else {
            backupPosition = 0;
        }

        if (backupPosition == UBRK_DONE || backupPosition == 0) {
            position = 0;
            positionStatusIdx = 0;
        } else {
            // Advance to the boundary following the backup position.
            // There is a complication: the safe reverse rules identify pairs of code points
            // that are safe. If advancing from the safe point moves forwards by less than
            // two code points, we need to advance one more time to ensure that the boundary
            // is good, including a correct rules status value.
            fBI->fPosition = backupPosition;
            position = fBI->handleNext();
            if (position <= backupPosition + 4) {
                // +4 is a quick test for possibly having advanced only one codepoint.
                // Four being the length of the longest potential code point, a supplementary in UTF-8
                utext_setNativeIndex(&fBI->fText, position);
                if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) {
                    // The initial handleNext() only advanced by a single code point. Go again.
                    position = fBI->handleNext();   // Safe rules identify safe pairs.
                }
            }
            positionStatusIdx = fBI->fRuleStatusIndex;
        }
    } while (position >= fromPosition);

    // Find boundaries between the one we just located and the first already-cached boundary.
    // Put them in a side buffer, because we don't yet know where they will fall in the
    // circular cache buffer. Entries are stored as (position, status index) pairs.
    fSideBuffer.removeAllElements();
    fSideBuffer.addElement(position, status);
    fSideBuffer.addElement(positionStatusIdx, status);

    do {
        fBI->fPosition = position;
        int32_t prevPosition = position;
        const int32_t prevStatusIdx = positionStatusIdx;
        position = fBI->handleNext();
        positionStatusIdx = fBI->fRuleStatusIndex;
        if (position == UBRK_DONE) {
            break;
        }

        UBool segmentHandledByDictionary = false;
        if (fBI->fDictionaryCharCount != 0) {
            // Segment from the rules includes dictionary characters.
            // Subdivide it, with subdivided results going into the dictionary cache.
            const int32_t dictSegEndPosition = position;
            fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
            while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) {
                segmentHandledByDictionary = true;
                U_ASSERT(position > prevPosition);
                if (position >= fromPosition) {
                    break;
                }
                U_ASSERT(position <= dictSegEndPosition);
                fSideBuffer.addElement(position, status);
                fSideBuffer.addElement(positionStatusIdx, status);
                prevPosition = position;
            }
            U_ASSERT(position==dictSegEndPosition || position>=fromPosition);
        }

        if (!segmentHandledByDictionary && position < fromPosition) {
            fSideBuffer.addElement(position, status);
            fSideBuffer.addElement(positionStatusIdx, status);
        }
    } while (position < fromPosition);

    // Move boundaries from the side buffer to the main circular buffer.
    if (fSideBuffer.isEmpty()) {
        return false;
    }

    // The most recently collected pair is the boundary immediately preceding
    // fromPosition; it becomes the new cache position.
    positionStatusIdx = fSideBuffer.popi();
    position = fSideBuffer.popi();
    addPreceding(position, positionStatusIdx, UpdateCachePosition);

    while (!fSideBuffer.isEmpty()) {
        positionStatusIdx = fSideBuffer.popi();
        position = fSideBuffer.popi();
        if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
            // No space in circular buffer to hold a new preceding result while
            // also retaining the current cache (iteration) position.
            // Bailing out is safe; the cache will refill again if needed.
            break;
        }
    }

    return true;
}

void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {

642

U_ASSERT(position > fBoundaries[fEndBufIdx]);

643

U_ASSERT(ruleStatusIdx <= UINT16_MAX);

644

int32_t nextIdx = modChunkSize(fEndBufIdx + 1);

645

if (nextIdx == fStartBufIdx) {

646

fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1.

647

}

648

fBoundaries[nextIdx] = position;

Jungshik Shin

42d5027

2018-10-24 01:22:09 -0700

[diff] [blame]

649

fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

650

fEndBufIdx = nextIdx;

651

if (update == UpdateCachePosition) {

652

// Set current position to the newly added boundary.

fBufIdx = nextIdx;

fTextIdx = position;

} else {

// Retaining the original cache position.

657

// Check if the added boundary wraps around the buffer, and would over-write the original position.

658

// It's the responsibility of callers of this function to not add too many.

659

U_ASSERT(nextIdx != fBufIdx);

}

}

bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {

664

U_ASSERT(position < fBoundaries[fStartBufIdx]);

665

U_ASSERT(ruleStatusIdx <= UINT16_MAX);

666

int32_t nextIdx = modChunkSize(fStartBufIdx - 1);

667

if (nextIdx == fEndBufIdx) {

668

if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {

669

// Failure. The insertion of the new boundary would claim the buffer position that is the

670

// current iteration position. And we also want to retain the current iteration position.

671

// (The buffer is already completely full of entries that precede the iteration position.)

672

return false;

673

}

674

fEndBufIdx = modChunkSize(fEndBufIdx - 1);

675

}

676

fBoundaries[nextIdx] = position;

Jungshik Shin

42d5027

2018-10-24 01:22:09 -0700

[diff] [blame]

677

fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);

Jungshik Shin