blob: 54b289e24d1e0affac7f32ec140046c1b57b581d [file] [log] [blame]
Jungshik Shin87232d82017-05-13 21:10:13 -07001// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07002// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00003/*
4***************************************************************************
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07005* Copyright (C) 1999-2016 International Business Machines Corporation
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00006* and others. All rights reserved.
7***************************************************************************
8*/
9//
Jungshik Shinb3189662017-11-07 11:18:34 -080010// file: rbbi.cpp Contains the implementation of the rule based break iterator
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000011// runtime engine and the API implementation for
12// class RuleBasedBreakIterator
13//
14
15#include "utypeinfo.h" // for 'typeid' to work
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_BREAK_ITERATION
20
21#include "unicode/rbbi.h"
22#include "unicode/schriter.h"
23#include "unicode/uchriter.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000024#include "unicode/uclean.h"
Jungshik Shinb3189662017-11-07 11:18:34 -080025#include "unicode/udata.h"
26
27#include "brkeng.h"
28#include "ucln_cmn.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000029#include "cmemory.h"
30#include "cstring.h"
Jungshik Shinb3189662017-11-07 11:18:34 -080031#include "rbbidata.h"
32#include "rbbi_cache.h"
33#include "rbbirb.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000034#include "uassert.h"
Jungshik Shinb3189662017-11-07 11:18:34 -080035#include "umutex.h"
36#include "uvectr32.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000037
38// if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
39#if U_LOCAL_SERVICE_HOOK
40#include "localsvc.h"
41#endif
42
43#ifdef RBBI_DEBUG
Jungshik Shinb3189662017-11-07 11:18:34 -080044static UBool gTrace = FALSE;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000045#endif
46
47U_NAMESPACE_BEGIN
48
49// The state number of the starting state
Jungshik Shinb3189662017-11-07 11:18:34 -080050constexpr int32_t START_STATE = 1;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000051
52// The state-transition value indicating "stop"
Jungshik Shinb3189662017-11-07 11:18:34 -080053constexpr int32_t STOP_STATE = 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000054
55
56UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
57
58
59//=======================================================================
60// constructors
61//=======================================================================
62
63/**
64 * Constructs a RuleBasedBreakIterator that uses the already-created
65 * tables object that is passed in as a parameter.
66 */
Jungshik Shinb3189662017-11-07 11:18:34 -080067RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) {
68 init(status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000069 fData = new RBBIDataWrapper(data, status); // status checked in constructor
70 if (U_FAILURE(status)) {return;}
71 if(fData == 0) {
72 status = U_MEMORY_ALLOCATION_ERROR;
73 return;
74 }
75}
76
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000077//
78// Construct from precompiled binary rules (tables). This constructor is public API,
79// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
80//
81RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
82 uint32_t ruleLength,
83 UErrorCode &status) {
Jungshik Shinb3189662017-11-07 11:18:34 -080084 init(status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000085 if (U_FAILURE(status)) {
86 return;
87 }
88 if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
89 status = U_ILLEGAL_ARGUMENT_ERROR;
90 return;
91 }
92 const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
93 if (data->fLength > ruleLength) {
94 status = U_ILLEGAL_ARGUMENT_ERROR;
95 return;
96 }
Jungshik Shinb3189662017-11-07 11:18:34 -080097 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000098 if (U_FAILURE(status)) {return;}
99 if(fData == 0) {
100 status = U_MEMORY_ALLOCATION_ERROR;
101 return;
102 }
Jungshik Shinb3189662017-11-07 11:18:34 -0800103}
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000104
105
106//-------------------------------------------------------------------------------
107//
108// Constructor from a UDataMemory handle to precompiled break rules
109// stored in an ICU data file.
110//
111//-------------------------------------------------------------------------------
112RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
113{
Jungshik Shinb3189662017-11-07 11:18:34 -0800114 init(status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000115 fData = new RBBIDataWrapper(udm, status); // status checked in constructor
116 if (U_FAILURE(status)) {return;}
117 if(fData == 0) {
118 status = U_MEMORY_ALLOCATION_ERROR;
119 return;
120 }
121}
122
123
124
125//-------------------------------------------------------------------------------
126//
127// Constructor from a set of rules supplied as a string.
128//
129//-------------------------------------------------------------------------------
130RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
131 UParseError &parseError,
132 UErrorCode &status)
133{
Jungshik Shinb3189662017-11-07 11:18:34 -0800134 init(status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000135 if (U_FAILURE(status)) {return;}
136 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
137 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
138 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
139 // creates and returns a complete RBBI. From here, in a constructor, we
140 // can't just return the object created by the builder factory, hence
141 // the assignment of the factory created object to "this".
142 if (U_SUCCESS(status)) {
143 *this = *bi;
144 delete bi;
145 }
146}
147
148
149//-------------------------------------------------------------------------------
150//
151// Default Constructor. Create an empty shell that can be set up later.
152// Used when creating a RuleBasedBreakIterator from a set
153// of rules.
154//-------------------------------------------------------------------------------
155RuleBasedBreakIterator::RuleBasedBreakIterator() {
Jungshik Shinb3189662017-11-07 11:18:34 -0800156 UErrorCode status = U_ZERO_ERROR;
157 init(status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000158}
159
160
161//-------------------------------------------------------------------------------
162//
163// Copy constructor. Will produce a break iterator with the same behavior,
164// and which iterates over the same text, as the one passed in.
165//
166//-------------------------------------------------------------------------------
167RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
168: BreakIterator(other)
169{
Jungshik Shinb3189662017-11-07 11:18:34 -0800170 UErrorCode status = U_ZERO_ERROR;
171 this->init(status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000172 *this = other;
173}
174
175
176/**
177 * Destructor
178 */
179RuleBasedBreakIterator::~RuleBasedBreakIterator() {
180 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
181 // fCharIter was adopted from the outside.
182 delete fCharIter;
183 }
184 fCharIter = NULL;
185 delete fSCharIter;
Jungshik Shinb3189662017-11-07 11:18:34 -0800186 fSCharIter = NULL;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000187 delete fDCharIter;
188 fDCharIter = NULL;
Jungshik Shinb3189662017-11-07 11:18:34 -0800189
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000190 utext_close(fText);
191
192 if (fData != NULL) {
193 fData->removeReference();
194 fData = NULL;
195 }
Jungshik Shinb3189662017-11-07 11:18:34 -0800196 delete fBreakCache;
197 fBreakCache = NULL;
198
199 delete fDictionaryCache;
200 fDictionaryCache = NULL;
201
202 delete fLanguageBreakEngines;
203 fLanguageBreakEngines = NULL;
204
205 delete fUnhandledBreakEngine;
206 fUnhandledBreakEngine = NULL;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000207}
208
209/**
210 * Assignment operator. Sets this iterator to have the same behavior,
211 * and iterate over the same text, as the one passed in.
212 */
213RuleBasedBreakIterator&
214RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
215 if (this == &that) {
216 return *this;
217 }
Jungshik Shinb3189662017-11-07 11:18:34 -0800218 BreakIterator::operator=(that);
219
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000220 fBreakType = that.fBreakType;
221 if (fLanguageBreakEngines != NULL) {
222 delete fLanguageBreakEngines;
223 fLanguageBreakEngines = NULL; // Just rebuild for now
224 }
225 // TODO: clone fLanguageBreakEngines from "that"
226 UErrorCode status = U_ZERO_ERROR;
227 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
228
229 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
230 delete fCharIter;
231 }
232 fCharIter = NULL;
233
234 if (that.fCharIter != NULL ) {
235 // This is a little bit tricky - it will intially appear that
236 // this->fCharIter is adopted, even if that->fCharIter was
237 // not adopted. That's ok.
238 fCharIter = that.fCharIter->clone();
239 }
240
241 if (fData != NULL) {
242 fData->removeReference();
243 fData = NULL;
244 }
245 if (that.fData != NULL) {
246 fData = that.fData->addReference();
247 }
248
Jungshik Shinb3189662017-11-07 11:18:34 -0800249 fPosition = that.fPosition;
250 fRuleStatusIndex = that.fRuleStatusIndex;
251 fDone = that.fDone;
252
253 // TODO: both the dictionary and the main cache need to be copied.
254 // Current position could be within a dictionary range. Trying to continue
255 // the iteration without the caches present would go to the rules, with
256 // the assumption that the current position is on a rule boundary.
257 fBreakCache->reset(fPosition, fRuleStatusIndex);
258 fDictionaryCache->reset();
259
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000260 return *this;
261}
262
263
264
265//-----------------------------------------------------------------------------
266//
267// init() Shared initialization routine. Used by all the constructors.
268// Initializes all fields, leaving the object in a consistent state.
269//
270//-----------------------------------------------------------------------------
Jungshik Shinb3189662017-11-07 11:18:34 -0800271void RuleBasedBreakIterator::init(UErrorCode &status) {
272 fText = NULL;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000273 fCharIter = NULL;
274 fSCharIter = NULL;
275 fDCharIter = NULL;
276 fData = NULL;
Jungshik Shinb3189662017-11-07 11:18:34 -0800277 fPosition = 0;
278 fRuleStatusIndex = 0;
279 fDone = false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000280 fDictionaryCharCount = 0;
281 fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable
282 // dictionary behavior for Break Iterators that are
283 // built from rules. Even better would be the ability to
284 // declare the type in the rules.
285
Jungshik Shinb3189662017-11-07 11:18:34 -0800286 fLanguageBreakEngines = NULL;
287 fUnhandledBreakEngine = NULL;
288 fBreakCache = NULL;
289 fDictionaryCache = NULL;
290
291 if (U_FAILURE(status)) {
292 return;
293 }
294
295 fText = utext_openUChars(NULL, NULL, 0, &status);
296 fDictionaryCache = new DictionaryCache(this, status);
297 fBreakCache = new BreakCache(this, status);
298 if (U_SUCCESS(status) && (fText == NULL || fDictionaryCache == NULL || fBreakCache == NULL)) {
299 status = U_MEMORY_ALLOCATION_ERROR;
300 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000301
302#ifdef RBBI_DEBUG
303 static UBool debugInitDone = FALSE;
304 if (debugInitDone == FALSE) {
305 char *debugEnv = getenv("U_RBBIDEBUG");
306 if (debugEnv && uprv_strstr(debugEnv, "trace")) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800307 gTrace = TRUE;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000308 }
309 debugInitDone = TRUE;
310 }
311#endif
312}
313
314
315
316//-----------------------------------------------------------------------------
317//
318// clone - Returns a newly-constructed RuleBasedBreakIterator with the same
319// behavior, and iterating over the same text, as this one.
320// Virtual function: does the right thing with subclasses.
321//
322//-----------------------------------------------------------------------------
323BreakIterator*
324RuleBasedBreakIterator::clone(void) const {
325 return new RuleBasedBreakIterator(*this);
326}
327
328/**
329 * Equality operator. Returns TRUE if both BreakIterators are of the
330 * same class, have the same behavior, and iterate over the same text.
331 */
332UBool
333RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
334 if (typeid(*this) != typeid(that)) {
335 return FALSE;
336 }
Jungshik Shinb3189662017-11-07 11:18:34 -0800337 if (this == &that) {
338 return TRUE;
339 }
340
341 // The base class BreakIterator carries no state that participates in equality,
342 // and does not implement an equality function that would otherwise be
343 // checked at this point.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000344
345 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
346
347 if (!utext_equals(fText, that2.fText)) {
348 // The two break iterators are operating on different text,
Jungshik Shinb3189662017-11-07 11:18:34 -0800349 // or have a different iteration position.
350 // Note that fText's position is always the same as the break iterator's position.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000351 return FALSE;
352 };
353
Jungshik Shinb3189662017-11-07 11:18:34 -0800354 if (!(fPosition == that2.fPosition &&
355 fRuleStatusIndex == that2.fRuleStatusIndex &&
356 fDone == that2.fDone)) {
357 return FALSE;
358 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000359
360 if (that2.fData == fData ||
361 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
362 // The two break iterators are using the same rules.
363 return TRUE;
364 }
365 return FALSE;
366}
367
368/**
369 * Compute a hash code for this BreakIterator
370 * @return A hash code
371 */
372int32_t
373RuleBasedBreakIterator::hashCode(void) const {
374 int32_t hash = 0;
375 if (fData != NULL) {
376 hash = fData->hashCode();
377 }
378 return hash;
379}
380
381
382void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
383 if (U_FAILURE(status)) {
384 return;
385 }
Jungshik Shinb3189662017-11-07 11:18:34 -0800386 fBreakCache->reset();
387 fDictionaryCache->reset();
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000388 fText = utext_clone(fText, ut, FALSE, TRUE, &status);
389
390 // Set up a dummy CharacterIterator to be returned if anyone
391 // calls getText(). With input from UText, there is no reasonable
392 // way to return a characterIterator over the actual input text.
393 // Return one over an empty string instead - this is the closest
394 // we can come to signaling a failure.
395 // (GetText() is obsolete, this failure is sort of OK)
396 if (fDCharIter == NULL) {
397 static const UChar c = 0;
398 fDCharIter = new UCharCharacterIterator(&c, 0);
399 if (fDCharIter == NULL) {
400 status = U_MEMORY_ALLOCATION_ERROR;
401 return;
402 }
403 }
404
405 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
406 // existing fCharIter was adopted from the outside. Delete it now.
407 delete fCharIter;
408 }
409 fCharIter = fDCharIter;
410
411 this->first();
412}
413
414
415UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
Jungshik Shinb3189662017-11-07 11:18:34 -0800416 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000417 return result;
418}
419
420
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000421//=======================================================================
422// BreakIterator overrides
423//=======================================================================
424
425/**
Jungshik Shinb3189662017-11-07 11:18:34 -0800426 * Return a CharacterIterator over the text being analyzed.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000427 */
428CharacterIterator&
429RuleBasedBreakIterator::getText() const {
430 return *fCharIter;
431}
432
433/**
434 * Set the iterator to analyze a new piece of text. This function resets
435 * the current iteration position to the beginning of the text.
436 * @param newText An iterator over the text to analyze.
437 */
438void
439RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800440 // If we are holding a CharacterIterator adopted from a
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000441 // previous call to this function, delete it now.
442 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
443 delete fCharIter;
444 }
445
446 fCharIter = newText;
447 UErrorCode status = U_ZERO_ERROR;
Jungshik Shinb3189662017-11-07 11:18:34 -0800448 fBreakCache->reset();
449 fDictionaryCache->reset();
450 if (newText==NULL || newText->startIndex() != 0) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000451 // startIndex !=0 wants to be an error, but there's no way to report it.
452 // Make the iterator text be an empty string.
453 fText = utext_openUChars(fText, NULL, 0, &status);
454 } else {
455 fText = utext_openCharacterIterator(fText, newText, &status);
456 }
457 this->first();
458}
459
460/**
461 * Set the iterator to analyze a new piece of text. This function resets
462 * the current iteration position to the beginning of the text.
463 * @param newText An iterator over the text to analyze.
464 */
465void
466RuleBasedBreakIterator::setText(const UnicodeString& newText) {
467 UErrorCode status = U_ZERO_ERROR;
Jungshik Shinb3189662017-11-07 11:18:34 -0800468 fBreakCache->reset();
469 fDictionaryCache->reset();
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000470 fText = utext_openConstUnicodeString(fText, &newText, &status);
471
Jungshik Shinb3189662017-11-07 11:18:34 -0800472 // Set up a character iterator on the string.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000473 // Needed in case someone calls getText().
474 // Can not, unfortunately, do this lazily on the (probably never)
475 // call to getText(), because getText is const.
476 if (fSCharIter == NULL) {
477 fSCharIter = new StringCharacterIterator(newText);
478 } else {
479 fSCharIter->setText(newText);
480 }
481
482 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
483 // old fCharIter was adopted from the outside. Delete it.
484 delete fCharIter;
485 }
486 fCharIter = fSCharIter;
487
488 this->first();
489}
490
491
492/**
493 * Provide a new UText for the input text. Must reference text with contents identical
494 * to the original.
495 * Intended for use with text data originating in Java (garbage collected) environments
496 * where the data may be moved in memory at arbitrary times.
497 */
498RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
499 if (U_FAILURE(status)) {
500 return *this;
501 }
502 if (input == NULL) {
503 status = U_ILLEGAL_ARGUMENT_ERROR;
504 return *this;
505 }
506 int64_t pos = utext_getNativeIndex(fText);
507 // Shallow read-only clone of the new UText into the existing input UText
508 fText = utext_clone(fText, input, FALSE, TRUE, &status);
509 if (U_FAILURE(status)) {
510 return *this;
511 }
512 utext_setNativeIndex(fText, pos);
513 if (utext_getNativeIndex(fText) != pos) {
514 // Sanity check. The new input utext is supposed to have the exact same
515 // contents as the old. If we can't set to the same position, it doesn't.
516 // The contents underlying the old utext might be invalid at this point,
517 // so it's not safe to check directly.
518 status = U_ILLEGAL_ARGUMENT_ERROR;
519 }
520 return *this;
521}
522
523
524/**
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800525 * Sets the current iteration position to the beginning of the text, position zero.
526 * @return The new iterator position, which is zero.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000527 */
528int32_t RuleBasedBreakIterator::first(void) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800529 UErrorCode status = U_ZERO_ERROR;
530 if (!fBreakCache->seek(0)) {
531 fBreakCache->populateNear(0, status);
532 }
533 fBreakCache->current();
534 U_ASSERT(fPosition == 0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000535 return 0;
536}
537
538/**
539 * Sets the current iteration position to the end of the text.
540 * @return The text's past-the-end offset.
541 */
542int32_t RuleBasedBreakIterator::last(void) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800543 int32_t endPos = (int32_t)utext_nativeLength(fText);
544 UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position.
545 (void)endShouldBeBoundary;
546 U_ASSERT(endShouldBeBoundary);
547 U_ASSERT(fPosition == endPos);
548 return endPos;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000549}
550
551/**
552 * Advances the iterator either forward or backward the specified number of steps.
553 * Negative values move backward, and positive values move forward. This is
554 * equivalent to repeatedly calling next() or previous().
555 * @param n The number of steps to move. The sign indicates the direction
556 * (negative is backwards, and positive is forwards).
557 * @return The character offset of the boundary position n boundaries away from
558 * the current one.
559 */
560int32_t RuleBasedBreakIterator::next(int32_t n) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800561 int32_t result = 0;
562 if (n > 0) {
563 for (; n > 0 && result != UBRK_DONE; --n) {
564 result = next();
565 }
566 } else if (n < 0) {
567 for (; n < 0 && result != UBRK_DONE; ++n) {
568 result = previous();
569 }
570 } else {
571 result = current();
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000572 }
573 return result;
574}
575
576/**
577 * Advances the iterator to the next boundary position.
578 * @return The position of the first boundary after this one.
579 */
580int32_t RuleBasedBreakIterator::next(void) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800581 fBreakCache->next();
582 return fDone ? UBRK_DONE : fPosition;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000583}
584
585/**
Jungshik Shinb3189662017-11-07 11:18:34 -0800586 * Move the iterator backwards, to the boundary preceding the current one.
587 *
588 * Starts from the current position within fText.
589 * Starting position need not be on a boundary.
590 *
591 * @return The position of the boundary position immediately preceding the starting position.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000592 */
593int32_t RuleBasedBreakIterator::previous(void) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800594 UErrorCode status = U_ZERO_ERROR;
595 fBreakCache->previous(status);
596 return fDone ? UBRK_DONE : fPosition;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000597}
598
599/**
600 * Sets the iterator to refer to the first boundary position following
601 * the specified position.
Jungshik Shinb3189662017-11-07 11:18:34 -0800602 * @param startPos The position from which to begin searching for a break position.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000603 * @return The position of the first break after the current position.
604 */
Jungshik Shinb3189662017-11-07 11:18:34 -0800605int32_t RuleBasedBreakIterator::following(int32_t startPos) {
606 // if the supplied position is before the beginning, return the
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800607 // text's starting offset
Jungshik Shinb3189662017-11-07 11:18:34 -0800608 if (startPos < 0) {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800609 return first();
610 }
611
612 // Move requested offset to a code point start. It might be on a trail surrogate,
Jungshik Shinb3189662017-11-07 11:18:34 -0800613 // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text.
614 utext_setNativeIndex(fText, startPos);
615 startPos = (int32_t)utext_getNativeIndex(fText);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800616
Jungshik Shinb3189662017-11-07 11:18:34 -0800617 UErrorCode status = U_ZERO_ERROR;
618 fBreakCache->following(startPos, status);
619 return fDone ? UBRK_DONE : fPosition;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000620}
621
622/**
623 * Sets the iterator to refer to the last boundary position before the
624 * specified position.
Jungshik Shinb3189662017-11-07 11:18:34 -0800625 * @param offset The position to begin searching for a break from.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000626 * @return The position of the last boundary before the starting position.
627 */
628int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800629 if (fText == NULL || offset > utext_nativeLength(fText)) {
630 return last();
631 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800632
633 // Move requested offset to a code point start. It might be on a trail surrogate,
634 // or on a trail byte if the input is UTF-8.
Jungshik Shinb3189662017-11-07 11:18:34 -0800635
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800636 utext_setNativeIndex(fText, offset);
Jungshik Shinb3189662017-11-07 11:18:34 -0800637 int32_t adjustedOffset = utext_getNativeIndex(fText);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800638
Jungshik Shinb3189662017-11-07 11:18:34 -0800639 UErrorCode status = U_ZERO_ERROR;
640 fBreakCache->preceding(adjustedOffset, status);
641 return fDone ? UBRK_DONE : fPosition;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000642}
643
644/**
645 * Returns true if the specfied position is a boundary position. As a side
646 * effect, leaves the iterator pointing to the first boundary position at
647 * or after "offset".
Jungshik Shinb3189662017-11-07 11:18:34 -0800648 *
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000649 * @param offset the offset to check.
650 * @return True if "offset" is a boundary position.
651 */
652UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000653 // out-of-range indexes are never boundary positions
654 if (offset < 0) {
655 first(); // For side effects on current position, tag values.
656 return FALSE;
657 }
658
Jungshik Shinb3189662017-11-07 11:18:34 -0800659 // Adjust offset to be on a code point boundary and not beyond the end of the text.
660 // Note that isBoundary() is always be false for offsets that are not on code point boundaries.
661 // But we still need the side effect of leaving iteration at the following boundary.
662
663 utext_setNativeIndex(fText, offset);
664 int32_t adjustedOffset = utext_getNativeIndex(fText);
665
666 bool result = false;
667 UErrorCode status = U_ZERO_ERROR;
668 if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) {
669 result = (fBreakCache->current() == offset);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000670 }
671
Jungshik Shinb3189662017-11-07 11:18:34 -0800672 if (result && adjustedOffset < offset && utext_char32At(fText, offset) == U_SENTINEL) {
673 // Original offset is beyond the end of the text. Return FALSE, it's not a boundary,
674 // but the iteration position remains set to the end of the text, which is a boundary.
675 return FALSE;
676 }
677 if (!result) {
678 // Not on a boundary. isBoundary() must leave iterator on the following boundary.
679 // Cache->seek(), above, left us on the preceding boundary, so advance one.
680 next();
681 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000682 return result;
683}
684
Jungshik Shinb3189662017-11-07 11:18:34 -0800685
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000686/**
687 * Returns the current iteration position.
688 * @return The current iteration position.
689 */
690int32_t RuleBasedBreakIterator::current(void) const {
Jungshik Shinb3189662017-11-07 11:18:34 -0800691 return fPosition;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000692}
Jungshik Shinb3189662017-11-07 11:18:34 -0800693
694
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000695//=======================================================================
696// implementation
697//=======================================================================
698
699//
700// RBBIRunMode - the state machine runs an extra iteration at the beginning and end
701// of user text. A variable with this enum type keeps track of where we
702// are. The state machine only fetches user input while in the RUN mode.
703//
704enum RBBIRunMode {
705 RBBI_START, // state machine processing is before first char of input
706 RBBI_RUN, // state machine processing is in the user text
707 RBBI_END // state machine processing is after end of user text.
708};
709
710
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700711// Map from look-ahead break states (corresponds to rules) to boundary positions.
712// Allows multiple lookahead break rules to be in flight at the same time.
713//
714// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
715// in the state table be sequential, then we can just index an array. And the
716// table could also tell us in advance how big that array needs to be.
717//
718// Before ICU 57 there was just a single simple variable for a look-ahead match that
719// was in progress. Two rules at once did not work.
720
721static const int32_t kMaxLookaheads = 8;
722struct LookAheadResults {
723 int32_t fUsedSlotLimit;
724 int32_t fPositions[8];
725 int16_t fKeys[8];
726
727 LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {};
728
729 int32_t getPosition(int16_t key) {
730 for (int32_t i=0; i<fUsedSlotLimit; ++i) {
731 if (fKeys[i] == key) {
732 return fPositions[i];
733 }
734 }
735 U_ASSERT(FALSE);
736 return -1;
737 }
738
739 void setPosition(int16_t key, int32_t position) {
740 int32_t i;
741 for (i=0; i<fUsedSlotLimit; ++i) {
742 if (fKeys[i] == key) {
743 fPositions[i] = position;
744 return;
745 }
746 }
747 if (i >= kMaxLookaheads) {
748 U_ASSERT(FALSE);
749 i = kMaxLookaheads - 1;
750 }
751 fKeys[i] = key;
752 fPositions[i] = position;
753 U_ASSERT(fUsedSlotLimit == i);
754 fUsedSlotLimit = i + 1;
755 }
756};
757
758
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000759//-----------------------------------------------------------------------------------
760//
Jungshik Shinb3189662017-11-07 11:18:34 -0800761// handleNext()
762// Run the state machine to find a boundary
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000763//
764//-----------------------------------------------------------------------------------
Jungshik Shinb3189662017-11-07 11:18:34 -0800765int32_t RuleBasedBreakIterator::handleNext() {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000766 int32_t state;
767 uint16_t category = 0;
768 RBBIRunMode mode;
Jungshik Shinb3189662017-11-07 11:18:34 -0800769
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000770 RBBIStateTableRow *row;
771 UChar32 c;
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700772 LookAheadResults lookAheadMatches;
773 int32_t result = 0;
774 int32_t initialPosition = 0;
Jungshik Shinb3189662017-11-07 11:18:34 -0800775 const RBBIStateTable *statetable = fData->fForwardTable;
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700776 const char *tableData = statetable->fTableData;
777 uint32_t tableRowLen = statetable->fRowLen;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000778 #ifdef RBBI_DEBUG
Jungshik Shinb3189662017-11-07 11:18:34 -0800779 if (gTrace) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000780 RBBIDebugPuts("Handle Next pos char state category");
781 }
782 #endif
783
Jungshik Shinb3189662017-11-07 11:18:34 -0800784 // handleNext alway sets the break tag value.
785 // Set the default for it.
786 fRuleStatusIndex = 0;
787
788 fDictionaryCharCount = 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000789
790 // if we're already at the end of the text, return DONE.
Jungshik Shinb3189662017-11-07 11:18:34 -0800791 initialPosition = fPosition;
792 UTEXT_SETNATIVEINDEX(fText, initialPosition);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000793 result = initialPosition;
794 c = UTEXT_NEXT32(fText);
Jungshik Shinb3189662017-11-07 11:18:34 -0800795 if (c==U_SENTINEL) {
796 fDone = TRUE;
797 return UBRK_DONE;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000798 }
799
800 // Set the initial state for the state machine
801 state = START_STATE;
802 row = (RBBIStateTableRow *)
803 //(statetable->fTableData + (statetable->fRowLen * state));
804 (tableData + tableRowLen * state);
Jungshik Shinb3189662017-11-07 11:18:34 -0800805
806
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000807 mode = RBBI_RUN;
808 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
809 category = 2;
810 mode = RBBI_START;
811 }
812
813
814 // loop until we reach the end of the text or transition to state 0
815 //
816 for (;;) {
817 if (c == U_SENTINEL) {
818 // Reached end of input string.
819 if (mode == RBBI_END) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800820 // We have already run the loop one last time with the
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000821 // character set to the psueudo {eof} value. Now it is time
822 // to unconditionally bail out.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000823 break;
824 }
825 // Run the loop one last time with the fake end-of-input character category.
826 mode = RBBI_END;
827 category = 1;
828 }
829
830 //
831 // Get the char category. An incoming category of 1 or 2 means that
832 // we are preset for doing the beginning or end of input, and
833 // that we shouldn't get a category from an actual text input character.
834 //
835 if (mode == RBBI_RUN) {
836 // look up the current character's character category, which tells us
837 // which column in the state table to look at.
838 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
839 // not the size of the character going in, which is a UChar32.
840 //
Jungshik Shinb3189662017-11-07 11:18:34 -0800841 category = UTRIE2_GET16(fData->fTrie, c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000842
843 // Check the dictionary bit in the character's category.
Jungshik Shinb3189662017-11-07 11:18:34 -0800844 // Counter is only used by dictionary based iteration.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000845 // Chars that need to be handled by a dictionary have a flag bit set
846 // in their category values.
847 //
848 if ((category & 0x4000) != 0) {
849 fDictionaryCharCount++;
850 // And off the dictionary flag bit.
851 category &= ~0x4000;
852 }
853 }
854
855 #ifdef RBBI_DEBUG
Jungshik Shinb3189662017-11-07 11:18:34 -0800856 if (gTrace) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000857 RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText));
858 if (0x20<=c && c<0x7f) {
859 RBBIDebugPrintf("\"%c\" ", c);
860 } else {
861 RBBIDebugPrintf("%5x ", c);
862 }
863 RBBIDebugPrintf("%3d %3d\n", state, category);
864 }
865 #endif
866
867 // State Transition - move machine to its next state
868 //
869
870 // Note: fNextState is defined as uint16_t[2], but we are casting
871 // a generated RBBI table to RBBIStateTableRow and some tables
872 // actually have more than 2 categories.
873 U_ASSERT(category<fData->fHeader->fCatCount);
874 state = row->fNextState[category]; /*Not accessing beyond memory*/
875 row = (RBBIStateTableRow *)
876 // (statetable->fTableData + (statetable->fRowLen * state));
877 (tableData + tableRowLen * state);
878
879
880 if (row->fAccepting == -1) {
881 // Match found, common case.
882 if (mode != RBBI_START) {
883 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
884 }
Jungshik Shinb3189662017-11-07 11:18:34 -0800885 fRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000886 }
887
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700888 int16_t completedRule = row->fAccepting;
889 if (completedRule > 0) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800890 // Lookahead match is completed.
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700891 int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
892 if (lookaheadResult >= 0) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800893 fRuleStatusIndex = row->fTagIdx;
894 fPosition = lookaheadResult;
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700895 return lookaheadResult;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000896 }
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700897 }
898 int16_t rule = row->fLookAhead;
899 if (rule != 0) {
900 // At the position of a '/' in a look-ahead match. Record it.
901 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
902 lookAheadMatches.setPosition(rule, pos);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000903 }
904
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000905 if (state == STOP_STATE) {
906 // This is the normal exit from the lookup state machine.
907 // We have advanced through the string until it is certain that no
908 // longer match is possible, no matter what characters follow.
909 break;
910 }
Jungshik Shinb3189662017-11-07 11:18:34 -0800911
912 // Advance to the next character.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000913 // If this is a beginning-of-input loop iteration, don't advance
914 // the input position. The next iteration will be processing the
915 // first real input character.
916 if (mode == RBBI_RUN) {
917 c = UTEXT_NEXT32(fText);
918 } else {
919 if (mode == RBBI_START) {
920 mode = RBBI_RUN;
921 }
922 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000923 }
924
925 // The state machine is done. Check whether it found a match...
926
927 // If the iterator failed to advance in the match engine, force it ahead by one.
928 // (This really indicates a defect in the break rules. They should always match
929 // at least one character.)
930 if (result == initialPosition) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800931 utext_setNativeIndex(fText, initialPosition);
932 utext_next32(fText);
933 result = (int32_t)utext_getNativeIndex(fText);
934 fRuleStatusIndex = 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000935 }
936
937 // Leave the iterator at our result position.
Jungshik Shinb3189662017-11-07 11:18:34 -0800938 fPosition = result;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000939 #ifdef RBBI_DEBUG
Jungshik Shinb3189662017-11-07 11:18:34 -0800940 if (gTrace) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000941 RBBIDebugPrintf("result = %d\n\n", result);
942 }
943 #endif
944 return result;
945}
946
947
948
949//-----------------------------------------------------------------------------------
950//
951// handlePrevious()
952//
Jungshik Shinb3189662017-11-07 11:18:34 -0800953// Iterate backwards using the safe reverse rules.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000954// The logic of this function is very similar to handleNext(), above.
955//
956//-----------------------------------------------------------------------------------
Jungshik Shinb3189662017-11-07 11:18:34 -0800957int32_t RuleBasedBreakIterator::handlePrevious(int32_t fromPosition) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000958 int32_t state;
959 uint16_t category = 0;
960 RBBIRunMode mode;
961 RBBIStateTableRow *row;
962 UChar32 c;
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700963 LookAheadResults lookAheadMatches;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000964 int32_t result = 0;
965 int32_t initialPosition = 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000966
Jungshik Shinb3189662017-11-07 11:18:34 -0800967 const RBBIStateTable *stateTable = fData->fSafeRevTable;
968 UTEXT_SETNATIVEINDEX(fText, fromPosition);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000969 #ifdef RBBI_DEBUG
Jungshik Shinb3189662017-11-07 11:18:34 -0800970 if (gTrace) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000971 RBBIDebugPuts("Handle Previous pos char state category");
972 }
973 #endif
974
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000975 // if we're already at the start of the text, return DONE.
976 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
977 return BreakIterator::DONE;
978 }
979
980 // Set up the starting char.
981 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
982 result = initialPosition;
983 c = UTEXT_PREVIOUS32(fText);
984
985 // Set the initial state for the state machine
986 state = START_STATE;
987 row = (RBBIStateTableRow *)
Jungshik Shinb3189662017-11-07 11:18:34 -0800988 (stateTable->fTableData + (stateTable->fRowLen * state));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000989 category = 3;
990 mode = RBBI_RUN;
Jungshik Shinb3189662017-11-07 11:18:34 -0800991 if (stateTable->fFlags & RBBI_BOF_REQUIRED) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000992 category = 2;
993 mode = RBBI_START;
994 }
995
996
997 // loop until we reach the start of the text or transition to state 0
998 //
999 for (;;) {
1000 if (c == U_SENTINEL) {
1001 // Reached end of input string.
1002 if (mode == RBBI_END) {
Jungshik Shinb3189662017-11-07 11:18:34 -08001003 // We have already run the loop one last time with the
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001004 // character set to the psueudo {eof} value. Now it is time
1005 // to unconditionally bail out.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001006 break;
1007 }
1008 // Run the loop one last time with the fake end-of-input character category.
1009 mode = RBBI_END;
1010 category = 1;
1011 }
1012
1013 //
1014 // Get the char category. An incoming category of 1 or 2 means that
1015 // we are preset for doing the beginning or end of input, and
1016 // that we shouldn't get a category from an actual text input character.
1017 //
1018 if (mode == RBBI_RUN) {
1019 // look up the current character's character category, which tells us
1020 // which column in the state table to look at.
1021 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
1022 // not the size of the character going in, which is a UChar32.
1023 //
Jungshik Shinb3189662017-11-07 11:18:34 -08001024 // And off the dictionary flag bit. For reverse iteration it is not used.
1025 category = UTRIE2_GET16(fData->fTrie, c);
1026 category &= ~0x4000;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001027 }
1028
1029 #ifdef RBBI_DEBUG
Jungshik Shinb3189662017-11-07 11:18:34 -08001030 if (gTrace) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001031 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText));
1032 if (0x20<=c && c<0x7f) {
1033 RBBIDebugPrintf("\"%c\" ", c);
1034 } else {
1035 RBBIDebugPrintf("%5x ", c);
1036 }
1037 RBBIDebugPrintf("%3d %3d\n", state, category);
1038 }
1039 #endif
1040
1041 // State Transition - move machine to its next state
1042 //
1043
1044 // Note: fNextState is defined as uint16_t[2], but we are casting
1045 // a generated RBBI table to RBBIStateTableRow and some tables
1046 // actually have more than 2 categories.
1047 U_ASSERT(category<fData->fHeader->fCatCount);
1048 state = row->fNextState[category]; /*Not accessing beyond memory*/
1049 row = (RBBIStateTableRow *)
Jungshik Shinb3189662017-11-07 11:18:34 -08001050 (stateTable->fTableData + (stateTable->fRowLen * state));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001051
1052 if (row->fAccepting == -1) {
1053 // Match found, common case.
1054 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1055 }
1056
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001057 int16_t completedRule = row->fAccepting;
1058 if (completedRule > 0) {
Jungshik Shinb3189662017-11-07 11:18:34 -08001059 // Lookahead match is completed.
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001060 int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
1061 if (lookaheadResult >= 0) {
1062 UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
1063 return lookaheadResult;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001064 }
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001065 }
1066 int16_t rule = row->fLookAhead;
1067 if (rule != 0) {
1068 // At the position of a '/' in a look-ahead match. Record it.
1069 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1070 lookAheadMatches.setPosition(rule, pos);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001071 }
1072
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001073 if (state == STOP_STATE) {
1074 // This is the normal exit from the lookup state machine.
1075 // We have advanced through the string until it is certain that no
1076 // longer match is possible, no matter what characters follow.
1077 break;
1078 }
1079
Jungshik Shinb3189662017-11-07 11:18:34 -08001080 // Move (backwards) to the next character to process.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001081 // If this is a beginning-of-input loop iteration, don't advance
1082 // the input position. The next iteration will be processing the
1083 // first real input character.
1084 if (mode == RBBI_RUN) {
1085 c = UTEXT_PREVIOUS32(fText);
Jungshik Shinb3189662017-11-07 11:18:34 -08001086 } else {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001087 if (mode == RBBI_START) {
1088 mode = RBBI_RUN;
1089 }
1090 }
1091 }
1092
1093 // The state machine is done. Check whether it found a match...
1094
1095 // If the iterator failed to advance in the match engine, force it ahead by one.
1096 // (This really indicates a defect in the break rules. They should always match
1097 // at least one character.)
1098 if (result == initialPosition) {
1099 UTEXT_SETNATIVEINDEX(fText, initialPosition);
1100 UTEXT_PREVIOUS32(fText);
1101 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1102 }
1103
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001104 #ifdef RBBI_DEBUG
Jungshik Shinb3189662017-11-07 11:18:34 -08001105 if (gTrace) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001106 RBBIDebugPrintf("result = %d\n\n", result);
1107 }
1108 #endif
1109 return result;
1110}
1111
1112
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001113//-------------------------------------------------------------------------------
1114//
1115// getRuleStatus() Return the break rule tag associated with the current
1116// iterator position. If the iterator arrived at its current
1117// position by iterating forwards, the value will have been
1118// cached by the handleNext() function.
1119//
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001120//-------------------------------------------------------------------------------
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001121
1122int32_t RuleBasedBreakIterator::getRuleStatus() const {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001123
1124 // fLastRuleStatusIndex indexes to the start of the appropriate status record
1125 // (the number of status values.)
1126 // This function returns the last (largest) of the array of status values.
Jungshik Shinb3189662017-11-07 11:18:34 -08001127 int32_t idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex];
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001128 int32_t tagVal = fData->fRuleStatusTable[idx];
1129
1130 return tagVal;
1131}
1132
1133
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001134int32_t RuleBasedBreakIterator::getRuleStatusVec(
Jungshik Shinb3189662017-11-07 11:18:34 -08001135 int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001136 if (U_FAILURE(status)) {
1137 return 0;
1138 }
1139
Jungshik Shinb3189662017-11-07 11:18:34 -08001140 int32_t numVals = fData->fRuleStatusTable[fRuleStatusIndex];
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001141 int32_t numValsToCopy = numVals;
1142 if (numVals > capacity) {
1143 status = U_BUFFER_OVERFLOW_ERROR;
1144 numValsToCopy = capacity;
1145 }
1146 int i;
1147 for (i=0; i<numValsToCopy; i++) {
Jungshik Shinb3189662017-11-07 11:18:34 -08001148 fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1];
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001149 }
1150 return numVals;
1151}
1152
1153
1154
1155//-------------------------------------------------------------------------------
1156//
1157// getBinaryRules Access to the compiled form of the rules,
1158// for use by build system tools that save the data
1159// for standard iterator types.
1160//
1161//-------------------------------------------------------------------------------
1162const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1163 const uint8_t *retPtr = NULL;
1164 length = 0;
1165
1166 if (fData != NULL) {
1167 retPtr = (const uint8_t *)fData->fHeader;
1168 length = fData->fHeader->fLength;
1169 }
1170 return retPtr;
1171}
1172
1173
1174BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,
1175 int32_t &bufferSize,
1176 UErrorCode &status)
1177{
1178 if (U_FAILURE(status)){
1179 return NULL;
1180 }
1181
1182 if (bufferSize == 0) {
1183 bufferSize = 1; // preflighting for deprecated functionality
1184 return NULL;
1185 }
1186
1187 BreakIterator *clonedBI = clone();
1188 if (clonedBI == NULL) {
1189 status = U_MEMORY_ALLOCATION_ERROR;
1190 } else {
1191 status = U_SAFECLONE_ALLOCATED_WARNING;
1192 }
1193 return (RuleBasedBreakIterator *)clonedBI;
1194}
1195
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001196U_NAMESPACE_END
1197
1198
Jungshik Shinb3189662017-11-07 11:18:34 -08001199static icu::UStack *gLanguageBreakFactories = nullptr;
1200static const icu::UnicodeString *gEmptyString = nullptr;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001201static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
Jungshik Shinb3189662017-11-07 11:18:34 -08001202static icu::UInitOnce gRBBIInitOnce = U_INITONCE_INITIALIZER;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001203
1204/**
Jungshik Shinb3189662017-11-07 11:18:34 -08001205 * Release all static memory held by breakiterator.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001206 */
1207U_CDECL_BEGIN
Jungshik Shinb3189662017-11-07 11:18:34 -08001208static UBool U_CALLCONV rbbi_cleanup(void) {
1209 delete gLanguageBreakFactories;
1210 gLanguageBreakFactories = nullptr;
1211 delete gEmptyString;
1212 gEmptyString = nullptr;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001213 gLanguageBreakFactoriesInitOnce.reset();
Jungshik Shinb3189662017-11-07 11:18:34 -08001214 gRBBIInitOnce.reset();
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001215 return TRUE;
1216}
1217U_CDECL_END
1218
1219U_CDECL_BEGIN
1220static void U_CALLCONV _deleteFactory(void *obj) {
1221 delete (icu::LanguageBreakFactory *) obj;
1222}
1223U_CDECL_END
1224U_NAMESPACE_BEGIN
1225
Jungshik Shinb3189662017-11-07 11:18:34 -08001226static void U_CALLCONV rbbiInit() {
1227 gEmptyString = new UnicodeString();
1228 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
1229}
1230
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001231static void U_CALLCONV initLanguageFactories() {
1232 UErrorCode status = U_ZERO_ERROR;
1233 U_ASSERT(gLanguageBreakFactories == NULL);
1234 gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
1235 if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
1236 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1237 gLanguageBreakFactories->push(builtIn, status);
1238#ifdef U_LOCAL_SERVICE_HOOK
1239 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1240 if (extra != NULL) {
1241 gLanguageBreakFactories->push(extra, status);
1242 }
1243#endif
1244 }
Jungshik Shinb3189662017-11-07 11:18:34 -08001245 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001246}
1247
1248
1249static const LanguageBreakEngine*
1250getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
1251{
1252 umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
1253 if (gLanguageBreakFactories == NULL) {
1254 return NULL;
1255 }
Jungshik Shinb3189662017-11-07 11:18:34 -08001256
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001257 int32_t i = gLanguageBreakFactories->size();
1258 const LanguageBreakEngine *lbe = NULL;
1259 while (--i >= 0) {
1260 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1261 lbe = factory->getEngineFor(c, breakType);
1262 if (lbe != NULL) {
1263 break;
1264 }
1265 }
1266 return lbe;
1267}
1268
1269
1270//-------------------------------------------------------------------------------
1271//
1272// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
1273// the character c.
1274//
1275//-------------------------------------------------------------------------------
1276const LanguageBreakEngine *
1277RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1278 const LanguageBreakEngine *lbe = NULL;
1279 UErrorCode status = U_ZERO_ERROR;
Jungshik Shinb3189662017-11-07 11:18:34 -08001280
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001281 if (fLanguageBreakEngines == NULL) {
1282 fLanguageBreakEngines = new UStack(status);
1283 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
1284 delete fLanguageBreakEngines;
1285 fLanguageBreakEngines = 0;
1286 return NULL;
1287 }
1288 }
Jungshik Shinb3189662017-11-07 11:18:34 -08001289
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001290 int32_t i = fLanguageBreakEngines->size();
1291 while (--i >= 0) {
1292 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1293 if (lbe->handles(c, fBreakType)) {
1294 return lbe;
1295 }
1296 }
Jungshik Shinb3189662017-11-07 11:18:34 -08001297
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001298 // No existing dictionary took the character. See if a factory wants to
1299 // give us a new LanguageBreakEngine for this character.
1300 lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
Jungshik Shinb3189662017-11-07 11:18:34 -08001301
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001302 // If we got one, use it and push it on our stack.
1303 if (lbe != NULL) {
1304 fLanguageBreakEngines->push((void *)lbe, status);
1305 // Even if we can't remember it, we can keep looking it up, so
1306 // return it even if the push fails.
1307 return lbe;
1308 }
Jungshik Shinb3189662017-11-07 11:18:34 -08001309
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001310 // No engine is forthcoming for this character. Add it to the
1311 // reject set. Create the reject break engine if needed.
1312 if (fUnhandledBreakEngine == NULL) {
1313 fUnhandledBreakEngine = new UnhandledEngine(status);
1314 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1315 status = U_MEMORY_ALLOCATION_ERROR;
1316 }
1317 // Put it last so that scripts for which we have an engine get tried
1318 // first.
1319 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1320 // If we can't insert it, or creation failed, get rid of it
1321 if (U_FAILURE(status)) {
1322 delete fUnhandledBreakEngine;
1323 fUnhandledBreakEngine = 0;
1324 return NULL;
1325 }
1326 }
Jungshik Shinb3189662017-11-07 11:18:34 -08001327
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001328 // Tell the reject engine about the character; at its discretion, it may
1329 // add more than just the one character.
1330 fUnhandledBreakEngine->handleCharacter(c, fBreakType);
Jungshik Shinb3189662017-11-07 11:18:34 -08001331
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001332 return fUnhandledBreakEngine;
1333}
1334
1335
1336
1337/*int32_t RuleBasedBreakIterator::getBreakType() const {
1338 return fBreakType;
1339}*/
1340
1341void RuleBasedBreakIterator::setBreakType(int32_t type) {
1342 fBreakType = type;
Jungshik Shinb3189662017-11-07 11:18:34 -08001343}
1344
1345void RuleBasedBreakIterator::dumpCache() {
1346 fBreakCache->dumpCache();
1347}
1348
1349/**
1350 * Returns the description used to create this iterator
1351 */
1352
1353const UnicodeString&
1354RuleBasedBreakIterator::getRules() const {
1355 if (fData != NULL) {
1356 return fData->getRuleSourceString();
1357 } else {
1358 umtx_initOnce(gRBBIInitOnce, &rbbiInit);
1359 return *gEmptyString;
1360 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001361}
1362
1363U_NAMESPACE_END
1364
1365#endif /* #if !UCONFIG_NO_BREAK_ITERATION */