blob: e97eba8d14de3e29a6f498b774481d91694cb97a [file] [log] [blame]
Jungshik Shin87232d82017-05-13 21:10:13 -07001// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07002// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00003//
4// rbbisetb.cpp
5//
6/*
7***************************************************************************
8* Copyright (C) 2002-2008 International Business Machines Corporation *
9* and others. All rights reserved. *
10***************************************************************************
11*/
12//
13// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
14// (part of the rule building process.)
15//
16// Starting with the rules parse tree from the scanner,
17//
18// - Enumerate the set of UnicodeSets that are referenced
19// by the RBBI rules.
20// - compute a set of non-overlapping character ranges
21// with all characters within a range belonging to the same
22// set of input uniocde sets.
23// - Derive a set of non-overlapping UnicodeSet (like things)
24// that will correspond to columns in the state table for
25// the RBBI execution engine. All characters within one
26// of these sets belong to the same set of the original
27// UnicodeSets from the user's rules.
28// - construct the trie table that maps input characters
29// to the index of the matching non-overlapping set of set from
30// the previous step.
31//
32
33#include "unicode/utypes.h"
34
35#if !UCONFIG_NO_BREAK_ITERATION
36
37#include "unicode/uniset.h"
Jungshik Shinb3189662017-11-07 11:18:34 -080038#include "utrie2.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000039#include "uvector.h"
40#include "uassert.h"
41#include "cmemory.h"
42#include "cstring.h"
43
44#include "rbbisetb.h"
45#include "rbbinode.h"
46
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000047U_NAMESPACE_BEGIN
48
49//------------------------------------------------------------------------
50//
51// Constructor
52//
53//------------------------------------------------------------------------
54RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
55{
56 fRB = rb;
57 fStatus = rb->fStatus;
58 fRangeList = 0;
59 fTrie = 0;
60 fTrieSize = 0;
61 fGroupCount = 0;
62 fSawBOF = FALSE;
63}
64
65
66//------------------------------------------------------------------------
67//
68// Destructor
69//
70//------------------------------------------------------------------------
71RBBISetBuilder::~RBBISetBuilder()
72{
73 RangeDescriptor *nextRangeDesc;
74
75 // Walk through & delete the linked list of RangeDescriptors
76 for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) {
77 RangeDescriptor *r = nextRangeDesc;
78 nextRangeDesc = r->fNext;
79 delete r;
80 }
81
Jungshik Shinb3189662017-11-07 11:18:34 -080082 utrie2_close(fTrie);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000083}
84
85
86
87
88//------------------------------------------------------------------------
89//
90// build Build the list of non-overlapping character ranges
91// from the Unicode Sets.
92//
93//------------------------------------------------------------------------
94void RBBISetBuilder::build() {
95 RBBINode *usetNode;
96 RangeDescriptor *rlRange;
97
98 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}
99
100 //
101 // Initialize the process by creating a single range encompassing all characters
102 // that is in no sets.
103 //
104 fRangeList = new RangeDescriptor(*fStatus); // will check for status here
105 if (fRangeList == NULL) {
106 *fStatus = U_MEMORY_ALLOCATION_ERROR;
107 return;
108 }
109 fRangeList->fStartChar = 0;
110 fRangeList->fEndChar = 0x10ffff;
111
112 if (U_FAILURE(*fStatus)) {
113 return;
114 }
115
116 //
117 // Find the set of non-overlapping ranges of characters
118 //
119 int ni;
120 for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
121 usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
122 if (usetNode==NULL) {
123 break;
124 }
125
126 UnicodeSet *inputSet = usetNode->fInputSet;
127 int32_t inputSetRangeCount = inputSet->getRangeCount();
128 int inputSetRangeIndex = 0;
129 rlRange = fRangeList;
130
131 for (;;) {
132 if (inputSetRangeIndex >= inputSetRangeCount) {
133 break;
134 }
135 UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex);
136 UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex);
137
138 // skip over ranges from the range list that are completely
139 // below the current range from the input unicode set.
140 while (rlRange->fEndChar < inputSetRangeBegin) {
141 rlRange = rlRange->fNext;
142 }
143
144 // If the start of the range from the range list is before with
145 // the start of the range from the unicode set, split the range list range
146 // in two, with one part being before (wholly outside of) the unicode set
147 // and the other containing the rest.
148 // Then continue the loop; the post-split current range will then be skipped
149 // over
150 if (rlRange->fStartChar < inputSetRangeBegin) {
151 rlRange->split(inputSetRangeBegin, *fStatus);
152 if (U_FAILURE(*fStatus)) {
153 return;
154 }
155 continue;
156 }
157
158 // Same thing at the end of the ranges...
159 // If the end of the range from the range list doesn't coincide with
160 // the end of the range from the unicode set, split the range list
161 // range in two. The first part of the split range will be
162 // wholly inside the Unicode set.
163 if (rlRange->fEndChar > inputSetRangeEnd) {
164 rlRange->split(inputSetRangeEnd+1, *fStatus);
165 if (U_FAILURE(*fStatus)) {
166 return;
167 }
168 }
169
170 // The current rlRange is now entirely within the UnicodeSet range.
171 // Add this unicode set to the list of sets for this rlRange
172 if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
173 rlRange->fIncludesSets->addElement(usetNode, *fStatus);
174 if (U_FAILURE(*fStatus)) {
175 return;
176 }
177 }
178
179 // Advance over ranges that we are finished with.
180 if (inputSetRangeEnd == rlRange->fEndChar) {
181 inputSetRangeIndex++;
182 }
183 rlRange = rlRange->fNext;
184 }
185 }
186
187 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}
188
189 //
190 // Group the above ranges, with each group consisting of one or more
191 // ranges that are in exactly the same set of original UnicodeSets.
192 // The groups are numbered, and these group numbers are the set of
193 // input symbols recognized by the run-time state machine.
194 //
195 // Numbering: # 0 (state table column 0) is unused.
196 // # 1 is reserved - table column 1 is for end-of-input
197 // # 2 is reserved - table column 2 is for beginning-in-input
198 // # 3 is the first range list.
199 //
200 RangeDescriptor *rlSearchRange;
201 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
202 for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
203 if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
204 rlRange->fNum = rlSearchRange->fNum;
205 break;
206 }
207 }
208 if (rlRange->fNum == 0) {
209 fGroupCount ++;
210 rlRange->fNum = fGroupCount+2;
211 rlRange->setDictionaryFlag();
212 addValToSets(rlRange->fIncludesSets, fGroupCount+2);
213 }
214 }
215
216 // Handle input sets that contain the special string {eof}.
217 // Column 1 of the state table is reserved for EOF on input.
218 // Column 2 is reserved for before-the-start-input.
219 // (This column can be optimized away later if there are no rule
220 // references to {bof}.)
221 // Add this column value (1 or 2) to the equivalent expression
222 // subtree for each UnicodeSet that contains the string {eof}
223 // Because {bof} and {eof} are not a characters in the normal sense,
224 // they doesn't affect the computation of ranges or TRIE.
225 static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
226 static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};
227
228 UnicodeString eofString(eofUString);
229 UnicodeString bofString(bofUString);
230 for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
231 usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
232 if (usetNode==NULL) {
233 break;
234 }
235 UnicodeSet *inputSet = usetNode->fInputSet;
236 if (inputSet->contains(eofString)) {
237 addValToSet(usetNode, 1);
238 }
239 if (inputSet->contains(bofString)) {
240 addValToSet(usetNode, 2);
241 fSawBOF = TRUE;
242 }
243 }
244
245
246 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
247 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
248
249 //
250 // Build the Trie table for mapping UChar32 values to the corresponding
251 // range group number
252 //
Jungshik Shinb3189662017-11-07 11:18:34 -0800253 fTrie = utrie2_open(0, // Initial value for all code points.
254 0, // Error value for out-of-range input.
255 fStatus);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000256
Jungshik Shinb3189662017-11-07 11:18:34 -0800257 for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) {
258 utrie2_setRange32(fTrie,
259 rlRange->fStartChar, // Range start
260 rlRange->fEndChar, // Range end (inclusive)
261 rlRange->fNum, // value for range
262 TRUE, // Overwrite previously written values
263 fStatus);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000264 }
265}
266
267
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000268//-----------------------------------------------------------------------------------
269//
270// getTrieSize() Return the size that will be required to serialize the Trie.
271//
272//-----------------------------------------------------------------------------------
Jungshik Shinb3189662017-11-07 11:18:34 -0800273int32_t RBBISetBuilder::getTrieSize() {
274 if (U_FAILURE(*fStatus)) {
275 return 0;
276 }
277 utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus);
278 fTrieSize = utrie2_serialize(fTrie,
279 NULL, // Buffer
280 0, // Capacity
281 fStatus);
282 if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
283 *fStatus = U_ZERO_ERROR;
284 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000285 // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
286 return fTrieSize;
287}
288
289
290//-----------------------------------------------------------------------------------
291//
292// serializeTrie() Put the serialized trie at the specified address.
293// Trust the caller to have given us enough memory.
294// getTrieSize() MUST be called first.
295//
296//-----------------------------------------------------------------------------------
297void RBBISetBuilder::serializeTrie(uint8_t *where) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800298 utrie2_serialize(fTrie,
299 where, // Buffer
300 fTrieSize, // Capacity
301 fStatus);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000302}
303
304//------------------------------------------------------------------------
305//
306// addValToSets Add a runtime-mapped input value to each uset from a
307// list of uset nodes. (val corresponds to a state table column.)
308// For each of the original Unicode sets - which correspond
309// directly to uset nodes - a logically equivalent expression
310// is constructed in terms of the remapped runtime input
311// symbol set. This function adds one runtime input symbol to
312// a list of sets.
313//
314// The "logically equivalent expression" is the tree for an
315// or-ing together of all of the symbols that go into the set.
316//
317//------------------------------------------------------------------------
318void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
319 int32_t ix;
320
321 for (ix=0; ix<sets->size(); ix++) {
322 RBBINode *usetNode = (RBBINode *)sets->elementAt(ix);
323 addValToSet(usetNode, val);
324 }
325}
326
327void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
328 RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
329 if (leafNode == NULL) {
330 *fStatus = U_MEMORY_ALLOCATION_ERROR;
331 return;
332 }
333 leafNode->fVal = (unsigned short)val;
334 if (usetNode->fLeftChild == NULL) {
335 usetNode->fLeftChild = leafNode;
336 leafNode->fParent = usetNode;
337 } else {
338 // There are already input symbols present for this set.
339 // Set up an OR node, with the previous stuff as the left child
340 // and the new value as the right child.
341 RBBINode *orNode = new RBBINode(RBBINode::opOr);
342 if (orNode == NULL) {
343 *fStatus = U_MEMORY_ALLOCATION_ERROR;
344 return;
345 }
346 orNode->fLeftChild = usetNode->fLeftChild;
347 orNode->fRightChild = leafNode;
348 orNode->fLeftChild->fParent = orNode;
349 orNode->fRightChild->fParent = orNode;
350 usetNode->fLeftChild = orNode;
351 orNode->fParent = usetNode;
352 }
353}
354
355
356//------------------------------------------------------------------------
357//
358// getNumCharCategories
359//
360//------------------------------------------------------------------------
361int32_t RBBISetBuilder::getNumCharCategories() const {
362 return fGroupCount + 3;
363}
364
365
366//------------------------------------------------------------------------
367//
368// sawBOF
369//
370//------------------------------------------------------------------------
371UBool RBBISetBuilder::sawBOF() const {
372 return fSawBOF;
373}
374
375
376//------------------------------------------------------------------------
377//
378// getFirstChar Given a runtime RBBI character category, find
379// the first UChar32 that is in the set of chars
380// in the category.
381//------------------------------------------------------------------------
382UChar32 RBBISetBuilder::getFirstChar(int32_t category) const {
383 RangeDescriptor *rlRange;
384 UChar32 retVal = (UChar32)-1;
385 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
386 if (rlRange->fNum == category) {
387 retVal = rlRange->fStartChar;
388 break;
389 }
390 }
391 return retVal;
392}
393
394
395
396//------------------------------------------------------------------------
397//
398// printRanges A debugging function.
399// dump out all of the range definitions.
400//
401//------------------------------------------------------------------------
402#ifdef RBBI_DEBUG
403void RBBISetBuilder::printRanges() {
404 RangeDescriptor *rlRange;
405 int i;
406
407 RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
408 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
409 RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar);
410
411 for (i=0; i<rlRange->fIncludesSets->size(); i++) {
412 RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
413 UnicodeString setName = UNICODE_STRING("anon", 4);
414 RBBINode *setRef = usetNode->fParent;
415 if (setRef != NULL) {
416 RBBINode *varRef = setRef->fParent;
417 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
418 setName = varRef->fText;
419 }
420 }
421 RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
422 }
423 RBBIDebugPrintf("\n");
424 }
425}
426#endif
427
428
429//------------------------------------------------------------------------
430//
431// printRangeGroups A debugging function.
432// dump out all of the range groups.
433//
434//------------------------------------------------------------------------
435#ifdef RBBI_DEBUG
436void RBBISetBuilder::printRangeGroups() {
437 RangeDescriptor *rlRange;
438 RangeDescriptor *tRange;
439 int i;
440 int lastPrintedGroupNum = 0;
441
442 RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
443 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
444 int groupNum = rlRange->fNum & 0xbfff;
445 if (groupNum > lastPrintedGroupNum) {
446 lastPrintedGroupNum = groupNum;
447 RBBIDebugPrintf("%2i ", groupNum);
448
449 if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");}
450
451 for (i=0; i<rlRange->fIncludesSets->size(); i++) {
452 RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
453 UnicodeString setName = UNICODE_STRING("anon", 4);
454 RBBINode *setRef = usetNode->fParent;
455 if (setRef != NULL) {
456 RBBINode *varRef = setRef->fParent;
457 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
458 setName = varRef->fText;
459 }
460 }
461 RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
462 }
463
464 i = 0;
465 for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) {
466 if (tRange->fNum == rlRange->fNum) {
467 if (i++ % 5 == 0) {
468 RBBIDebugPrintf("\n ");
469 }
470 RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar);
471 }
472 }
473 RBBIDebugPrintf("\n");
474 }
475 }
476 RBBIDebugPrintf("\n");
477}
478#endif
479
480
481//------------------------------------------------------------------------
482//
483// printSets A debugging function.
484// dump out all of the set definitions.
485//
486//------------------------------------------------------------------------
487#ifdef RBBI_DEBUG
488void RBBISetBuilder::printSets() {
489 int i;
490
491 RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
492 for (i=0; ; i++) {
493 RBBINode *usetNode;
494 RBBINode *setRef;
495 RBBINode *varRef;
496 UnicodeString setName;
497
498 usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i);
499 if (usetNode == NULL) {
500 break;
501 }
502
503 RBBIDebugPrintf("%3d ", i);
504 setName = UNICODE_STRING("anonymous", 9);
505 setRef = usetNode->fParent;
506 if (setRef != NULL) {
507 varRef = setRef->fParent;
508 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
509 setName = varRef->fText;
510 }
511 }
512 RBBI_DEBUG_printUnicodeString(setName);
513 RBBIDebugPrintf(" ");
514 RBBI_DEBUG_printUnicodeString(usetNode->fText);
515 RBBIDebugPrintf("\n");
516 if (usetNode->fLeftChild != NULL) {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700517 RBBINode::printTree(usetNode->fLeftChild, TRUE);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000518 }
519 }
520 RBBIDebugPrintf("\n");
521}
522#endif
523
524
525
526//-------------------------------------------------------------------------------------
527//
528// RangeDescriptor copy constructor
529//
530//-------------------------------------------------------------------------------------
531
532RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) {
533 int i;
534
535 this->fStartChar = other.fStartChar;
536 this->fEndChar = other.fEndChar;
537 this->fNum = other.fNum;
538 this->fNext = NULL;
539 UErrorCode oldstatus = status;
540 this->fIncludesSets = new UVector(status);
541 if (U_FAILURE(oldstatus)) {
542 status = oldstatus;
543 }
544 if (U_FAILURE(status)) {
545 return;
546 }
547 /* test for NULL */
548 if (this->fIncludesSets == 0) {
549 status = U_MEMORY_ALLOCATION_ERROR;
550 return;
551 }
552
553 for (i=0; i<other.fIncludesSets->size(); i++) {
554 this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status);
555 }
556}
557
558
559//-------------------------------------------------------------------------------------
560//
561// RangeDesriptor default constructor
562//
563//-------------------------------------------------------------------------------------
564RangeDescriptor::RangeDescriptor(UErrorCode &status) {
565 this->fStartChar = 0;
566 this->fEndChar = 0;
567 this->fNum = 0;
568 this->fNext = NULL;
569 UErrorCode oldstatus = status;
570 this->fIncludesSets = new UVector(status);
571 if (U_FAILURE(oldstatus)) {
572 status = oldstatus;
573 }
574 if (U_FAILURE(status)) {
575 return;
576 }
577 /* test for NULL */
578 if(this->fIncludesSets == 0) {
579 status = U_MEMORY_ALLOCATION_ERROR;
580 return;
581 }
582
583}
584
585
586//-------------------------------------------------------------------------------------
587//
588// RangeDesriptor Destructor
589//
590//-------------------------------------------------------------------------------------
591RangeDescriptor::~RangeDescriptor() {
592 delete fIncludesSets;
593 fIncludesSets = NULL;
594}
595
596//-------------------------------------------------------------------------------------
597//
598// RangeDesriptor::split()
599//
600//-------------------------------------------------------------------------------------
601void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
602 U_ASSERT(where>fStartChar && where<=fEndChar);
603 RangeDescriptor *nr = new RangeDescriptor(*this, status);
604 if(nr == 0) {
605 status = U_MEMORY_ALLOCATION_ERROR;
606 return;
607 }
608 if (U_FAILURE(status)) {
609 delete nr;
610 return;
611 }
612 // RangeDescriptor copy constructor copies all fields.
613 // Only need to update those that are different after the split.
614 nr->fStartChar = where;
615 this->fEndChar = where-1;
616 nr->fNext = this->fNext;
617 this->fNext = nr;
618}
619
620
621//-------------------------------------------------------------------------------------
622//
623// RangeDescriptor::setDictionaryFlag
624//
625// Character Category Numbers that include characters from
626// the original Unicode Set named "dictionary" have bit 14
627// set to 1. The RBBI runtime engine uses this to trigger
628// use of the word dictionary.
629//
630// This function looks through the Unicode Sets that it
631// (the range) includes, and sets the bit in fNum when
632// "dictionary" is among them.
633//
634// TODO: a faster way would be to find the set node for
635// "dictionary" just once, rather than looking it
636// up by name every time.
637//
638//-------------------------------------------------------------------------------------
639void RangeDescriptor::setDictionaryFlag() {
640 int i;
641
642 for (i=0; i<this->fIncludesSets->size(); i++) {
643 RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i);
644 UnicodeString setName;
645 RBBINode *setRef = usetNode->fParent;
646 if (setRef != NULL) {
647 RBBINode *varRef = setRef->fParent;
648 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
649 setName = varRef->fText;
650 }
651 }
652 if (setName.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals.
653 this->fNum |= 0x4000;
654 break;
655 }
656 }
657}
658
659
660
661U_NAMESPACE_END
662
663#endif /* #if !UCONFIG_NO_BREAK_ITERATION */