blob: 1dc6b704d01094c7233e9d3d5f3b19566b7d5b3e [file] [log] [blame]
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001//
2// file: rbbiscan.cpp
3//
Jungshik Shin70f82502016-01-29 00:32:36 -08004// Copyright (C) 2002-2015, International Business Machines Corporation and others.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00005// All Rights Reserved.
6//
7// This file contains the Rule Based Break Iterator Rule Builder functions for
8// scanning the rules and assembling a parse tree. This is the first phase
9// of compiling the rules.
10//
11// The overall of the rules is managed by class RBBIRuleBuilder, which will
12// create and use an instance of this class as part of the process.
13//
14
15#include "unicode/utypes.h"
16
17#if !UCONFIG_NO_BREAK_ITERATION
18
19#include "unicode/unistr.h"
20#include "unicode/uniset.h"
21#include "unicode/uchar.h"
22#include "unicode/uchriter.h"
23#include "unicode/parsepos.h"
24#include "unicode/parseerr.h"
25#include "cmemory.h"
26#include "cstring.h"
27
28#include "rbbirpt.h" // Contains state table for the rbbi rules parser.
29 // generated by a Perl script.
30#include "rbbirb.h"
31#include "rbbinode.h"
32#include "rbbiscan.h"
33#include "rbbitblb.h"
34
35#include "uassert.h"
36
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000037//------------------------------------------------------------------------------
38//
39// Unicode Set init strings for each of the character classes needed for parsing a rule file.
40// (Initialized with hex values for portability to EBCDIC based machines.
41// Really ugly, but there's no good way to avoid it.)
42//
43// The sets are referred to by name in the rbbirpt.txt, which is the
44// source form of the state transition table for the RBBI rule parser.
45//
46//------------------------------------------------------------------------------
47static const UChar gRuleSet_rule_char_pattern[] = {
48 // [ ^ [ \ p { Z } \ u 0 0 2 0
49 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,
50 // - \ u 0 0 7 f ] - [ \ p
51 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70,
52 // { L } ] - [ \ p { N } ] ]
53 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0};
54
55static const UChar gRuleSet_name_char_pattern[] = {
56// [ _ \ p { L } \ p { N } ]
57 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};
58
59static const UChar gRuleSet_digit_char_pattern[] = {
60// [ 0 - 9 ]
61 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
62
63static const UChar gRuleSet_name_start_char_pattern[] = {
64// [ _ \ p { L } ]
65 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };
66
67static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any"
68
69
70U_CDECL_BEGIN
71static void U_CALLCONV RBBISetTable_deleter(void *p) {
72 icu::RBBISetTableEl *px = (icu::RBBISetTableEl *)p;
73 delete px->key;
74 // Note: px->val is owned by the linked list "fSetsListHead" in scanner.
75 // Don't delete the value nodes here.
76 uprv_free(px);
77}
78U_CDECL_END
79
80U_NAMESPACE_BEGIN
81
82//------------------------------------------------------------------------------
83//
84// Constructor.
85//
86//------------------------------------------------------------------------------
87RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
88{
89 fRB = rb;
90 fStackPtr = 0;
91 fStack[fStackPtr] = 0;
92 fNodeStackPtr = 0;
93 fRuleNum = 0;
94 fNodeStack[0] = NULL;
95
96 fSymbolTable = NULL;
97 fSetTable = NULL;
98
99 fScanIndex = 0;
100 fNextIndex = 0;
101
102 fReverseRule = FALSE;
103 fLookAheadRule = FALSE;
104
105 fLineNum = 1;
106 fCharNum = 0;
107 fQuoteMode = FALSE;
108
109 // Do not check status until after all critical fields are sufficiently initialized
110 // that the destructor can run cleanly.
111 if (U_FAILURE(*rb->fStatus)) {
112 return;
113 }
114
115 //
116 // Set up the constant Unicode Sets.
117 // Note: These could be made static, lazily initialized, and shared among
118 // all instances of RBBIRuleScanners. BUT this is quite a bit simpler,
119 // and the time to build these few sets should be small compared to a
120 // full break iterator build.
121 fRuleSets[kRuleSet_rule_char-128]
122 = UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), *rb->fStatus);
123 // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:]
124 fRuleSets[kRuleSet_white_space-128].
125 add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029);
126 fRuleSets[kRuleSet_name_char-128]
127 = UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), *rb->fStatus);
128 fRuleSets[kRuleSet_name_start_char-128]
129 = UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus);
130 fRuleSets[kRuleSet_digit_char-128]
131 = UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), *rb->fStatus);
132 if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) {
133 // This case happens if ICU's data is missing. UnicodeSet tries to look up property
134 // names from the init string, can't find them, and claims an illegal argument.
135 // Change the error so that the actual problem will be clearer to users.
136 *rb->fStatus = U_BRK_INIT_ERROR;
137 }
138 if (U_FAILURE(*rb->fStatus)) {
139 return;
140 }
141
142 fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus);
143 if (fSymbolTable == NULL) {
144 *rb->fStatus = U_MEMORY_ALLOCATION_ERROR;
145 return;
146 }
147 fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus);
148 if (U_FAILURE(*rb->fStatus)) {
149 return;
150 }
151 uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);
152}
153
154
155
156//------------------------------------------------------------------------------
157//
158// Destructor
159//
160//------------------------------------------------------------------------------
161RBBIRuleScanner::~RBBIRuleScanner() {
162 delete fSymbolTable;
163 if (fSetTable != NULL) {
164 uhash_close(fSetTable);
165 fSetTable = NULL;
166
167 }
168
169
170 // Node Stack.
171 // Normally has one entry, which is the entire parse tree for the rules.
172 // If errors occured, there may be additional subtrees left on the stack.
173 while (fNodeStackPtr > 0) {
174 delete fNodeStack[fNodeStackPtr];
175 fNodeStackPtr--;
176 }
177
178}
179
180//------------------------------------------------------------------------------
181//
182// doParseAction Do some action during rule parsing.
183// Called by the parse state machine.
184// Actions build the parse tree and Unicode Sets,
185// and maintain the parse stack for nested expressions.
186//
187// TODO: unify EParseAction and RBBI_RuleParseAction enum types.
188// They represent exactly the same thing. They're separate
189// only to work around enum forward declaration restrictions
190// in some compilers, while at the same time avoiding multiple
191// definitions problems. I'm sure that there's a better way.
192//
193//------------------------------------------------------------------------------
194UBool RBBIRuleScanner::doParseActions(int32_t action)
195{
196 RBBINode *n = NULL;
197
198 UBool returnVal = TRUE;
199
200 switch (action) {
201
202 case doExprStart:
203 pushNewNode(RBBINode::opStart);
204 fRuleNum++;
205 break;
206
207
208 case doExprOrOperator:
209 {
210 fixOpStack(RBBINode::precOpCat);
211 RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
212 RBBINode *orNode = pushNewNode(RBBINode::opOr);
Jungshik Shin70f82502016-01-29 00:32:36 -0800213 if (U_FAILURE(*fRB->fStatus)) {
214 break;
215 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000216 orNode->fLeftChild = operandNode;
217 operandNode->fParent = orNode;
218 }
219 break;
220
221 case doExprCatOperator:
222 // concatenation operator.
223 // For the implicit concatenation of adjacent terms in an expression that are
224 // not separated by any other operator. Action is invoked between the
225 // actions for the two terms.
226 {
227 fixOpStack(RBBINode::precOpCat);
228 RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
229 RBBINode *catNode = pushNewNode(RBBINode::opCat);
Jungshik Shin70f82502016-01-29 00:32:36 -0800230 if (U_FAILURE(*fRB->fStatus)) {
231 break;
232 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000233 catNode->fLeftChild = operandNode;
234 operandNode->fParent = catNode;
235 }
236 break;
237
238 case doLParen:
239 // Open Paren.
240 // The openParen node is a dummy operation type with a low precedence,
241 // which has the affect of ensuring that any real binary op that
242 // follows within the parens binds more tightly to the operands than
243 // stuff outside of the parens.
244 pushNewNode(RBBINode::opLParen);
245 break;
246
247 case doExprRParen:
248 fixOpStack(RBBINode::precLParen);
249 break;
250
251 case doNOP:
252 break;
253
254 case doStartAssign:
255 // We've just scanned "$variable = "
256 // The top of the node stack has the $variable ref node.
257
258 // Save the start position of the RHS text in the StartExpression node
259 // that precedes the $variableReference node on the stack.
260 // This will eventually be used when saving the full $variable replacement
261 // text as a string.
262 n = fNodeStack[fNodeStackPtr-1];
263 n->fFirstPos = fNextIndex; // move past the '='
264
265 // Push a new start-of-expression node; needed to keep parse of the
266 // RHS expression happy.
267 pushNewNode(RBBINode::opStart);
268 break;
269
270
271
272
273 case doEndAssign:
274 {
275 // We have reached the end of an assignement statement.
276 // Current scan char is the ';' that terminates the assignment.
277
278 // Terminate expression, leaves expression parse tree rooted in TOS node.
279 fixOpStack(RBBINode::precStart);
280
281 RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2];
282 RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1];
283 RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr];
284
285 // Save original text of right side of assignment, excluding the terminating ';'
286 // in the root of the node for the right-hand-side expression.
287 RHSExprNode->fFirstPos = startExprNode->fFirstPos;
288 RHSExprNode->fLastPos = fScanIndex;
289 fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);
290
291 // Expression parse tree becomes l. child of the $variable reference node.
292 varRefNode->fLeftChild = RHSExprNode;
293 RHSExprNode->fParent = varRefNode;
294
295 // Make a symbol table entry for the $variableRef node.
296 fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);
297 if (U_FAILURE(*fRB->fStatus)) {
298 // This is a round-about way to get the parse position set
299 // so that duplicate symbols error messages include a line number.
300 UErrorCode t = *fRB->fStatus;
301 *fRB->fStatus = U_ZERO_ERROR;
302 error(t);
303 }
304
305 // Clean up the stack.
306 delete startExprNode;
307 fNodeStackPtr-=3;
308 break;
309 }
310
311 case doEndOfRule:
312 {
313 fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression
314 if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node.
315 break;
316 }
317#ifdef RBBI_DEBUG
318 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
319#endif
320 U_ASSERT(fNodeStackPtr == 1);
321
322 // If this rule includes a look-ahead '/', add a endMark node to the
323 // expression tree.
324 if (fLookAheadRule) {
325 RBBINode *thisRule = fNodeStack[fNodeStackPtr];
326 RBBINode *endNode = pushNewNode(RBBINode::endMark);
327 RBBINode *catNode = pushNewNode(RBBINode::opCat);
Jungshik Shin70f82502016-01-29 00:32:36 -0800328 if (U_FAILURE(*fRB->fStatus)) {
329 break;
330 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000331 fNodeStackPtr -= 2;
332 catNode->fLeftChild = thisRule;
333 catNode->fRightChild = endNode;
334 fNodeStack[fNodeStackPtr] = catNode;
335 endNode->fVal = fRuleNum;
336 endNode->fLookAheadEnd = TRUE;
337 }
338
339 // All rule expressions are ORed together.
340 // The ';' that terminates an expression really just functions as a '|' with
341 // a low operator prededence.
342 //
343 // Each of the four sets of rules are collected separately.
344 // (forward, reverse, safe_forward, safe_reverse)
345 // OR this rule into the appropriate group of them.
346 //
347 RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);
348
349 if (*destRules != NULL) {
350 // This is not the first rule encounted.
351 // OR previous stuff (from *destRules)
352 // with the current rule expression (on the Node Stack)
353 // with the resulting OR expression going to *destRules
354 //
355 RBBINode *thisRule = fNodeStack[fNodeStackPtr];
356 RBBINode *prevRules = *destRules;
357 RBBINode *orNode = pushNewNode(RBBINode::opOr);
Jungshik Shin70f82502016-01-29 00:32:36 -0800358 if (U_FAILURE(*fRB->fStatus)) {
359 break;
360 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000361 orNode->fLeftChild = prevRules;
362 prevRules->fParent = orNode;
363 orNode->fRightChild = thisRule;
364 thisRule->fParent = orNode;
365 *destRules = orNode;
366 }
367 else
368 {
369 // This is the first rule encountered (for this direction).
370 // Just move its parse tree from the stack to *destRules.
371 *destRules = fNodeStack[fNodeStackPtr];
372 }
373 fReverseRule = FALSE; // in preparation for the next rule.
374 fLookAheadRule = FALSE;
375 fNodeStackPtr = 0;
376 }
377 break;
378
379
380 case doRuleError:
381 error(U_BRK_RULE_SYNTAX);
382 returnVal = FALSE;
383 break;
384
385
386 case doVariableNameExpectedErr:
387 error(U_BRK_RULE_SYNTAX);
388 break;
389
390
391 //
392 // Unary operands + ? *
393 // These all appear after the operand to which they apply.
394 // When we hit one, the operand (may be a whole sub expression)
395 // will be on the top of the stack.
396 // Unary Operator becomes TOS, with the old TOS as its one child.
397 case doUnaryOpPlus:
398 {
399 RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
400 RBBINode *plusNode = pushNewNode(RBBINode::opPlus);
Jungshik Shin70f82502016-01-29 00:32:36 -0800401 if (U_FAILURE(*fRB->fStatus)) {
402 break;
403 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000404 plusNode->fLeftChild = operandNode;
405 operandNode->fParent = plusNode;
406 }
407 break;
408
409 case doUnaryOpQuestion:
410 {
411 RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
412 RBBINode *qNode = pushNewNode(RBBINode::opQuestion);
Jungshik Shin70f82502016-01-29 00:32:36 -0800413 if (U_FAILURE(*fRB->fStatus)) {
414 break;
415 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000416 qNode->fLeftChild = operandNode;
417 operandNode->fParent = qNode;
418 }
419 break;
420
421 case doUnaryOpStar:
422 {
423 RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
424 RBBINode *starNode = pushNewNode(RBBINode::opStar);
Jungshik Shin70f82502016-01-29 00:32:36 -0800425 if (U_FAILURE(*fRB->fStatus)) {
426 break;
427 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000428 starNode->fLeftChild = operandNode;
429 operandNode->fParent = starNode;
430 }
431 break;
432
433 case doRuleChar:
434 // A "Rule Character" is any single character that is a literal part
435 // of the regular expression. Like a, b and c in the expression "(abc*) | [:L:]"
436 // These are pretty uncommon in break rules; the terms are more commonly
437 // sets. To keep things uniform, treat these characters like as
438 // sets that just happen to contain only one character.
439 {
440 n = pushNewNode(RBBINode::setRef);
Jungshik Shin70f82502016-01-29 00:32:36 -0800441 if (U_FAILURE(*fRB->fStatus)) {
442 break;
443 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000444 findSetFor(UnicodeString(fC.fChar), n);
445 n->fFirstPos = fScanIndex;
446 n->fLastPos = fNextIndex;
447 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
448 break;
449 }
450
451 case doDotAny:
452 // scanned a ".", meaning match any single character.
453 {
454 n = pushNewNode(RBBINode::setRef);
Jungshik Shin70f82502016-01-29 00:32:36 -0800455 if (U_FAILURE(*fRB->fStatus)) {
456 break;
457 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000458 findSetFor(UnicodeString(TRUE, kAny, 3), n);
459 n->fFirstPos = fScanIndex;
460 n->fLastPos = fNextIndex;
461 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
462 break;
463 }
464
465 case doSlash:
466 // Scanned a '/', which identifies a look-ahead break position in a rule.
467 n = pushNewNode(RBBINode::lookAhead);
Jungshik Shin70f82502016-01-29 00:32:36 -0800468 if (U_FAILURE(*fRB->fStatus)) {
469 break;
470 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000471 n->fVal = fRuleNum;
472 n->fFirstPos = fScanIndex;
473 n->fLastPos = fNextIndex;
474 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
475 fLookAheadRule = TRUE;
476 break;
477
478
479 case doStartTagValue:
480 // Scanned a '{', the opening delimiter for a tag value within a rule.
481 n = pushNewNode(RBBINode::tag);
Jungshik Shin70f82502016-01-29 00:32:36 -0800482 if (U_FAILURE(*fRB->fStatus)) {
483 break;
484 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000485 n->fVal = 0;
486 n->fFirstPos = fScanIndex;
487 n->fLastPos = fNextIndex;
488 break;
489
490 case doTagDigit:
491 // Just scanned a decimal digit that's part of a tag value
492 {
493 n = fNodeStack[fNodeStackPtr];
494 uint32_t v = u_charDigitValue(fC.fChar);
495 U_ASSERT(v < 10);
496 n->fVal = n->fVal*10 + v;
497 break;
498 }
499
500 case doTagValue:
501 n = fNodeStack[fNodeStackPtr];
502 n->fLastPos = fNextIndex;
503 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
504 break;
505
506 case doTagExpectedError:
507 error(U_BRK_MALFORMED_RULE_TAG);
508 returnVal = FALSE;
509 break;
510
511 case doOptionStart:
512 // Scanning a !!option. At the start of string.
513 fOptionStart = fScanIndex;
514 break;
515
516 case doOptionEnd:
517 {
518 UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
519 if (opt == UNICODE_STRING("chain", 5)) {
520 fRB->fChainRules = TRUE;
521 } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) {
522 fRB->fLBCMNoChain = TRUE;
523 } else if (opt == UNICODE_STRING("forward", 7)) {
524 fRB->fDefaultTree = &fRB->fForwardTree;
525 } else if (opt == UNICODE_STRING("reverse", 7)) {
526 fRB->fDefaultTree = &fRB->fReverseTree;
527 } else if (opt == UNICODE_STRING("safe_forward", 12)) {
528 fRB->fDefaultTree = &fRB->fSafeFwdTree;
529 } else if (opt == UNICODE_STRING("safe_reverse", 12)) {
530 fRB->fDefaultTree = &fRB->fSafeRevTree;
531 } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {
532 fRB->fLookAheadHardBreak = TRUE;
533 } else {
534 error(U_BRK_UNRECOGNIZED_OPTION);
535 }
536 }
537 break;
538
539 case doReverseDir:
540 fReverseRule = TRUE;
541 break;
542
543 case doStartVariableName:
544 n = pushNewNode(RBBINode::varRef);
545 if (U_FAILURE(*fRB->fStatus)) {
546 break;
547 }
548 n->fFirstPos = fScanIndex;
549 break;
550
551 case doEndVariableName:
552 n = fNodeStack[fNodeStackPtr];
553 if (n==NULL || n->fType != RBBINode::varRef) {
554 error(U_BRK_INTERNAL_ERROR);
555 break;
556 }
557 n->fLastPos = fScanIndex;
558 fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);
559 // Look the newly scanned name up in the symbol table
560 // If there's an entry, set the l. child of the var ref to the replacement expression.
561 // (We also pass through here when scanning assignments, but no harm is done, other
562 // than a slight wasted effort that seems hard to avoid. Lookup will be null)
563 n->fLeftChild = fSymbolTable->lookupNode(n->fText);
564 break;
565
566 case doCheckVarDef:
567 n = fNodeStack[fNodeStackPtr];
568 if (n->fLeftChild == NULL) {
569 error(U_BRK_UNDEFINED_VARIABLE);
570 returnVal = FALSE;
571 }
572 break;
573
574 case doExprFinished:
575 break;
576
577 case doRuleErrorAssignExpr:
578 error(U_BRK_ASSIGN_ERROR);
579 returnVal = FALSE;
580 break;
581
582 case doExit:
583 returnVal = FALSE;
584 break;
585
586 case doScanUnicodeSet:
587 scanSet();
588 break;
589
590 default:
591 error(U_BRK_INTERNAL_ERROR);
592 returnVal = FALSE;
593 break;
594 }
Jungshik Shin70f82502016-01-29 00:32:36 -0800595 return returnVal && U_SUCCESS(*fRB->fStatus);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000596}
597
598
599
600
601//------------------------------------------------------------------------------
602//
603// Error Report a rule parse error.
604// Only report it if no previous error has been recorded.
605//
606//------------------------------------------------------------------------------
607void RBBIRuleScanner::error(UErrorCode e) {
608 if (U_SUCCESS(*fRB->fStatus)) {
609 *fRB->fStatus = e;
610 if (fRB->fParseError) {
611 fRB->fParseError->line = fLineNum;
612 fRB->fParseError->offset = fCharNum;
613 fRB->fParseError->preContext[0] = 0;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800614 fRB->fParseError->postContext[0] = 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000615 }
616 }
617}
618
619
620
621
622//------------------------------------------------------------------------------
623//
624// fixOpStack The parse stack holds partially assembled chunks of the parse tree.
625// An entry on the stack may be as small as a single setRef node,
626// or as large as the parse tree
627// for an entire expression (this will be the one item left on the stack
628// when the parsing of an RBBI rule completes.
629//
630// This function is called when a binary operator is encountered.
631// It looks back up the stack for operators that are not yet associated
632// with a right operand, and if the precedence of the stacked operator >=
633// the precedence of the current operator, binds the operand left,
634// to the previously encountered operator.
635//
636//------------------------------------------------------------------------------
637void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
638 RBBINode *n;
639 // printNodeStack("entering fixOpStack()");
640 for (;;) {
641 n = fNodeStack[fNodeStackPtr-1]; // an operator node
642 if (n->fPrecedence == 0) {
643 RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");
644 error(U_BRK_INTERNAL_ERROR);
645 return;
646 }
647
648 if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) {
649 // The most recent operand goes with the current operator,
650 // not with the previously stacked one.
651 break;
652 }
653 // Stack operator is a binary op ( '|' or concatenation)
654 // TOS operand becomes right child of this operator.
655 // Resulting subexpression becomes the TOS operand.
656 n->fRightChild = fNodeStack[fNodeStackPtr];
657 fNodeStack[fNodeStackPtr]->fParent = n;
658 fNodeStackPtr--;
659 // printNodeStack("looping in fixOpStack() ");
660 }
661
662 if (p <= RBBINode::precLParen) {
663 // Scan is at a right paren or end of expression.
664 // The scanned item must match the stack, or else there was an error.
665 // Discard the left paren (or start expr) node from the stack,
666 // leaving the completed (sub)expression as TOS.
667 if (n->fPrecedence != p) {
668 // Right paren encountered matched start of expression node, or
669 // end of expression matched with a left paren node.
670 error(U_BRK_MISMATCHED_PAREN);
671 }
672 fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
673 fNodeStackPtr--;
674 // Delete the now-discarded LParen or Start node.
675 delete n;
676 }
677 // printNodeStack("leaving fixOpStack()");
678}
679
680
681
682
683//------------------------------------------------------------------------------
684//
685// findSetFor given a UnicodeString,
686// - find the corresponding Unicode Set (uset node)
687// (create one if necessary)
688// - Set fLeftChild of the caller's node (should be a setRef node)
689// to the uset node
690// Maintain a hash table of uset nodes, so the same one is always used
691// for the same string.
692// If a "to adopt" set is provided and we haven't seen this key before,
693// add the provided set to the hash table.
694// If the string is one (32 bit) char in length, the set contains
695// just one element which is the char in question.
696// If the string is "any", return a set containing all chars.
697//
698//------------------------------------------------------------------------------
699void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) {
700
701 RBBISetTableEl *el;
702
703 // First check whether we've already cached a set for this string.
704 // If so, just use the cached set in the new node.
705 // delete any set provided by the caller, since we own it.
706 el = (RBBISetTableEl *)uhash_get(fSetTable, &s);
707 if (el != NULL) {
708 delete setToAdopt;
709 node->fLeftChild = el->val;
710 U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
711 return;
712 }
713
714 // Haven't seen this set before.
715 // If the caller didn't provide us with a prebuilt set,
716 // create a new UnicodeSet now.
717 if (setToAdopt == NULL) {
718 if (s.compare(kAny, -1) == 0) {
719 setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
720 } else {
721 UChar32 c;
722 c = s.char32At(0);
723 setToAdopt = new UnicodeSet(c, c);
724 }
725 }
726
727 //
728 // Make a new uset node to refer to this UnicodeSet
729 // This new uset node becomes the child of the caller's setReference node.
730 //
731 RBBINode *usetNode = new RBBINode(RBBINode::uset);
732 if (usetNode == NULL) {
733 error(U_MEMORY_ALLOCATION_ERROR);
734 return;
735 }
736 usetNode->fInputSet = setToAdopt;
737 usetNode->fParent = node;
738 node->fLeftChild = usetNode;
739 usetNode->fText = s;
740
741
742 //
743 // Add the new uset node to the list of all uset nodes.
744 //
745 fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);
746
747
748 //
749 // Add the new set to the set hash table.
750 //
751 el = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl));
752 UnicodeString *tkey = new UnicodeString(s);
753 if (tkey == NULL || el == NULL || setToAdopt == NULL) {
754 // Delete to avoid memory leak
755 delete tkey;
756 tkey = NULL;
757 uprv_free(el);
758 el = NULL;
759 delete setToAdopt;
760 setToAdopt = NULL;
761
762 error(U_MEMORY_ALLOCATION_ERROR);
763 return;
764 }
765 el->key = tkey;
766 el->val = usetNode;
767 uhash_put(fSetTable, el->key, el, fRB->fStatus);
768
769 return;
770}
771
772
773
774//
775// Assorted Unicode character constants.
776// Numeric because there is no portable way to enter them as literals.
777// (Think EBCDIC).
778//
779static const UChar chCR = 0x0d; // New lines, for terminating comments.
780static const UChar chLF = 0x0a;
781static const UChar chNEL = 0x85; // NEL newline variant
782static const UChar chLS = 0x2028; // Unicode Line Separator
783static const UChar chApos = 0x27; // single quote, for quoted chars.
784static const UChar chPound = 0x23; // '#', introduces a comment.
785static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
786static const UChar chLParen = 0x28;
787static const UChar chRParen = 0x29;
788
789
790//------------------------------------------------------------------------------
791//
792// stripRules Return a rules string without unnecessary
793// characters.
794//
795//------------------------------------------------------------------------------
796UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
797 UnicodeString strippedRules;
798 int rulesLength = rules.length();
799 for (int idx = 0; idx < rulesLength; ) {
800 UChar ch = rules[idx++];
801 if (ch == chPound) {
802 while (idx < rulesLength
803 && ch != chCR && ch != chLF && ch != chNEL)
804 {
805 ch = rules[idx++];
806 }
807 }
808 if (!u_isISOControl(ch)) {
809 strippedRules.append(ch);
810 }
811 }
812 // strippedRules = strippedRules.unescape();
813 return strippedRules;
814}
815
816
817//------------------------------------------------------------------------------
818//
819// nextCharLL Low Level Next Char from rule input source.
820// Get a char from the input character iterator,
821// keep track of input position for error reporting.
822//
823//------------------------------------------------------------------------------
824UChar32 RBBIRuleScanner::nextCharLL() {
825 UChar32 ch;
826
827 if (fNextIndex >= fRB->fRules.length()) {
828 return (UChar32)-1;
829 }
830 ch = fRB->fRules.char32At(fNextIndex);
831 fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
832
833 if (ch == chCR ||
834 ch == chNEL ||
835 ch == chLS ||
836 (ch == chLF && fLastChar != chCR)) {
837 // Character is starting a new line. Bump up the line number, and
838 // reset the column to 0.
839 fLineNum++;
840 fCharNum=0;
841 if (fQuoteMode) {
842 error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
843 fQuoteMode = FALSE;
844 }
845 }
846 else {
847 // Character is not starting a new line. Except in the case of a
848 // LF following a CR, increment the column position.
849 if (ch != chLF) {
850 fCharNum++;
851 }
852 }
853 fLastChar = ch;
854 return ch;
855}
856
857
858//------------------------------------------------------------------------------
859//
860// nextChar for rules scanning. At this level, we handle stripping
861// out comments and processing backslash character escapes.
862// The rest of the rules grammar is handled at the next level up.
863//
864//------------------------------------------------------------------------------
865void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
866
867 // Unicode Character constants needed for the processing done by nextChar(),
868 // in hex because literals wont work on EBCDIC machines.
869
870 fScanIndex = fNextIndex;
871 c.fChar = nextCharLL();
872 c.fEscaped = FALSE;
873
874 //
875 // check for '' sequence.
876 // These are recognized in all contexts, whether in quoted text or not.
877 //
878 if (c.fChar == chApos) {
879 if (fRB->fRules.char32At(fNextIndex) == chApos) {
880 c.fChar = nextCharLL(); // get nextChar officially so character counts
881 c.fEscaped = TRUE; // stay correct.
882 }
883 else
884 {
885 // Single quote, by itself.
886 // Toggle quoting mode.
887 // Return either '(' or ')', because quotes cause a grouping of the quoted text.
888 fQuoteMode = !fQuoteMode;
889 if (fQuoteMode == TRUE) {
890 c.fChar = chLParen;
891 } else {
892 c.fChar = chRParen;
893 }
894 c.fEscaped = FALSE; // The paren that we return is not escaped.
895 return;
896 }
897 }
898
899 if (fQuoteMode) {
900 c.fEscaped = TRUE;
901 }
902 else
903 {
904 // We are not in a 'quoted region' of the source.
905 //
906 if (c.fChar == chPound) {
907 // Start of a comment. Consume the rest of it.
908 // The new-line char that terminates the comment is always returned.
909 // It will be treated as white-space, and serves to break up anything
910 // that might otherwise incorrectly clump together with a comment in
911 // the middle (a variable name, for example.)
912 for (;;) {
913 c.fChar = nextCharLL();
914 if (c.fChar == (UChar32)-1 || // EOF
915 c.fChar == chCR ||
916 c.fChar == chLF ||
917 c.fChar == chNEL ||
918 c.fChar == chLS) {break;}
919 }
920 }
921 if (c.fChar == (UChar32)-1) {
922 return;
923 }
924
925 //
926 // check for backslash escaped characters.
927 // Use UnicodeString::unescapeAt() to handle them.
928 //
929 if (c.fChar == chBackSlash) {
930 c.fEscaped = TRUE;
931 int32_t startX = fNextIndex;
932 c.fChar = fRB->fRules.unescapeAt(fNextIndex);
933 if (fNextIndex == startX) {
934 error(U_BRK_HEX_DIGITS_EXPECTED);
935 }
936 fCharNum += fNextIndex-startX;
937 }
938 }
939 // putc(c.fChar, stdout);
940}
941
942//------------------------------------------------------------------------------
943//
944// Parse RBBI rules. The state machine for rules parsing is here.
945// The state tables are hand-written in the file rbbirpt.txt,
946// and converted to the form used here by a perl
947// script rbbicst.pl
948//
949//------------------------------------------------------------------------------
950void RBBIRuleScanner::parse() {
951 uint16_t state;
952 const RBBIRuleTableEl *tableEl;
953
954 if (U_FAILURE(*fRB->fStatus)) {
955 return;
956 }
957
958 state = 1;
959 nextChar(fC);
960 //
961 // Main loop for the rule parsing state machine.
962 // Runs once per state transition.
963 // Each time through optionally performs, depending on the state table,
964 // - an advance to the the next input char
965 // - an action to be performed.
966 // - pushing or popping a state to/from the local state return stack.
967 //
968 for (;;) {
969 // Bail out if anything has gone wrong.
970 // RBBI rule file parsing stops on the first error encountered.
971 if (U_FAILURE(*fRB->fStatus)) {
972 break;
973 }
974
975 // Quit if state == 0. This is the normal way to exit the state machine.
976 //
977 if (state == 0) {
978 break;
979 }
980
981 // Find the state table element that matches the input char from the rule, or the
982 // class of the input character. Start with the first table row for this
983 // state, then linearly scan forward until we find a row that matches the
984 // character. The last row for each state always matches all characters, so
985 // the search will stop there, if not before.
986 //
987 tableEl = &gRuleParseStateTable[state];
988 #ifdef RBBI_DEBUG
989 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
990 RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ",
991 fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
992 }
993 #endif
994
995 for (;;) {
996 #ifdef RBBI_DEBUG
997 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");}
998 #endif
999 if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) {
1000 // Table row specified an individual character, not a set, and
1001 // the input character is not escaped, and
1002 // the input character matched it.
1003 break;
1004 }
1005 if (tableEl->fCharClass == 255) {
1006 // Table row specified default, match anything character class.
1007 break;
1008 }
1009 if (tableEl->fCharClass == 254 && fC.fEscaped) {
1010 // Table row specified "escaped" and the char was escaped.
1011 break;
1012 }
1013 if (tableEl->fCharClass == 253 && fC.fEscaped &&
1014 (fC.fChar == 0x50 || fC.fChar == 0x70 )) {
1015 // Table row specified "escaped P" and the char is either 'p' or 'P'.
1016 break;
1017 }
1018 if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) {
1019 // Table row specified eof and we hit eof on the input.
1020 break;
1021 }
1022
1023 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
1024 fC.fEscaped == FALSE && // char is not escaped &&
1025 fC.fChar != (UChar32)-1) { // char is not EOF
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001026 U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001027 if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
1028 // Table row specified a character class, or set of characters,
1029 // and the current char matches it.
1030 break;
1031 }
1032 }
1033
1034 // No match on this row, advance to the next row for this state,
1035 tableEl++;
1036 }
1037 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}
1038
1039 //
1040 // We've found the row of the state table that matches the current input
1041 // character from the rules string.
1042 // Perform any action specified by this row in the state table.
1043 if (doParseActions((int32_t)tableEl->fAction) == FALSE) {
1044 // Break out of the state machine loop if the
1045 // the action signalled some kind of error, or
1046 // the action was to exit, occurs on normal end-of-rules-input.
1047 break;
1048 }
1049
1050 if (tableEl->fPushState != 0) {
1051 fStackPtr++;
1052 if (fStackPtr >= kStackSize) {
1053 error(U_BRK_INTERNAL_ERROR);
1054 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
1055 fStackPtr--;
1056 }
1057 fStack[fStackPtr] = tableEl->fPushState;
1058 }
1059
1060 if (tableEl->fNextChar) {
1061 nextChar(fC);
1062 }
1063
1064 // Get the next state from the table entry, or from the
1065 // state stack if the next state was specified as "pop".
1066 if (tableEl->fNextState != 255) {
1067 state = tableEl->fNextState;
1068 } else {
1069 state = fStack[fStackPtr];
1070 fStackPtr--;
1071 if (fStackPtr < 0) {
1072 error(U_BRK_INTERNAL_ERROR);
1073 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
1074 fStackPtr++;
1075 }
1076 }
1077
1078 }
1079
1080 //
1081 // If there were NO user specified reverse rules, set up the equivalent of ".*;"
1082 //
1083 if (fRB->fReverseTree == NULL) {
1084 fRB->fReverseTree = pushNewNode(RBBINode::opStar);
1085 RBBINode *operand = pushNewNode(RBBINode::setRef);
Jungshik Shin70f82502016-01-29 00:32:36 -08001086 if (U_FAILURE(*fRB->fStatus)) {
1087 return;
1088 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001089 findSetFor(UnicodeString(TRUE, kAny, 3), operand);
1090 fRB->fReverseTree->fLeftChild = operand;
1091 operand->fParent = fRB->fReverseTree;
1092 fNodeStackPtr -= 2;
1093 }
1094
1095
1096 //
1097 // Parsing of the input RBBI rules is complete.
1098 // We now have a parse tree for the rule expressions
1099 // and a list of all UnicodeSets that are referenced.
1100 //
1101#ifdef RBBI_DEBUG
1102 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}
1103 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree"))
1104 {
1105 RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
1106 fRB->fForwardTree->printTree(TRUE);
1107 RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
1108 fRB->fReverseTree->printTree(TRUE);
1109 RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
1110 fRB->fSafeFwdTree->printTree(TRUE);
1111 RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
1112 fRB->fSafeRevTree->printTree(TRUE);
1113 }
1114#endif
1115}
1116
1117
1118//------------------------------------------------------------------------------
1119//
1120// printNodeStack for debugging...
1121//
1122//------------------------------------------------------------------------------
1123#ifdef RBBI_DEBUG
1124void RBBIRuleScanner::printNodeStack(const char *title) {
1125 int i;
1126 RBBIDebugPrintf("%s. Dumping node stack...\n", title);
1127 for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree(TRUE);}
1128}
1129#endif
1130
1131
1132
1133
1134//------------------------------------------------------------------------------
1135//
1136// pushNewNode create a new RBBINode of the specified type and push it
1137// onto the stack of nodes.
1138//
1139//------------------------------------------------------------------------------
1140RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) {
Jungshik Shin70f82502016-01-29 00:32:36 -08001141 if (U_FAILURE(*fRB->fStatus)) {
1142 return NULL;
1143 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001144 fNodeStackPtr++;
1145 if (fNodeStackPtr >= kStackSize) {
1146 error(U_BRK_INTERNAL_ERROR);
1147 RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");
1148 *fRB->fStatus = U_BRK_INTERNAL_ERROR;
1149 return NULL;
1150 }
1151 fNodeStack[fNodeStackPtr] = new RBBINode(t);
1152 if (fNodeStack[fNodeStackPtr] == NULL) {
1153 *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
1154 }
1155 return fNodeStack[fNodeStackPtr];
1156}
1157
1158
1159
1160//------------------------------------------------------------------------------
1161//
1162// scanSet Construct a UnicodeSet from the text at the current scan
1163// position. Advance the scan position to the first character
1164// after the set.
1165//
1166// A new RBBI setref node referring to the set is pushed onto the node
1167// stack.
1168//
1169// The scan position is normally under the control of the state machine
1170// that controls rule parsing. UnicodeSets, however, are parsed by
1171// the UnicodeSet constructor, not by the RBBI rule parser.
1172//
1173//------------------------------------------------------------------------------
1174void RBBIRuleScanner::scanSet() {
1175 UnicodeSet *uset;
1176 ParsePosition pos;
1177 int startPos;
1178 int i;
1179
1180 if (U_FAILURE(*fRB->fStatus)) {
1181 return;
1182 }
1183
1184 pos.setIndex(fScanIndex);
1185 startPos = fScanIndex;
1186 UErrorCode localStatus = U_ZERO_ERROR;
1187 uset = new UnicodeSet();
1188 if (uset == NULL) {
1189 localStatus = U_MEMORY_ALLOCATION_ERROR;
1190 } else {
1191 uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
1192 }
1193 if (U_FAILURE(localStatus)) {
1194 // TODO: Get more accurate position of the error from UnicodeSet's return info.
1195 // UnicodeSet appears to not be reporting correctly at this time.
1196 #ifdef RBBI_DEBUG
1197 RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
1198 #endif
1199 error(localStatus);
1200 delete uset;
1201 return;
1202 }
1203
1204 // Verify that the set contains at least one code point.
1205 //
1206 U_ASSERT(uset!=NULL);
1207 if (uset->isEmpty()) {
1208 // This set is empty.
1209 // Make it an error, because it almost certainly is not what the user wanted.
1210 // Also, avoids having to think about corner cases in the tree manipulation code
1211 // that occurs later on.
1212 error(U_BRK_RULE_EMPTY_SET);
1213 delete uset;
1214 return;
1215 }
1216
1217
1218 // Advance the RBBI parse postion over the UnicodeSet pattern.
1219 // Don't just set fScanIndex because the line/char positions maintained
1220 // for error reporting would be thrown off.
1221 i = pos.getIndex();
1222 for (;;) {
1223 if (fNextIndex >= i) {
1224 break;
1225 }
1226 nextCharLL();
1227 }
1228
1229 if (U_SUCCESS(*fRB->fStatus)) {
1230 RBBINode *n;
1231
1232 n = pushNewNode(RBBINode::setRef);
Jungshik Shin70f82502016-01-29 00:32:36 -08001233 if (U_FAILURE(*fRB->fStatus)) {
1234 return;
1235 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001236 n->fFirstPos = startPos;
1237 n->fLastPos = fNextIndex;
1238 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
1239 // findSetFor() serves several purposes here:
1240 // - Adopts storage for the UnicodeSet, will be responsible for deleting.
1241 // - Mantains collection of all sets in use, needed later for establishing
1242 // character categories for run time engine.
1243 // - Eliminates mulitiple instances of the same set.
1244 // - Creates a new uset node if necessary (if this isn't a duplicate.)
1245 findSetFor(n->fText, n, uset);
1246 }
1247
1248}
1249
1250U_NAMESPACE_END
1251
1252#endif /* #if !UCONFIG_NO_BREAK_ITERATION */