Blame - source/common/rbbiscan.cpp - chromium.googlesource.com/chromium/deps/icu

blob: dd33aaeb46db71d51495f8299ec65661236c8bfa [file] [log] [blame]

jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1
				2	//
				3	// file: rbbiscan.cpp
				4	//
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	5	// Copyright (C) 2002-2014, International Business Machines Corporation and others.
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	6	// All Rights Reserved.
				7	//
				8	// This file contains the Rule Based Break Iterator Rule Builder functions for
				9	// scanning the rules and assembling a parse tree. This is the first phase
				10	// of compiling the rules.
				11	//
				12	// The overall of the rules is managed by class RBBIRuleBuilder, which will
				13	// create and use an instance of this class as part of the process.
				14	//
				15
				16	#include "unicode/utypes.h"
				17
				18	#if !UCONFIG_NO_BREAK_ITERATION
				19
				20	#include "unicode/unistr.h"
				21	#include "unicode/uniset.h"
				22	#include "unicode/uchar.h"
				23	#include "unicode/uchriter.h"
				24	#include "unicode/parsepos.h"
				25	#include "unicode/parseerr.h"
				26	#include "cmemory.h"
				27	#include "cstring.h"
				28
				29	#include "rbbirpt.h" // Contains state table for the rbbi rules parser.
				30	// generated by a Perl script.
				31	#include "rbbirb.h"
				32	#include "rbbinode.h"
				33	#include "rbbiscan.h"
				34	#include "rbbitblb.h"
				35
				36	#include "uassert.h"
				37
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	38	//------------------------------------------------------------------------------
				39	//
				40	// Unicode Set init strings for each of the character classes needed for parsing a rule file.
				41	// (Initialized with hex values for portability to EBCDIC based machines.
				42	// Really ugly, but there's no good way to avoid it.)
				43	//
				44	// The sets are referred to by name in the rbbirpt.txt, which is the
				45	// source form of the state transition table for the RBBI rule parser.
				46	//
				47	//------------------------------------------------------------------------------
				48	static const UChar gRuleSet_rule_char_pattern[] = {
				49	// [ ^ [ \ p { Z } \ u 0 0 2 0
				50	0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,
				51	// - \ u 0 0 7 f ] - [ \ p
				52	0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70,
				53	// { L } ] - [ \ p { N } ] ]
				54	0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0};
				55
				56	static const UChar gRuleSet_name_char_pattern[] = {
				57	// [ _ \ p { L } \ p { N } ]
				58	0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};
				59
				60	static const UChar gRuleSet_digit_char_pattern[] = {
				61	// [ 0 - 9 ]
				62	0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
				63
				64	static const UChar gRuleSet_name_start_char_pattern[] = {
				65	// [ _ \ p { L } ]
				66	0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };
				67
				68	static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any"
				69
				70
				71	U_CDECL_BEGIN
				72	static void U_CALLCONV RBBISetTable_deleter(void *p) {
				73	icu::RBBISetTableEl px = (icu::RBBISetTableEl )p;
				74	delete px->key;
				75	// Note: px->val is owned by the linked list "fSetsListHead" in scanner.
				76	// Don't delete the value nodes here.
				77	uprv_free(px);
				78	}
				79	U_CDECL_END
				80
				81	U_NAMESPACE_BEGIN
				82
				83	//------------------------------------------------------------------------------
				84	//
				85	// Constructor.
				86	//
				87	//------------------------------------------------------------------------------
				88	RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
				89	{
				90	fRB = rb;
				91	fStackPtr = 0;
				92	fStack[fStackPtr] = 0;
				93	fNodeStackPtr = 0;
				94	fRuleNum = 0;
				95	fNodeStack[0] = NULL;
				96
				97	fSymbolTable = NULL;
				98	fSetTable = NULL;
				99
				100	fScanIndex = 0;
				101	fNextIndex = 0;
				102
				103	fReverseRule = FALSE;
				104	fLookAheadRule = FALSE;
				105
				106	fLineNum = 1;
				107	fCharNum = 0;
				108	fQuoteMode = FALSE;
				109
				110	// Do not check status until after all critical fields are sufficiently initialized
				111	// that the destructor can run cleanly.
				112	if (U_FAILURE(*rb->fStatus)) {
				113	return;
				114	}
				115
				116	//
				117	// Set up the constant Unicode Sets.
				118	// Note: These could be made static, lazily initialized, and shared among
				119	// all instances of RBBIRuleScanners. BUT this is quite a bit simpler,
				120	// and the time to build these few sets should be small compared to a
				121	// full break iterator build.
				122	fRuleSets[kRuleSet_rule_char-128]
				123	= UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), *rb->fStatus);
				124	// fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:]
				125	fRuleSets[kRuleSet_white_space-128].
				126	add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029);
				127	fRuleSets[kRuleSet_name_char-128]
				128	= UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), *rb->fStatus);
				129	fRuleSets[kRuleSet_name_start_char-128]
				130	= UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus);
				131	fRuleSets[kRuleSet_digit_char-128]
				132	= UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), *rb->fStatus);
				133	if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) {
				134	// This case happens if ICU's data is missing. UnicodeSet tries to look up property
				135	// names from the init string, can't find them, and claims an illegal argument.
				136	// Change the error so that the actual problem will be clearer to users.
				137	*rb->fStatus = U_BRK_INIT_ERROR;
				138	}
				139	if (U_FAILURE(*rb->fStatus)) {
				140	return;
				141	}
				142
				143	fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus);
				144	if (fSymbolTable == NULL) {
				145	*rb->fStatus = U_MEMORY_ALLOCATION_ERROR;
				146	return;
				147	}
				148	fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus);
				149	if (U_FAILURE(*rb->fStatus)) {
				150	return;
				151	}
				152	uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);
				153	}
				154
				155
				156
				157	//------------------------------------------------------------------------------
				158	//
				159	// Destructor
				160	//
				161	//------------------------------------------------------------------------------
				162	RBBIRuleScanner::~RBBIRuleScanner() {
				163	delete fSymbolTable;
				164	if (fSetTable != NULL) {
				165	uhash_close(fSetTable);
				166	fSetTable = NULL;
				167
				168	}
				169
				170
				171	// Node Stack.
				172	// Normally has one entry, which is the entire parse tree for the rules.
				173	// If errors occured, there may be additional subtrees left on the stack.
				174	while (fNodeStackPtr > 0) {
				175	delete fNodeStack[fNodeStackPtr];
				176	fNodeStackPtr--;
				177	}
				178
				179	}
				180
				181	//------------------------------------------------------------------------------
				182	//
				183	// doParseAction Do some action during rule parsing.
				184	// Called by the parse state machine.
				185	// Actions build the parse tree and Unicode Sets,
				186	// and maintain the parse stack for nested expressions.
				187	//
				188	// TODO: unify EParseAction and RBBI_RuleParseAction enum types.
				189	// They represent exactly the same thing. They're separate
				190	// only to work around enum forward declaration restrictions
				191	// in some compilers, while at the same time avoiding multiple
				192	// definitions problems. I'm sure that there's a better way.
				193	//
				194	//------------------------------------------------------------------------------
				195	UBool RBBIRuleScanner::doParseActions(int32_t action)
				196	{
				197	RBBINode *n = NULL;
				198
				199	UBool returnVal = TRUE;
				200
				201	switch (action) {
				202
				203	case doExprStart:
				204	pushNewNode(RBBINode::opStart);
				205	fRuleNum++;
				206	break;
				207
				208
				209	case doExprOrOperator:
				210	{
				211	fixOpStack(RBBINode::precOpCat);
				212	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
				213	RBBINode *orNode = pushNewNode(RBBINode::opOr);
				214	orNode->fLeftChild = operandNode;
				215	operandNode->fParent = orNode;
				216	}
				217	break;
				218
				219	case doExprCatOperator:
				220	// concatenation operator.
				221	// For the implicit concatenation of adjacent terms in an expression that are
				222	// not separated by any other operator. Action is invoked between the
				223	// actions for the two terms.
				224	{
				225	fixOpStack(RBBINode::precOpCat);
				226	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
				227	RBBINode *catNode = pushNewNode(RBBINode::opCat);
				228	catNode->fLeftChild = operandNode;
				229	operandNode->fParent = catNode;
				230	}
				231	break;
				232
				233	case doLParen:
				234	// Open Paren.
				235	// The openParen node is a dummy operation type with a low precedence,
				236	// which has the affect of ensuring that any real binary op that
				237	// follows within the parens binds more tightly to the operands than
				238	// stuff outside of the parens.
				239	pushNewNode(RBBINode::opLParen);
				240	break;
				241
				242	case doExprRParen:
				243	fixOpStack(RBBINode::precLParen);
				244	break;
				245
				246	case doNOP:
				247	break;
				248
				249	case doStartAssign:
				250	// We've just scanned "$variable = "
				251	// The top of the node stack has the $variable ref node.
				252
				253	// Save the start position of the RHS text in the StartExpression node
				254	// that precedes the $variableReference node on the stack.
				255	// This will eventually be used when saving the full $variable replacement
				256	// text as a string.
				257	n = fNodeStack[fNodeStackPtr-1];
				258	n->fFirstPos = fNextIndex; // move past the '='
				259
				260	// Push a new start-of-expression node; needed to keep parse of the
				261	// RHS expression happy.
				262	pushNewNode(RBBINode::opStart);
				263	break;
				264
				265
				266
				267
				268	case doEndAssign:
				269	{
				270	// We have reached the end of an assignement statement.
				271	// Current scan char is the ';' that terminates the assignment.
				272
				273	// Terminate expression, leaves expression parse tree rooted in TOS node.
				274	fixOpStack(RBBINode::precStart);
				275
				276	RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2];
				277	RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1];
				278	RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr];
				279
				280	// Save original text of right side of assignment, excluding the terminating ';'
				281	// in the root of the node for the right-hand-side expression.
				282	RHSExprNode->fFirstPos = startExprNode->fFirstPos;
				283	RHSExprNode->fLastPos = fScanIndex;
				284	fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);
				285
				286	// Expression parse tree becomes l. child of the $variable reference node.
				287	varRefNode->fLeftChild = RHSExprNode;
				288	RHSExprNode->fParent = varRefNode;
				289
				290	// Make a symbol table entry for the $variableRef node.
				291	fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);
				292	if (U_FAILURE(*fRB->fStatus)) {
				293	// This is a round-about way to get the parse position set
				294	// so that duplicate symbols error messages include a line number.
				295	UErrorCode t = *fRB->fStatus;
				296	*fRB->fStatus = U_ZERO_ERROR;
				297	error(t);
				298	}
				299
				300	// Clean up the stack.
				301	delete startExprNode;
				302	fNodeStackPtr-=3;
				303	break;
				304	}
				305
				306	case doEndOfRule:
				307	{
				308	fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression
				309	if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node.
				310	break;
				311	}
				312	#ifdef RBBI_DEBUG
				313	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
				314	#endif
				315	U_ASSERT(fNodeStackPtr == 1);
				316
				317	// If this rule includes a look-ahead '/', add a endMark node to the
				318	// expression tree.
				319	if (fLookAheadRule) {
				320	RBBINode *thisRule = fNodeStack[fNodeStackPtr];
				321	RBBINode *endNode = pushNewNode(RBBINode::endMark);
				322	RBBINode *catNode = pushNewNode(RBBINode::opCat);
				323	fNodeStackPtr -= 2;
				324	catNode->fLeftChild = thisRule;
				325	catNode->fRightChild = endNode;
				326	fNodeStack[fNodeStackPtr] = catNode;
				327	endNode->fVal = fRuleNum;
				328	endNode->fLookAheadEnd = TRUE;
				329	}
				330
				331	// All rule expressions are ORed together.
				332	// The ';' that terminates an expression really just functions as a '\|' with
				333	// a low operator prededence.
				334	//
				335	// Each of the four sets of rules are collected separately.
				336	// (forward, reverse, safe_forward, safe_reverse)
				337	// OR this rule into the appropriate group of them.
				338	//
				339	RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);
				340
				341	if (*destRules != NULL) {
				342	// This is not the first rule encounted.
				343	// OR previous stuff (from *destRules)
				344	// with the current rule expression (on the Node Stack)
				345	// with the resulting OR expression going to *destRules
				346	//
				347	RBBINode *thisRule = fNodeStack[fNodeStackPtr];
				348	RBBINode prevRules = destRules;
				349	RBBINode *orNode = pushNewNode(RBBINode::opOr);
				350	orNode->fLeftChild = prevRules;
				351	prevRules->fParent = orNode;
				352	orNode->fRightChild = thisRule;
				353	thisRule->fParent = orNode;
				354	*destRules = orNode;
				355	}
				356	else
				357	{
				358	// This is the first rule encountered (for this direction).
				359	// Just move its parse tree from the stack to *destRules.
				360	*destRules = fNodeStack[fNodeStackPtr];
				361	}
				362	fReverseRule = FALSE; // in preparation for the next rule.
				363	fLookAheadRule = FALSE;
				364	fNodeStackPtr = 0;
				365	}
				366	break;
				367
				368
				369	case doRuleError:
				370	error(U_BRK_RULE_SYNTAX);
				371	returnVal = FALSE;
				372	break;
				373
				374
				375	case doVariableNameExpectedErr:
				376	error(U_BRK_RULE_SYNTAX);
				377	break;
				378
				379
				380	//
				381	// Unary operands + ? *
				382	// These all appear after the operand to which they apply.
				383	// When we hit one, the operand (may be a whole sub expression)
				384	// will be on the top of the stack.
				385	// Unary Operator becomes TOS, with the old TOS as its one child.
				386	case doUnaryOpPlus:
				387	{
				388	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
				389	RBBINode *plusNode = pushNewNode(RBBINode::opPlus);
				390	plusNode->fLeftChild = operandNode;
				391	operandNode->fParent = plusNode;
				392	}
				393	break;
				394
				395	case doUnaryOpQuestion:
				396	{
				397	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
				398	RBBINode *qNode = pushNewNode(RBBINode::opQuestion);
				399	qNode->fLeftChild = operandNode;
				400	operandNode->fParent = qNode;
				401	}
				402	break;
				403
				404	case doUnaryOpStar:
				405	{
				406	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
				407	RBBINode *starNode = pushNewNode(RBBINode::opStar);
				408	starNode->fLeftChild = operandNode;
				409	operandNode->fParent = starNode;
				410	}
				411	break;
				412
				413	case doRuleChar:
				414	// A "Rule Character" is any single character that is a literal part
				415	// of the regular expression. Like a, b and c in the expression "(abc*) \| [:L:]"
				416	// These are pretty uncommon in break rules; the terms are more commonly
				417	// sets. To keep things uniform, treat these characters like as
				418	// sets that just happen to contain only one character.
				419	{
				420	n = pushNewNode(RBBINode::setRef);
				421	findSetFor(UnicodeString(fC.fChar), n);
				422	n->fFirstPos = fScanIndex;
				423	n->fLastPos = fNextIndex;
				424	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
				425	break;
				426	}
				427
				428	case doDotAny:
				429	// scanned a ".", meaning match any single character.
				430	{
				431	n = pushNewNode(RBBINode::setRef);
				432	findSetFor(UnicodeString(TRUE, kAny, 3), n);
				433	n->fFirstPos = fScanIndex;
				434	n->fLastPos = fNextIndex;
				435	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
				436	break;
				437	}
				438
				439	case doSlash:
				440	// Scanned a '/', which identifies a look-ahead break position in a rule.
				441	n = pushNewNode(RBBINode::lookAhead);
				442	n->fVal = fRuleNum;
				443	n->fFirstPos = fScanIndex;
				444	n->fLastPos = fNextIndex;
				445	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
				446	fLookAheadRule = TRUE;
				447	break;
				448
				449
				450	case doStartTagValue:
				451	// Scanned a '{', the opening delimiter for a tag value within a rule.
				452	n = pushNewNode(RBBINode::tag);
				453	n->fVal = 0;
				454	n->fFirstPos = fScanIndex;
				455	n->fLastPos = fNextIndex;
				456	break;
				457
				458	case doTagDigit:
				459	// Just scanned a decimal digit that's part of a tag value
				460	{
				461	n = fNodeStack[fNodeStackPtr];
				462	uint32_t v = u_charDigitValue(fC.fChar);
				463	U_ASSERT(v < 10);
				464	n->fVal = n->fVal*10 + v;
				465	break;
				466	}
				467
				468	case doTagValue:
				469	n = fNodeStack[fNodeStackPtr];
				470	n->fLastPos = fNextIndex;
				471	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
				472	break;
				473
				474	case doTagExpectedError:
				475	error(U_BRK_MALFORMED_RULE_TAG);
				476	returnVal = FALSE;
				477	break;
				478
				479	case doOptionStart:
				480	// Scanning a !!option. At the start of string.
				481	fOptionStart = fScanIndex;
				482	break;
				483
				484	case doOptionEnd:
				485	{
				486	UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
				487	if (opt == UNICODE_STRING("chain", 5)) {
				488	fRB->fChainRules = TRUE;
				489	} else if (opt == UNICODE_STRING("LBCMNoChain", 11)) {
				490	fRB->fLBCMNoChain = TRUE;
				491	} else if (opt == UNICODE_STRING("forward", 7)) {
				492	fRB->fDefaultTree = &fRB->fForwardTree;
				493	} else if (opt == UNICODE_STRING("reverse", 7)) {
				494	fRB->fDefaultTree = &fRB->fReverseTree;
				495	} else if (opt == UNICODE_STRING("safe_forward", 12)) {
				496	fRB->fDefaultTree = &fRB->fSafeFwdTree;
				497	} else if (opt == UNICODE_STRING("safe_reverse", 12)) {
				498	fRB->fDefaultTree = &fRB->fSafeRevTree;
				499	} else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {
				500	fRB->fLookAheadHardBreak = TRUE;
				501	} else {
				502	error(U_BRK_UNRECOGNIZED_OPTION);
				503	}
				504	}
				505	break;
				506
				507	case doReverseDir:
				508	fReverseRule = TRUE;
				509	break;
				510
				511	case doStartVariableName:
				512	n = pushNewNode(RBBINode::varRef);
				513	if (U_FAILURE(*fRB->fStatus)) {
				514	break;
				515	}
				516	n->fFirstPos = fScanIndex;
				517	break;
				518
				519	case doEndVariableName:
				520	n = fNodeStack[fNodeStackPtr];
				521	if (n==NULL \|\| n->fType != RBBINode::varRef) {
				522	error(U_BRK_INTERNAL_ERROR);
				523	break;
				524	}
				525	n->fLastPos = fScanIndex;
				526	fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);
				527	// Look the newly scanned name up in the symbol table
				528	// If there's an entry, set the l. child of the var ref to the replacement expression.
				529	// (We also pass through here when scanning assignments, but no harm is done, other
				530	// than a slight wasted effort that seems hard to avoid. Lookup will be null)
				531	n->fLeftChild = fSymbolTable->lookupNode(n->fText);
				532	break;
				533
				534	case doCheckVarDef:
				535	n = fNodeStack[fNodeStackPtr];
				536	if (n->fLeftChild == NULL) {
				537	error(U_BRK_UNDEFINED_VARIABLE);
				538	returnVal = FALSE;
				539	}
				540	break;
				541
				542	case doExprFinished:
				543	break;
				544
				545	case doRuleErrorAssignExpr:
				546	error(U_BRK_ASSIGN_ERROR);
				547	returnVal = FALSE;
				548	break;
				549
				550	case doExit:
				551	returnVal = FALSE;
				552	break;
				553
				554	case doScanUnicodeSet:
				555	scanSet();
				556	break;
				557
				558	default:
				559	error(U_BRK_INTERNAL_ERROR);
				560	returnVal = FALSE;
				561	break;
				562	}
				563	return returnVal;
				564	}
				565
				566
				567
				568
				569	//------------------------------------------------------------------------------
				570	//
				571	// Error Report a rule parse error.
				572	// Only report it if no previous error has been recorded.
				573	//
				574	//------------------------------------------------------------------------------
				575	void RBBIRuleScanner::error(UErrorCode e) {
				576	if (U_SUCCESS(*fRB->fStatus)) {
				577	*fRB->fStatus = e;
				578	if (fRB->fParseError) {
				579	fRB->fParseError->line = fLineNum;
				580	fRB->fParseError->offset = fCharNum;
				581	fRB->fParseError->preContext[0] = 0;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	582	fRB->fParseError->postContext[0] = 0;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	583	}
				584	}
				585	}
				586
				587
				588
				589
				590	//------------------------------------------------------------------------------
				591	//
				592	// fixOpStack The parse stack holds partially assembled chunks of the parse tree.
				593	// An entry on the stack may be as small as a single setRef node,
				594	// or as large as the parse tree
				595	// for an entire expression (this will be the one item left on the stack
				596	// when the parsing of an RBBI rule completes.
				597	//
				598	// This function is called when a binary operator is encountered.
				599	// It looks back up the stack for operators that are not yet associated
				600	// with a right operand, and if the precedence of the stacked operator >=
				601	// the precedence of the current operator, binds the operand left,
				602	// to the previously encountered operator.
				603	//
				604	//------------------------------------------------------------------------------
				605	void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
				606	RBBINode *n;
				607	// printNodeStack("entering fixOpStack()");
				608	for (;;) {
				609	n = fNodeStack[fNodeStackPtr-1]; // an operator node
				610	if (n->fPrecedence == 0) {
				611	RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");
				612	error(U_BRK_INTERNAL_ERROR);
				613	return;
				614	}
				615
				616	if (n->fPrecedence < p \|\| n->fPrecedence <= RBBINode::precLParen) {
				617	// The most recent operand goes with the current operator,
				618	// not with the previously stacked one.
				619	break;
				620	}
				621	// Stack operator is a binary op ( '\|' or concatenation)
				622	// TOS operand becomes right child of this operator.
				623	// Resulting subexpression becomes the TOS operand.
				624	n->fRightChild = fNodeStack[fNodeStackPtr];
				625	fNodeStack[fNodeStackPtr]->fParent = n;
				626	fNodeStackPtr--;
				627	// printNodeStack("looping in fixOpStack() ");
				628	}
				629
				630	if (p <= RBBINode::precLParen) {
				631	// Scan is at a right paren or end of expression.
				632	// The scanned item must match the stack, or else there was an error.
				633	// Discard the left paren (or start expr) node from the stack,
				634	// leaving the completed (sub)expression as TOS.
				635	if (n->fPrecedence != p) {
				636	// Right paren encountered matched start of expression node, or
				637	// end of expression matched with a left paren node.
				638	error(U_BRK_MISMATCHED_PAREN);
				639	}
				640	fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
				641	fNodeStackPtr--;
				642	// Delete the now-discarded LParen or Start node.
				643	delete n;
				644	}
				645	// printNodeStack("leaving fixOpStack()");
				646	}
				647
				648
				649
				650
				651	//------------------------------------------------------------------------------
				652	//
				653	// findSetFor given a UnicodeString,
				654	// - find the corresponding Unicode Set (uset node)
				655	// (create one if necessary)
				656	// - Set fLeftChild of the caller's node (should be a setRef node)
				657	// to the uset node
				658	// Maintain a hash table of uset nodes, so the same one is always used
				659	// for the same string.
				660	// If a "to adopt" set is provided and we haven't seen this key before,
				661	// add the provided set to the hash table.
				662	// If the string is one (32 bit) char in length, the set contains
				663	// just one element which is the char in question.
				664	// If the string is "any", return a set containing all chars.
				665	//
				666	//------------------------------------------------------------------------------
				667	void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode node, UnicodeSet setToAdopt) {
				668
				669	RBBISetTableEl *el;
				670
				671	// First check whether we've already cached a set for this string.
				672	// If so, just use the cached set in the new node.
				673	// delete any set provided by the caller, since we own it.
				674	el = (RBBISetTableEl *)uhash_get(fSetTable, &s);
				675	if (el != NULL) {
				676	delete setToAdopt;
				677	node->fLeftChild = el->val;
				678	U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
				679	return;
				680	}
				681
				682	// Haven't seen this set before.
				683	// If the caller didn't provide us with a prebuilt set,
				684	// create a new UnicodeSet now.
				685	if (setToAdopt == NULL) {
				686	if (s.compare(kAny, -1) == 0) {
				687	setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
				688	} else {
				689	UChar32 c;
				690	c = s.char32At(0);
				691	setToAdopt = new UnicodeSet(c, c);
				692	}
				693	}
				694
				695	//
				696	// Make a new uset node to refer to this UnicodeSet
				697	// This new uset node becomes the child of the caller's setReference node.
				698	//
				699	RBBINode *usetNode = new RBBINode(RBBINode::uset);
				700	if (usetNode == NULL) {
				701	error(U_MEMORY_ALLOCATION_ERROR);
				702	return;
				703	}
				704	usetNode->fInputSet = setToAdopt;
				705	usetNode->fParent = node;
				706	node->fLeftChild = usetNode;
				707	usetNode->fText = s;
				708
				709
				710	//
				711	// Add the new uset node to the list of all uset nodes.
				712	//
				713	fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);
				714
				715
				716	//
				717	// Add the new set to the set hash table.
				718	//
				719	el = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl));
				720	UnicodeString *tkey = new UnicodeString(s);
				721	if (tkey == NULL \|\| el == NULL \|\| setToAdopt == NULL) {
				722	// Delete to avoid memory leak
				723	delete tkey;
				724	tkey = NULL;
				725	uprv_free(el);
				726	el = NULL;
				727	delete setToAdopt;
				728	setToAdopt = NULL;
				729
				730	error(U_MEMORY_ALLOCATION_ERROR);
				731	return;
				732	}
				733	el->key = tkey;
				734	el->val = usetNode;
				735	uhash_put(fSetTable, el->key, el, fRB->fStatus);
				736
				737	return;
				738	}
				739
				740
				741
				742	//
				743	// Assorted Unicode character constants.
				744	// Numeric because there is no portable way to enter them as literals.
				745	// (Think EBCDIC).
				746	//
				747	static const UChar chCR = 0x0d; // New lines, for terminating comments.
				748	static const UChar chLF = 0x0a;
				749	static const UChar chNEL = 0x85; // NEL newline variant
				750	static const UChar chLS = 0x2028; // Unicode Line Separator
				751	static const UChar chApos = 0x27; // single quote, for quoted chars.
				752	static const UChar chPound = 0x23; // '#', introduces a comment.
				753	static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
				754	static const UChar chLParen = 0x28;
				755	static const UChar chRParen = 0x29;
				756
				757
				758	//------------------------------------------------------------------------------
				759	//
				760	// stripRules Return a rules string without unnecessary
				761	// characters.
				762	//
				763	//------------------------------------------------------------------------------
				764	UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
				765	UnicodeString strippedRules;
				766	int rulesLength = rules.length();
				767	for (int idx = 0; idx < rulesLength; ) {
				768	UChar ch = rules[idx++];
				769	if (ch == chPound) {
				770	while (idx < rulesLength
				771	&& ch != chCR && ch != chLF && ch != chNEL)
				772	{
				773	ch = rules[idx++];
				774	}
				775	}
				776	if (!u_isISOControl(ch)) {
				777	strippedRules.append(ch);
				778	}
				779	}
				780	// strippedRules = strippedRules.unescape();
				781	return strippedRules;
				782	}
				783
				784
				785	//------------------------------------------------------------------------------
				786	//
				787	// nextCharLL Low Level Next Char from rule input source.
				788	// Get a char from the input character iterator,
				789	// keep track of input position for error reporting.
				790	//
				791	//------------------------------------------------------------------------------
				792	UChar32 RBBIRuleScanner::nextCharLL() {
				793	UChar32 ch;
				794
				795	if (fNextIndex >= fRB->fRules.length()) {
				796	return (UChar32)-1;
				797	}
				798	ch = fRB->fRules.char32At(fNextIndex);
				799	fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
				800
				801	if (ch == chCR \|\|
				802	ch == chNEL \|\|
				803	ch == chLS \|\|
				804	(ch == chLF && fLastChar != chCR)) {
				805	// Character is starting a new line. Bump up the line number, and
				806	// reset the column to 0.
				807	fLineNum++;
				808	fCharNum=0;
				809	if (fQuoteMode) {
				810	error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
				811	fQuoteMode = FALSE;
				812	}
				813	}
				814	else {
				815	// Character is not starting a new line. Except in the case of a
				816	// LF following a CR, increment the column position.
				817	if (ch != chLF) {
				818	fCharNum++;
				819	}
				820	}
				821	fLastChar = ch;
				822	return ch;
				823	}
				824
				825
				826	//------------------------------------------------------------------------------
				827	//
				828	// nextChar for rules scanning. At this level, we handle stripping
				829	// out comments and processing backslash character escapes.
				830	// The rest of the rules grammar is handled at the next level up.
				831	//
				832	//------------------------------------------------------------------------------
				833	void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
				834
				835	// Unicode Character constants needed for the processing done by nextChar(),
				836	// in hex because literals wont work on EBCDIC machines.
				837
				838	fScanIndex = fNextIndex;
				839	c.fChar = nextCharLL();
				840	c.fEscaped = FALSE;
				841
				842	//
				843	// check for '' sequence.
				844	// These are recognized in all contexts, whether in quoted text or not.
				845	//
				846	if (c.fChar == chApos) {
				847	if (fRB->fRules.char32At(fNextIndex) == chApos) {
				848	c.fChar = nextCharLL(); // get nextChar officially so character counts
				849	c.fEscaped = TRUE; // stay correct.
				850	}
				851	else
				852	{
				853	// Single quote, by itself.
				854	// Toggle quoting mode.
				855	// Return either '(' or ')', because quotes cause a grouping of the quoted text.
				856	fQuoteMode = !fQuoteMode;
				857	if (fQuoteMode == TRUE) {
				858	c.fChar = chLParen;
				859	} else {
				860	c.fChar = chRParen;
				861	}
				862	c.fEscaped = FALSE; // The paren that we return is not escaped.
				863	return;
				864	}
				865	}
				866
				867	if (fQuoteMode) {
				868	c.fEscaped = TRUE;
				869	}
				870	else
				871	{
				872	// We are not in a 'quoted region' of the source.
				873	//
				874	if (c.fChar == chPound) {
				875	// Start of a comment. Consume the rest of it.
				876	// The new-line char that terminates the comment is always returned.
				877	// It will be treated as white-space, and serves to break up anything
				878	// that might otherwise incorrectly clump together with a comment in
				879	// the middle (a variable name, for example.)
				880	for (;;) {
				881	c.fChar = nextCharLL();
				882	if (c.fChar == (UChar32)-1 \|\| // EOF
				883	c.fChar == chCR \|\|
				884	c.fChar == chLF \|\|
				885	c.fChar == chNEL \|\|
				886	c.fChar == chLS) {break;}
				887	}
				888	}
				889	if (c.fChar == (UChar32)-1) {
				890	return;
				891	}
				892
				893	//
				894	// check for backslash escaped characters.
				895	// Use UnicodeString::unescapeAt() to handle them.
				896	//
				897	if (c.fChar == chBackSlash) {
				898	c.fEscaped = TRUE;
				899	int32_t startX = fNextIndex;
				900	c.fChar = fRB->fRules.unescapeAt(fNextIndex);
				901	if (fNextIndex == startX) {
				902	error(U_BRK_HEX_DIGITS_EXPECTED);
				903	}
				904	fCharNum += fNextIndex-startX;
				905	}
				906	}
				907	// putc(c.fChar, stdout);
				908	}
				909
				910	//------------------------------------------------------------------------------
				911	//
				912	// Parse RBBI rules. The state machine for rules parsing is here.
				913	// The state tables are hand-written in the file rbbirpt.txt,
				914	// and converted to the form used here by a perl
				915	// script rbbicst.pl
				916	//
				917	//------------------------------------------------------------------------------
				918	void RBBIRuleScanner::parse() {
				919	uint16_t state;
				920	const RBBIRuleTableEl *tableEl;
				921
				922	if (U_FAILURE(*fRB->fStatus)) {
				923	return;
				924	}
				925
				926	state = 1;
				927	nextChar(fC);
				928	//
				929	// Main loop for the rule parsing state machine.
				930	// Runs once per state transition.
				931	// Each time through optionally performs, depending on the state table,
				932	// - an advance to the the next input char
				933	// - an action to be performed.
				934	// - pushing or popping a state to/from the local state return stack.
				935	//
				936	for (;;) {
				937	// Bail out if anything has gone wrong.
				938	// RBBI rule file parsing stops on the first error encountered.
				939	if (U_FAILURE(*fRB->fStatus)) {
				940	break;
				941	}
				942
				943	// Quit if state == 0. This is the normal way to exit the state machine.
				944	//
				945	if (state == 0) {
				946	break;
				947	}
				948
				949	// Find the state table element that matches the input char from the rule, or the
				950	// class of the input character. Start with the first table row for this
				951	// state, then linearly scan forward until we find a row that matches the
				952	// character. The last row for each state always matches all characters, so
				953	// the search will stop there, if not before.
				954	//
				955	tableEl = &gRuleParseStateTable[state];
				956	#ifdef RBBI_DEBUG
				957	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
				958	RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ",
				959	fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
				960	}
				961	#endif
				962
				963	for (;;) {
				964	#ifdef RBBI_DEBUG
				965	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");}
				966	#endif
				967	if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) {
				968	// Table row specified an individual character, not a set, and
				969	// the input character is not escaped, and
				970	// the input character matched it.
				971	break;
				972	}
				973	if (tableEl->fCharClass == 255) {
				974	// Table row specified default, match anything character class.
				975	break;
				976	}
				977	if (tableEl->fCharClass == 254 && fC.fEscaped) {
				978	// Table row specified "escaped" and the char was escaped.
				979	break;
				980	}
				981	if (tableEl->fCharClass == 253 && fC.fEscaped &&
				982	(fC.fChar == 0x50 \|\| fC.fChar == 0x70 )) {
				983	// Table row specified "escaped P" and the char is either 'p' or 'P'.
				984	break;
				985	}
				986	if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) {
				987	// Table row specified eof and we hit eof on the input.
				988	break;
				989	}
				990
				991	if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
				992	fC.fEscaped == FALSE && // char is not escaped &&
				993	fC.fChar != (UChar32)-1) { // char is not EOF
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	994	U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets));
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	995	if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
				996	// Table row specified a character class, or set of characters,
				997	// and the current char matches it.
				998	break;
				999	}
				1000	}
				1001
				1002	// No match on this row, advance to the next row for this state,
				1003	tableEl++;
				1004	}
				1005	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}
				1006
				1007	//
				1008	// We've found the row of the state table that matches the current input
				1009	// character from the rules string.
				1010	// Perform any action specified by this row in the state table.
				1011	if (doParseActions((int32_t)tableEl->fAction) == FALSE) {
				1012	// Break out of the state machine loop if the
				1013	// the action signalled some kind of error, or
				1014	// the action was to exit, occurs on normal end-of-rules-input.
				1015	break;
				1016	}
				1017
				1018	if (tableEl->fPushState != 0) {
				1019	fStackPtr++;
				1020	if (fStackPtr >= kStackSize) {
				1021	error(U_BRK_INTERNAL_ERROR);
				1022	RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
				1023	fStackPtr--;
				1024	}
				1025	fStack[fStackPtr] = tableEl->fPushState;
				1026	}
				1027
				1028	if (tableEl->fNextChar) {
				1029	nextChar(fC);
				1030	}
				1031
				1032	// Get the next state from the table entry, or from the
				1033	// state stack if the next state was specified as "pop".
				1034	if (tableEl->fNextState != 255) {
				1035	state = tableEl->fNextState;
				1036	} else {
				1037	state = fStack[fStackPtr];
				1038	fStackPtr--;
				1039	if (fStackPtr < 0) {
				1040	error(U_BRK_INTERNAL_ERROR);
				1041	RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
				1042	fStackPtr++;
				1043	}
				1044	}
				1045
				1046	}
				1047
				1048	//
				1049	// If there were NO user specified reverse rules, set up the equivalent of ".*;"
				1050	//
				1051	if (fRB->fReverseTree == NULL) {
				1052	fRB->fReverseTree = pushNewNode(RBBINode::opStar);
				1053	RBBINode *operand = pushNewNode(RBBINode::setRef);
				1054	findSetFor(UnicodeString(TRUE, kAny, 3), operand);
				1055	fRB->fReverseTree->fLeftChild = operand;
				1056	operand->fParent = fRB->fReverseTree;
				1057	fNodeStackPtr -= 2;
				1058	}
				1059
				1060
				1061	//
				1062	// Parsing of the input RBBI rules is complete.
				1063	// We now have a parse tree for the rule expressions
				1064	// and a list of all UnicodeSets that are referenced.
				1065	//
				1066	#ifdef RBBI_DEBUG
				1067	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}
				1068	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree"))
				1069	{
				1070	RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
				1071	fRB->fForwardTree->printTree(TRUE);
				1072	RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
				1073	fRB->fReverseTree->printTree(TRUE);
				1074	RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
				1075	fRB->fSafeFwdTree->printTree(TRUE);
				1076	RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
				1077	fRB->fSafeRevTree->printTree(TRUE);
				1078	}
				1079	#endif
				1080	}
				1081
				1082
				1083	//------------------------------------------------------------------------------
				1084	//
				1085	// printNodeStack for debugging...
				1086	//
				1087	//------------------------------------------------------------------------------
				1088	#ifdef RBBI_DEBUG
				1089	void RBBIRuleScanner::printNodeStack(const char *title) {
				1090	int i;
				1091	RBBIDebugPrintf("%s. Dumping node stack...\n", title);
				1092	for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree(TRUE);}
				1093	}
				1094	#endif
				1095
				1096
				1097
				1098
				1099	//------------------------------------------------------------------------------
				1100	//
				1101	// pushNewNode create a new RBBINode of the specified type and push it
				1102	// onto the stack of nodes.
				1103	//
				1104	//------------------------------------------------------------------------------
				1105	RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) {
				1106	fNodeStackPtr++;
				1107	if (fNodeStackPtr >= kStackSize) {
				1108	error(U_BRK_INTERNAL_ERROR);
				1109	RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");
				1110	*fRB->fStatus = U_BRK_INTERNAL_ERROR;
				1111	return NULL;
				1112	}
				1113	fNodeStack[fNodeStackPtr] = new RBBINode(t);
				1114	if (fNodeStack[fNodeStackPtr] == NULL) {
				1115	*fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
				1116	}
				1117	return fNodeStack[fNodeStackPtr];
				1118	}
				1119
				1120
				1121
				1122	//------------------------------------------------------------------------------
				1123	//
				1124	// scanSet Construct a UnicodeSet from the text at the current scan
				1125	// position. Advance the scan position to the first character
				1126	// after the set.
				1127	//
				1128	// A new RBBI setref node referring to the set is pushed onto the node
				1129	// stack.
				1130	//
				1131	// The scan position is normally under the control of the state machine
				1132	// that controls rule parsing. UnicodeSets, however, are parsed by
				1133	// the UnicodeSet constructor, not by the RBBI rule parser.
				1134	//
				1135	//------------------------------------------------------------------------------
				1136	void RBBIRuleScanner::scanSet() {
				1137	UnicodeSet *uset;
				1138	ParsePosition pos;
				1139	int startPos;
				1140	int i;
				1141
				1142	if (U_FAILURE(*fRB->fStatus)) {
				1143	return;
				1144	}
				1145
				1146	pos.setIndex(fScanIndex);
				1147	startPos = fScanIndex;
				1148	UErrorCode localStatus = U_ZERO_ERROR;
				1149	uset = new UnicodeSet();
				1150	if (uset == NULL) {
				1151	localStatus = U_MEMORY_ALLOCATION_ERROR;
				1152	} else {
				1153	uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
				1154	}
				1155	if (U_FAILURE(localStatus)) {
				1156	// TODO: Get more accurate position of the error from UnicodeSet's return info.
				1157	// UnicodeSet appears to not be reporting correctly at this time.
				1158	#ifdef RBBI_DEBUG
				1159	RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
				1160	#endif
				1161	error(localStatus);
				1162	delete uset;
				1163	return;
				1164	}
				1165
				1166	// Verify that the set contains at least one code point.
				1167	//
				1168	U_ASSERT(uset!=NULL);
				1169	if (uset->isEmpty()) {
				1170	// This set is empty.
				1171	// Make it an error, because it almost certainly is not what the user wanted.
				1172	// Also, avoids having to think about corner cases in the tree manipulation code
				1173	// that occurs later on.
				1174	error(U_BRK_RULE_EMPTY_SET);
				1175	delete uset;
				1176	return;
				1177	}
				1178
				1179
				1180	// Advance the RBBI parse postion over the UnicodeSet pattern.
				1181	// Don't just set fScanIndex because the line/char positions maintained
				1182	// for error reporting would be thrown off.
				1183	i = pos.getIndex();
				1184	for (;;) {
				1185	if (fNextIndex >= i) {
				1186	break;
				1187	}
				1188	nextCharLL();
				1189	}
				1190
				1191	if (U_SUCCESS(*fRB->fStatus)) {
				1192	RBBINode *n;
				1193
				1194	n = pushNewNode(RBBINode::setRef);
				1195	n->fFirstPos = startPos;
				1196	n->fLastPos = fNextIndex;
				1197	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
				1198	// findSetFor() serves several purposes here:
				1199	// - Adopts storage for the UnicodeSet, will be responsible for deleting.
				1200	// - Mantains collection of all sets in use, needed later for establishing
				1201	// character categories for run time engine.
				1202	// - Eliminates mulitiple instances of the same set.
				1203	// - Creates a new uset node if necessary (if this isn't a duplicate.)
				1204	findSetFor(n->fText, n, uset);
				1205	}
				1206
				1207	}
				1208
				1209	U_NAMESPACE_END
				1210
				1211	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */