Blame - source/common/rbbiscan.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 1dc6b704d01094c7233e9d3d5f3b19566b7d5b3e [file] [log] [blame]

jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1	//
				2	// file: rbbiscan.cpp
				3	//
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	4	// Copyright (C) 2002-2015, International Business Machines Corporation and others.
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	5	// All Rights Reserved.
				6	//
				7	// This file contains the Rule Based Break Iterator Rule Builder functions for
				8	// scanning the rules and assembling a parse tree. This is the first phase
				9	// of compiling the rules.
				10	//
				11	// The overall of the rules is managed by class RBBIRuleBuilder, which will
				12	// create and use an instance of this class as part of the process.
				13	//
				14
				15	#include "unicode/utypes.h"
				16
				17	#if !UCONFIG_NO_BREAK_ITERATION
				18
				19	#include "unicode/unistr.h"
				20	#include "unicode/uniset.h"
				21	#include "unicode/uchar.h"
				22	#include "unicode/uchriter.h"
				23	#include "unicode/parsepos.h"
				24	#include "unicode/parseerr.h"
				25	#include "cmemory.h"
				26	#include "cstring.h"
				27
				28	#include "rbbirpt.h" // Contains state table for the rbbi rules parser.
				29	// generated by a Perl script.
				30	#include "rbbirb.h"
				31	#include "rbbinode.h"
				32	#include "rbbiscan.h"
				33	#include "rbbitblb.h"
				34
				35	#include "uassert.h"
				36
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	37	//------------------------------------------------------------------------------
				38	//
				39	// Unicode Set init strings for each of the character classes needed for parsing a rule file.
				40	// (Initialized with hex values for portability to EBCDIC based machines.
				41	// Really ugly, but there's no good way to avoid it.)
				42	//
				43	// The sets are referred to by name in the rbbirpt.txt, which is the
				44	// source form of the state transition table for the RBBI rule parser.
				45	//
				46	//------------------------------------------------------------------------------
				47	static const UChar gRuleSet_rule_char_pattern[] = {
				48	// [ ^ [ \ p { Z } \ u 0 0 2 0
				49	0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,
				50	// - \ u 0 0 7 f ] - [ \ p
				51	0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70,
				52	// { L } ] - [ \ p { N } ] ]
				53	0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0};
				54
				55	static const UChar gRuleSet_name_char_pattern[] = {
				56	// [ _ \ p { L } \ p { N } ]
				57	0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};
				58
				59	static const UChar gRuleSet_digit_char_pattern[] = {
				60	// [ 0 - 9 ]
				61	0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
				62
				63	static const UChar gRuleSet_name_start_char_pattern[] = {
				64	// [ _ \ p { L } ]
				65	0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };
				66
				67	static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any"
				68
				69
				70	U_CDECL_BEGIN
				71	static void U_CALLCONV RBBISetTable_deleter(void *p) {
				72	icu::RBBISetTableEl px = (icu::RBBISetTableEl )p;
				73	delete px->key;
				74	// Note: px->val is owned by the linked list "fSetsListHead" in scanner.
				75	// Don't delete the value nodes here.
				76	uprv_free(px);
				77	}
				78	U_CDECL_END
				79
				80	U_NAMESPACE_BEGIN
				81
				82	//------------------------------------------------------------------------------
				83	//
				84	// Constructor.
				85	//
				86	//------------------------------------------------------------------------------
				87	RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
				88	{
				89	fRB = rb;
				90	fStackPtr = 0;
				91	fStack[fStackPtr] = 0;
				92	fNodeStackPtr = 0;
				93	fRuleNum = 0;
				94	fNodeStack[0] = NULL;
				95
				96	fSymbolTable = NULL;
				97	fSetTable = NULL;
				98
				99	fScanIndex = 0;
				100	fNextIndex = 0;
				101
				102	fReverseRule = FALSE;
				103	fLookAheadRule = FALSE;
				104
				105	fLineNum = 1;
				106	fCharNum = 0;
				107	fQuoteMode = FALSE;
				108
				109	// Do not check status until after all critical fields are sufficiently initialized
				110	// that the destructor can run cleanly.
				111	if (U_FAILURE(*rb->fStatus)) {
				112	return;
				113	}
				114
				115	//
				116	// Set up the constant Unicode Sets.
				117	// Note: These could be made static, lazily initialized, and shared among
				118	// all instances of RBBIRuleScanners. BUT this is quite a bit simpler,
				119	// and the time to build these few sets should be small compared to a
				120	// full break iterator build.
				121	fRuleSets[kRuleSet_rule_char-128]
				122	= UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), *rb->fStatus);
				123	// fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:]
				124	fRuleSets[kRuleSet_white_space-128].
				125	add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029);
				126	fRuleSets[kRuleSet_name_char-128]
				127	= UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), *rb->fStatus);
				128	fRuleSets[kRuleSet_name_start_char-128]
				129	= UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus);
				130	fRuleSets[kRuleSet_digit_char-128]
				131	= UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), *rb->fStatus);
				132	if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) {
				133	// This case happens if ICU's data is missing. UnicodeSet tries to look up property
				134	// names from the init string, can't find them, and claims an illegal argument.
				135	// Change the error so that the actual problem will be clearer to users.
				136	*rb->fStatus = U_BRK_INIT_ERROR;
				137	}
				138	if (U_FAILURE(*rb->fStatus)) {
				139	return;
				140	}
				141
				142	fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus);
				143	if (fSymbolTable == NULL) {
				144	*rb->fStatus = U_MEMORY_ALLOCATION_ERROR;
				145	return;
				146	}
				147	fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus);
				148	if (U_FAILURE(*rb->fStatus)) {
				149	return;
				150	}
				151	uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);
				152	}
				153
				154
				155
				156	//------------------------------------------------------------------------------
				157	//
				158	// Destructor
				159	//
				160	//------------------------------------------------------------------------------
				161	RBBIRuleScanner::~RBBIRuleScanner() {
				162	delete fSymbolTable;
				163	if (fSetTable != NULL) {
				164	uhash_close(fSetTable);
				165	fSetTable = NULL;
				166
				167	}
				168
				169
				170	// Node Stack.
				171	// Normally has one entry, which is the entire parse tree for the rules.
				172	// If errors occured, there may be additional subtrees left on the stack.
				173	while (fNodeStackPtr > 0) {
				174	delete fNodeStack[fNodeStackPtr];
				175	fNodeStackPtr--;
				176	}
				177
				178	}
				179
				180	//------------------------------------------------------------------------------
				181	//
				182	// doParseAction Do some action during rule parsing.
				183	// Called by the parse state machine.
				184	// Actions build the parse tree and Unicode Sets,
				185	// and maintain the parse stack for nested expressions.
				186	//
				187	// TODO: unify EParseAction and RBBI_RuleParseAction enum types.
				188	// They represent exactly the same thing. They're separate
				189	// only to work around enum forward declaration restrictions
				190	// in some compilers, while at the same time avoiding multiple
				191	// definitions problems. I'm sure that there's a better way.
				192	//
				193	//------------------------------------------------------------------------------
				194	UBool RBBIRuleScanner::doParseActions(int32_t action)
				195	{
				196	RBBINode *n = NULL;
				197
				198	UBool returnVal = TRUE;
				199
				200	switch (action) {
				201
				202	case doExprStart:
				203	pushNewNode(RBBINode::opStart);
				204	fRuleNum++;
				205	break;
				206
				207
				208	case doExprOrOperator:
				209	{
				210	fixOpStack(RBBINode::precOpCat);
				211	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
				212	RBBINode *orNode = pushNewNode(RBBINode::opOr);
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	213	if (U_FAILURE(*fRB->fStatus)) {
				214	break;
				215	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	216	orNode->fLeftChild = operandNode;
				217	operandNode->fParent = orNode;
				218	}
				219	break;
				220
				221	case doExprCatOperator:
				222	// concatenation operator.
				223	// For the implicit concatenation of adjacent terms in an expression that are
				224	// not separated by any other operator. Action is invoked between the
				225	// actions for the two terms.
				226	{
				227	fixOpStack(RBBINode::precOpCat);
				228	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
				229	RBBINode *catNode = pushNewNode(RBBINode::opCat);
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	230	if (U_FAILURE(*fRB->fStatus)) {
				231	break;
				232	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	233	catNode->fLeftChild = operandNode;
				234	operandNode->fParent = catNode;
				235	}
				236	break;
				237
				238	case doLParen:
				239	// Open Paren.
				240	// The openParen node is a dummy operation type with a low precedence,
				241	// which has the affect of ensuring that any real binary op that
				242	// follows within the parens binds more tightly to the operands than
				243	// stuff outside of the parens.
				244	pushNewNode(RBBINode::opLParen);
				245	break;
				246
				247	case doExprRParen:
				248	fixOpStack(RBBINode::precLParen);
				249	break;
				250
				251	case doNOP:
				252	break;
				253
				254	case doStartAssign:
				255	// We've just scanned "$variable = "
				256	// The top of the node stack has the $variable ref node.
				257
				258	// Save the start position of the RHS text in the StartExpression node
				259	// that precedes the $variableReference node on the stack.
				260	// This will eventually be used when saving the full $variable replacement
				261	// text as a string.
				262	n = fNodeStack[fNodeStackPtr-1];
				263	n->fFirstPos = fNextIndex; // move past the '='
				264
				265	// Push a new start-of-expression node; needed to keep parse of the
				266	// RHS expression happy.
				267	pushNewNode(RBBINode::opStart);
				268	break;
				269
				270
				271
				272
				273	case doEndAssign:
				274	{
				275	// We have reached the end of an assignement statement.
				276	// Current scan char is the ';' that terminates the assignment.
				277
				278	// Terminate expression, leaves expression parse tree rooted in TOS node.
				279	fixOpStack(RBBINode::precStart);
				280
				281	RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2];
				282	RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1];
				283	RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr];
				284
				285	// Save original text of right side of assignment, excluding the terminating ';'
				286	// in the root of the node for the right-hand-side expression.
				287	RHSExprNode->fFirstPos = startExprNode->fFirstPos;
				288	RHSExprNode->fLastPos = fScanIndex;
				289	fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);
				290
				291	// Expression parse tree becomes l. child of the $variable reference node.
				292	varRefNode->fLeftChild = RHSExprNode;
				293	RHSExprNode->fParent = varRefNode;
				294
				295	// Make a symbol table entry for the $variableRef node.
				296	fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);
				297	if (U_FAILURE(*fRB->fStatus)) {
				298	// This is a round-about way to get the parse position set
				299	// so that duplicate symbols error messages include a line number.
				300	UErrorCode t = *fRB->fStatus;
				301	*fRB->fStatus = U_ZERO_ERROR;
				302	error(t);
				303	}
				304
				305	// Clean up the stack.
				306	delete startExprNode;
				307	fNodeStackPtr-=3;
				308	break;
				309	}
				310
				311	case doEndOfRule:
				312	{
				313	fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression
				314	if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node.
				315	break;
				316	}
				317	#ifdef RBBI_DEBUG
				318	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
				319	#endif
				320	U_ASSERT(fNodeStackPtr == 1);
				321
				322	// If this rule includes a look-ahead '/', add a endMark node to the
				323	// expression tree.
				324	if (fLookAheadRule) {
				325	RBBINode *thisRule = fNodeStack[fNodeStackPtr];
				326	RBBINode *endNode = pushNewNode(RBBINode::endMark);
				327	RBBINode *catNode = pushNewNode(RBBINode::opCat);
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	328	if (U_FAILURE(*fRB->fStatus)) {
				329	break;
				330	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	331	fNodeStackPtr -= 2;
				332	catNode->fLeftChild = thisRule;
				333	catNode->fRightChild = endNode;
				334	fNodeStack[fNodeStackPtr] = catNode;
				335	endNode->fVal = fRuleNum;
				336	endNode->fLookAheadEnd = TRUE;
				337	}
				338
				339	// All rule expressions are ORed together.
				340	// The ';' that terminates an expression really just functions as a '\|' with
				341	// a low operator prededence.
				342	//
				343	// Each of the four sets of rules are collected separately.
				344	// (forward, reverse, safe_forward, safe_reverse)
				345	// OR this rule into the appropriate group of them.
				346	//
				347	RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);
				348
				349	if (*destRules != NULL) {
				350	// This is not the first rule encounted.
				351	// OR previous stuff (from *destRules)
				352	// with the current rule expression (on the Node Stack)
				353	// with the resulting OR expression going to *destRules
				354	//
				355	RBBINode *thisRule = fNodeStack[fNodeStackPtr];
				356	RBBINode prevRules = destRules;
				357	RBBINode *orNode = pushNewNode(RBBINode::opOr);
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	358	if (U_FAILURE(*fRB->fStatus)) {
				359	break;
				360	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	361	orNode->fLeftChild = prevRules;
				362	prevRules->fParent = orNode;
				363	orNode->fRightChild = thisRule;
				364	thisRule->fParent = orNode;
				365	*destRules = orNode;
				366	}
				367	else
				368	{
				369	// This is the first rule encountered (for this direction).
				370	// Just move its parse tree from the stack to *destRules.
				371	*destRules = fNodeStack[fNodeStackPtr];
				372	}
				373	fReverseRule = FALSE; // in preparation for the next rule.
				374	fLookAheadRule = FALSE;
				375	fNodeStackPtr = 0;
				376	}
				377	break;
				378
				379
				380	case doRuleError:
				381	error(U_BRK_RULE_SYNTAX);
				382	returnVal = FALSE;
				383	break;
				384
				385
				386	case doVariableNameExpectedErr:
				387	error(U_BRK_RULE_SYNTAX);
				388	break;
				389
				390
				391	//
				392	// Unary operands + ? *
				393	// These all appear after the operand to which they apply.
				394	// When we hit one, the operand (may be a whole sub expression)
				395	// will be on the top of the stack.
				396	// Unary Operator becomes TOS, with the old TOS as its one child.
				397	case doUnaryOpPlus:
				398	{
				399	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
				400	RBBINode *plusNode = pushNewNode(RBBINode::opPlus);
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	401	if (U_FAILURE(*fRB->fStatus)) {
				402	break;
				403	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	404	plusNode->fLeftChild = operandNode;
				405	operandNode->fParent = plusNode;
				406	}
				407	break;
				408
				409	case doUnaryOpQuestion:
				410	{
				411	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
				412	RBBINode *qNode = pushNewNode(RBBINode::opQuestion);
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	413	if (U_FAILURE(*fRB->fStatus)) {
				414	break;
				415	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	416	qNode->fLeftChild = operandNode;
				417	operandNode->fParent = qNode;
				418	}
				419	break;
				420
				421	case doUnaryOpStar:
				422	{
				423	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
				424	RBBINode *starNode = pushNewNode(RBBINode::opStar);
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	425	if (U_FAILURE(*fRB->fStatus)) {
				426	break;
				427	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	428	starNode->fLeftChild = operandNode;
				429	operandNode->fParent = starNode;
				430	}
				431	break;
				432
				433	case doRuleChar:
				434	// A "Rule Character" is any single character that is a literal part
				435	// of the regular expression. Like a, b and c in the expression "(abc*) \| [:L:]"
				436	// These are pretty uncommon in break rules; the terms are more commonly
				437	// sets. To keep things uniform, treat these characters like as
				438	// sets that just happen to contain only one character.
				439	{
				440	n = pushNewNode(RBBINode::setRef);
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	441	if (U_FAILURE(*fRB->fStatus)) {
				442	break;
				443	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	444	findSetFor(UnicodeString(fC.fChar), n);
				445	n->fFirstPos = fScanIndex;
				446	n->fLastPos = fNextIndex;
				447	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
				448	break;
				449	}
				450
				451	case doDotAny:
				452	// scanned a ".", meaning match any single character.
				453	{
				454	n = pushNewNode(RBBINode::setRef);
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	455	if (U_FAILURE(*fRB->fStatus)) {
				456	break;
				457	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	458	findSetFor(UnicodeString(TRUE, kAny, 3), n);
				459	n->fFirstPos = fScanIndex;
				460	n->fLastPos = fNextIndex;
				461	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
				462	break;
				463	}
				464
				465	case doSlash:
				466	// Scanned a '/', which identifies a look-ahead break position in a rule.
				467	n = pushNewNode(RBBINode::lookAhead);
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	468	if (U_FAILURE(*fRB->fStatus)) {
				469	break;
				470	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	471	n->fVal = fRuleNum;
				472	n->fFirstPos = fScanIndex;
				473	n->fLastPos = fNextIndex;
				474	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
				475	fLookAheadRule = TRUE;
				476	break;
				477
				478
				479	case doStartTagValue:
				480	// Scanned a '{', the opening delimiter for a tag value within a rule.
				481	n = pushNewNode(RBBINode::tag);
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	482	if (U_FAILURE(*fRB->fStatus)) {
				483	break;
				484	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	485	n->fVal = 0;
				486	n->fFirstPos = fScanIndex;
				487	n->fLastPos = fNextIndex;
				488	break;
				489
				490	case doTagDigit:
				491	// Just scanned a decimal digit that's part of a tag value
				492	{
				493	n = fNodeStack[fNodeStackPtr];
				494	uint32_t v = u_charDigitValue(fC.fChar);
				495	U_ASSERT(v < 10);
				496	n->fVal = n->fVal*10 + v;
				497	break;
				498	}
				499
				500	case doTagValue:
				501	n = fNodeStack[fNodeStackPtr];
				502	n->fLastPos = fNextIndex;
				503	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
				504	break;
				505
				506	case doTagExpectedError:
				507	error(U_BRK_MALFORMED_RULE_TAG);
				508	returnVal = FALSE;
				509	break;
				510
				511	case doOptionStart:
				512	// Scanning a !!option. At the start of string.
				513	fOptionStart = fScanIndex;
				514	break;
				515
				516	case doOptionEnd:
				517	{
				518	UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
				519	if (opt == UNICODE_STRING("chain", 5)) {
				520	fRB->fChainRules = TRUE;
				521	} else if (opt == UNICODE_STRING("LBCMNoChain", 11)) {
				522	fRB->fLBCMNoChain = TRUE;
				523	} else if (opt == UNICODE_STRING("forward", 7)) {
				524	fRB->fDefaultTree = &fRB->fForwardTree;
				525	} else if (opt == UNICODE_STRING("reverse", 7)) {
				526	fRB->fDefaultTree = &fRB->fReverseTree;
				527	} else if (opt == UNICODE_STRING("safe_forward", 12)) {
				528	fRB->fDefaultTree = &fRB->fSafeFwdTree;
				529	} else if (opt == UNICODE_STRING("safe_reverse", 12)) {
				530	fRB->fDefaultTree = &fRB->fSafeRevTree;
				531	} else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {
				532	fRB->fLookAheadHardBreak = TRUE;
				533	} else {
				534	error(U_BRK_UNRECOGNIZED_OPTION);
				535	}
				536	}
				537	break;
				538
				539	case doReverseDir:
				540	fReverseRule = TRUE;
				541	break;
				542
				543	case doStartVariableName:
				544	n = pushNewNode(RBBINode::varRef);
				545	if (U_FAILURE(*fRB->fStatus)) {
				546	break;
				547	}
				548	n->fFirstPos = fScanIndex;
				549	break;
				550
				551	case doEndVariableName:
				552	n = fNodeStack[fNodeStackPtr];
				553	if (n==NULL \|\| n->fType != RBBINode::varRef) {
				554	error(U_BRK_INTERNAL_ERROR);
				555	break;
				556	}
				557	n->fLastPos = fScanIndex;
				558	fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);
				559	// Look the newly scanned name up in the symbol table
				560	// If there's an entry, set the l. child of the var ref to the replacement expression.
				561	// (We also pass through here when scanning assignments, but no harm is done, other
				562	// than a slight wasted effort that seems hard to avoid. Lookup will be null)
				563	n->fLeftChild = fSymbolTable->lookupNode(n->fText);
				564	break;
				565
				566	case doCheckVarDef:
				567	n = fNodeStack[fNodeStackPtr];
				568	if (n->fLeftChild == NULL) {
				569	error(U_BRK_UNDEFINED_VARIABLE);
				570	returnVal = FALSE;
				571	}
				572	break;
				573
				574	case doExprFinished:
				575	break;
				576
				577	case doRuleErrorAssignExpr:
				578	error(U_BRK_ASSIGN_ERROR);
				579	returnVal = FALSE;
				580	break;
				581
				582	case doExit:
				583	returnVal = FALSE;
				584	break;
				585
				586	case doScanUnicodeSet:
				587	scanSet();
				588	break;
				589
				590	default:
				591	error(U_BRK_INTERNAL_ERROR);
				592	returnVal = FALSE;
				593	break;
				594	}
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	595	return returnVal && U_SUCCESS(*fRB->fStatus);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	596	}
				597
				598
				599
				600
				601	//------------------------------------------------------------------------------
				602	//
				603	// Error Report a rule parse error.
				604	// Only report it if no previous error has been recorded.
				605	//
				606	//------------------------------------------------------------------------------
				607	void RBBIRuleScanner::error(UErrorCode e) {
				608	if (U_SUCCESS(*fRB->fStatus)) {
				609	*fRB->fStatus = e;
				610	if (fRB->fParseError) {
				611	fRB->fParseError->line = fLineNum;
				612	fRB->fParseError->offset = fCharNum;
				613	fRB->fParseError->preContext[0] = 0;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	614	fRB->fParseError->postContext[0] = 0;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	615	}
				616	}
				617	}
				618
				619
				620
				621
				622	//------------------------------------------------------------------------------
				623	//
				624	// fixOpStack The parse stack holds partially assembled chunks of the parse tree.
				625	// An entry on the stack may be as small as a single setRef node,
				626	// or as large as the parse tree
				627	// for an entire expression (this will be the one item left on the stack
				628	// when the parsing of an RBBI rule completes.
				629	//
				630	// This function is called when a binary operator is encountered.
				631	// It looks back up the stack for operators that are not yet associated
				632	// with a right operand, and if the precedence of the stacked operator >=
				633	// the precedence of the current operator, binds the operand left,
				634	// to the previously encountered operator.
				635	//
				636	//------------------------------------------------------------------------------
				637	void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
				638	RBBINode *n;
				639	// printNodeStack("entering fixOpStack()");
				640	for (;;) {
				641	n = fNodeStack[fNodeStackPtr-1]; // an operator node
				642	if (n->fPrecedence == 0) {
				643	RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");
				644	error(U_BRK_INTERNAL_ERROR);
				645	return;
				646	}
				647
				648	if (n->fPrecedence < p \|\| n->fPrecedence <= RBBINode::precLParen) {
				649	// The most recent operand goes with the current operator,
				650	// not with the previously stacked one.
				651	break;
				652	}
				653	// Stack operator is a binary op ( '\|' or concatenation)
				654	// TOS operand becomes right child of this operator.
				655	// Resulting subexpression becomes the TOS operand.
				656	n->fRightChild = fNodeStack[fNodeStackPtr];
				657	fNodeStack[fNodeStackPtr]->fParent = n;
				658	fNodeStackPtr--;
				659	// printNodeStack("looping in fixOpStack() ");
				660	}
				661
				662	if (p <= RBBINode::precLParen) {
				663	// Scan is at a right paren or end of expression.
				664	// The scanned item must match the stack, or else there was an error.
				665	// Discard the left paren (or start expr) node from the stack,
				666	// leaving the completed (sub)expression as TOS.
				667	if (n->fPrecedence != p) {
				668	// Right paren encountered matched start of expression node, or
				669	// end of expression matched with a left paren node.
				670	error(U_BRK_MISMATCHED_PAREN);
				671	}
				672	fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
				673	fNodeStackPtr--;
				674	// Delete the now-discarded LParen or Start node.
				675	delete n;
				676	}
				677	// printNodeStack("leaving fixOpStack()");
				678	}
				679
				680
				681
				682
				683	//------------------------------------------------------------------------------
				684	//
				685	// findSetFor given a UnicodeString,
				686	// - find the corresponding Unicode Set (uset node)
				687	// (create one if necessary)
				688	// - Set fLeftChild of the caller's node (should be a setRef node)
				689	// to the uset node
				690	// Maintain a hash table of uset nodes, so the same one is always used
				691	// for the same string.
				692	// If a "to adopt" set is provided and we haven't seen this key before,
				693	// add the provided set to the hash table.
				694	// If the string is one (32 bit) char in length, the set contains
				695	// just one element which is the char in question.
				696	// If the string is "any", return a set containing all chars.
				697	//
				698	//------------------------------------------------------------------------------
				699	void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode node, UnicodeSet setToAdopt) {
				700
				701	RBBISetTableEl *el;
				702
				703	// First check whether we've already cached a set for this string.
				704	// If so, just use the cached set in the new node.
				705	// delete any set provided by the caller, since we own it.
				706	el = (RBBISetTableEl *)uhash_get(fSetTable, &s);
				707	if (el != NULL) {
				708	delete setToAdopt;
				709	node->fLeftChild = el->val;
				710	U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
				711	return;
				712	}
				713
				714	// Haven't seen this set before.
				715	// If the caller didn't provide us with a prebuilt set,
				716	// create a new UnicodeSet now.
				717	if (setToAdopt == NULL) {
				718	if (s.compare(kAny, -1) == 0) {
				719	setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
				720	} else {
				721	UChar32 c;
				722	c = s.char32At(0);
				723	setToAdopt = new UnicodeSet(c, c);
				724	}
				725	}
				726
				727	//
				728	// Make a new uset node to refer to this UnicodeSet
				729	// This new uset node becomes the child of the caller's setReference node.
				730	//
				731	RBBINode *usetNode = new RBBINode(RBBINode::uset);
				732	if (usetNode == NULL) {
				733	error(U_MEMORY_ALLOCATION_ERROR);
				734	return;
				735	}
				736	usetNode->fInputSet = setToAdopt;
				737	usetNode->fParent = node;
				738	node->fLeftChild = usetNode;
				739	usetNode->fText = s;
				740
				741
				742	//
				743	// Add the new uset node to the list of all uset nodes.
				744	//
				745	fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);
				746
				747
				748	//
				749	// Add the new set to the set hash table.
				750	//
				751	el = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl));
				752	UnicodeString *tkey = new UnicodeString(s);
				753	if (tkey == NULL \|\| el == NULL \|\| setToAdopt == NULL) {
				754	// Delete to avoid memory leak
				755	delete tkey;
				756	tkey = NULL;
				757	uprv_free(el);
				758	el = NULL;
				759	delete setToAdopt;
				760	setToAdopt = NULL;
				761
				762	error(U_MEMORY_ALLOCATION_ERROR);
				763	return;
				764	}
				765	el->key = tkey;
				766	el->val = usetNode;
				767	uhash_put(fSetTable, el->key, el, fRB->fStatus);
				768
				769	return;
				770	}
				771
				772
				773
				774	//
				775	// Assorted Unicode character constants.
				776	// Numeric because there is no portable way to enter them as literals.
				777	// (Think EBCDIC).
				778	//
				779	static const UChar chCR = 0x0d; // New lines, for terminating comments.
				780	static const UChar chLF = 0x0a;
				781	static const UChar chNEL = 0x85; // NEL newline variant
				782	static const UChar chLS = 0x2028; // Unicode Line Separator
				783	static const UChar chApos = 0x27; // single quote, for quoted chars.
				784	static const UChar chPound = 0x23; // '#', introduces a comment.
				785	static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
				786	static const UChar chLParen = 0x28;
				787	static const UChar chRParen = 0x29;
				788
				789
				790	//------------------------------------------------------------------------------
				791	//
				792	// stripRules Return a rules string without unnecessary
				793	// characters.
				794	//
				795	//------------------------------------------------------------------------------
				796	UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
				797	UnicodeString strippedRules;
				798	int rulesLength = rules.length();
				799	for (int idx = 0; idx < rulesLength; ) {
				800	UChar ch = rules[idx++];
				801	if (ch == chPound) {
				802	while (idx < rulesLength
				803	&& ch != chCR && ch != chLF && ch != chNEL)
				804	{
				805	ch = rules[idx++];
				806	}
				807	}
				808	if (!u_isISOControl(ch)) {
				809	strippedRules.append(ch);
				810	}
				811	}
				812	// strippedRules = strippedRules.unescape();
				813	return strippedRules;
				814	}
				815
				816
				817	//------------------------------------------------------------------------------
				818	//
				819	// nextCharLL Low Level Next Char from rule input source.
				820	// Get a char from the input character iterator,
				821	// keep track of input position for error reporting.
				822	//
				823	//------------------------------------------------------------------------------
				824	UChar32 RBBIRuleScanner::nextCharLL() {
				825	UChar32 ch;
				826
				827	if (fNextIndex >= fRB->fRules.length()) {
				828	return (UChar32)-1;
				829	}
				830	ch = fRB->fRules.char32At(fNextIndex);
				831	fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
				832
				833	if (ch == chCR \|\|
				834	ch == chNEL \|\|
				835	ch == chLS \|\|
				836	(ch == chLF && fLastChar != chCR)) {
				837	// Character is starting a new line. Bump up the line number, and
				838	// reset the column to 0.
				839	fLineNum++;
				840	fCharNum=0;
				841	if (fQuoteMode) {
				842	error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
				843	fQuoteMode = FALSE;
				844	}
				845	}
				846	else {
				847	// Character is not starting a new line. Except in the case of a
				848	// LF following a CR, increment the column position.
				849	if (ch != chLF) {
				850	fCharNum++;
				851	}
				852	}
				853	fLastChar = ch;
				854	return ch;
				855	}
				856
				857
				858	//------------------------------------------------------------------------------
				859	//
				860	// nextChar for rules scanning. At this level, we handle stripping
				861	// out comments and processing backslash character escapes.
				862	// The rest of the rules grammar is handled at the next level up.
				863	//
				864	//------------------------------------------------------------------------------
				865	void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
				866
				867	// Unicode Character constants needed for the processing done by nextChar(),
				868	// in hex because literals wont work on EBCDIC machines.
				869
				870	fScanIndex = fNextIndex;
				871	c.fChar = nextCharLL();
				872	c.fEscaped = FALSE;
				873
				874	//
				875	// check for '' sequence.
				876	// These are recognized in all contexts, whether in quoted text or not.
				877	//
				878	if (c.fChar == chApos) {
				879	if (fRB->fRules.char32At(fNextIndex) == chApos) {
				880	c.fChar = nextCharLL(); // get nextChar officially so character counts
				881	c.fEscaped = TRUE; // stay correct.
				882	}
				883	else
				884	{
				885	// Single quote, by itself.
				886	// Toggle quoting mode.
				887	// Return either '(' or ')', because quotes cause a grouping of the quoted text.
				888	fQuoteMode = !fQuoteMode;
				889	if (fQuoteMode == TRUE) {
				890	c.fChar = chLParen;
				891	} else {
				892	c.fChar = chRParen;
				893	}
				894	c.fEscaped = FALSE; // The paren that we return is not escaped.
				895	return;
				896	}
				897	}
				898
				899	if (fQuoteMode) {
				900	c.fEscaped = TRUE;
				901	}
				902	else
				903	{
				904	// We are not in a 'quoted region' of the source.
				905	//
				906	if (c.fChar == chPound) {
				907	// Start of a comment. Consume the rest of it.
				908	// The new-line char that terminates the comment is always returned.
				909	// It will be treated as white-space, and serves to break up anything
				910	// that might otherwise incorrectly clump together with a comment in
				911	// the middle (a variable name, for example.)
				912	for (;;) {
				913	c.fChar = nextCharLL();
				914	if (c.fChar == (UChar32)-1 \|\| // EOF
				915	c.fChar == chCR \|\|
				916	c.fChar == chLF \|\|
				917	c.fChar == chNEL \|\|
				918	c.fChar == chLS) {break;}
				919	}
				920	}
				921	if (c.fChar == (UChar32)-1) {
				922	return;
				923	}
				924
				925	//
				926	// check for backslash escaped characters.
				927	// Use UnicodeString::unescapeAt() to handle them.
				928	//
				929	if (c.fChar == chBackSlash) {
				930	c.fEscaped = TRUE;
				931	int32_t startX = fNextIndex;
				932	c.fChar = fRB->fRules.unescapeAt(fNextIndex);
				933	if (fNextIndex == startX) {
				934	error(U_BRK_HEX_DIGITS_EXPECTED);
				935	}
				936	fCharNum += fNextIndex-startX;
				937	}
				938	}
				939	// putc(c.fChar, stdout);
				940	}
				941
				942	//------------------------------------------------------------------------------
				943	//
				944	// Parse RBBI rules. The state machine for rules parsing is here.
				945	// The state tables are hand-written in the file rbbirpt.txt,
				946	// and converted to the form used here by a perl
				947	// script rbbicst.pl
				948	//
				949	//------------------------------------------------------------------------------
				950	void RBBIRuleScanner::parse() {
				951	uint16_t state;
				952	const RBBIRuleTableEl *tableEl;
				953
				954	if (U_FAILURE(*fRB->fStatus)) {
				955	return;
				956	}
				957
				958	state = 1;
				959	nextChar(fC);
				960	//
				961	// Main loop for the rule parsing state machine.
				962	// Runs once per state transition.
				963	// Each time through optionally performs, depending on the state table,
				964	// - an advance to the the next input char
				965	// - an action to be performed.
				966	// - pushing or popping a state to/from the local state return stack.
				967	//
				968	for (;;) {
				969	// Bail out if anything has gone wrong.
				970	// RBBI rule file parsing stops on the first error encountered.
				971	if (U_FAILURE(*fRB->fStatus)) {
				972	break;
				973	}
				974
				975	// Quit if state == 0. This is the normal way to exit the state machine.
				976	//
				977	if (state == 0) {
				978	break;
				979	}
				980
				981	// Find the state table element that matches the input char from the rule, or the
				982	// class of the input character. Start with the first table row for this
				983	// state, then linearly scan forward until we find a row that matches the
				984	// character. The last row for each state always matches all characters, so
				985	// the search will stop there, if not before.
				986	//
				987	tableEl = &gRuleParseStateTable[state];
				988	#ifdef RBBI_DEBUG
				989	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
				990	RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ",
				991	fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
				992	}
				993	#endif
				994
				995	for (;;) {
				996	#ifdef RBBI_DEBUG
				997	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");}
				998	#endif
				999	if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) {
				1000	// Table row specified an individual character, not a set, and
				1001	// the input character is not escaped, and
				1002	// the input character matched it.
				1003	break;
				1004	}
				1005	if (tableEl->fCharClass == 255) {
				1006	// Table row specified default, match anything character class.
				1007	break;
				1008	}
				1009	if (tableEl->fCharClass == 254 && fC.fEscaped) {
				1010	// Table row specified "escaped" and the char was escaped.
				1011	break;
				1012	}
				1013	if (tableEl->fCharClass == 253 && fC.fEscaped &&
				1014	(fC.fChar == 0x50 \|\| fC.fChar == 0x70 )) {
				1015	// Table row specified "escaped P" and the char is either 'p' or 'P'.
				1016	break;
				1017	}
				1018	if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) {
				1019	// Table row specified eof and we hit eof on the input.
				1020	break;
				1021	}
				1022
				1023	if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
				1024	fC.fEscaped == FALSE && // char is not escaped &&
				1025	fC.fChar != (UChar32)-1) { // char is not EOF
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	1026	U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets));
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1027	if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
				1028	// Table row specified a character class, or set of characters,
				1029	// and the current char matches it.
				1030	break;
				1031	}
				1032	}
				1033
				1034	// No match on this row, advance to the next row for this state,
				1035	tableEl++;
				1036	}
				1037	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}
				1038
				1039	//
				1040	// We've found the row of the state table that matches the current input
				1041	// character from the rules string.
				1042	// Perform any action specified by this row in the state table.
				1043	if (doParseActions((int32_t)tableEl->fAction) == FALSE) {
				1044	// Break out of the state machine loop if the
				1045	// the action signalled some kind of error, or
				1046	// the action was to exit, occurs on normal end-of-rules-input.
				1047	break;
				1048	}
				1049
				1050	if (tableEl->fPushState != 0) {
				1051	fStackPtr++;
				1052	if (fStackPtr >= kStackSize) {
				1053	error(U_BRK_INTERNAL_ERROR);
				1054	RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
				1055	fStackPtr--;
				1056	}
				1057	fStack[fStackPtr] = tableEl->fPushState;
				1058	}
				1059
				1060	if (tableEl->fNextChar) {
				1061	nextChar(fC);
				1062	}
				1063
				1064	// Get the next state from the table entry, or from the
				1065	// state stack if the next state was specified as "pop".
				1066	if (tableEl->fNextState != 255) {
				1067	state = tableEl->fNextState;
				1068	} else {
				1069	state = fStack[fStackPtr];
				1070	fStackPtr--;
				1071	if (fStackPtr < 0) {
				1072	error(U_BRK_INTERNAL_ERROR);
				1073	RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
				1074	fStackPtr++;
				1075	}
				1076	}
				1077
				1078	}
				1079
				1080	//
				1081	// If there were NO user specified reverse rules, set up the equivalent of ".*;"
				1082	//
				1083	if (fRB->fReverseTree == NULL) {
				1084	fRB->fReverseTree = pushNewNode(RBBINode::opStar);
				1085	RBBINode *operand = pushNewNode(RBBINode::setRef);
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	1086	if (U_FAILURE(*fRB->fStatus)) {
				1087	return;
				1088	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1089	findSetFor(UnicodeString(TRUE, kAny, 3), operand);
				1090	fRB->fReverseTree->fLeftChild = operand;
				1091	operand->fParent = fRB->fReverseTree;
				1092	fNodeStackPtr -= 2;
				1093	}
				1094
				1095
				1096	//
				1097	// Parsing of the input RBBI rules is complete.
				1098	// We now have a parse tree for the rule expressions
				1099	// and a list of all UnicodeSets that are referenced.
				1100	//
				1101	#ifdef RBBI_DEBUG
				1102	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}
				1103	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree"))
				1104	{
				1105	RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
				1106	fRB->fForwardTree->printTree(TRUE);
				1107	RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
				1108	fRB->fReverseTree->printTree(TRUE);
				1109	RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
				1110	fRB->fSafeFwdTree->printTree(TRUE);
				1111	RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
				1112	fRB->fSafeRevTree->printTree(TRUE);
				1113	}
				1114	#endif
				1115	}
				1116
				1117
				1118	//------------------------------------------------------------------------------
				1119	//
				1120	// printNodeStack for debugging...
				1121	//
				1122	//------------------------------------------------------------------------------
				1123	#ifdef RBBI_DEBUG
				1124	void RBBIRuleScanner::printNodeStack(const char *title) {
				1125	int i;
				1126	RBBIDebugPrintf("%s. Dumping node stack...\n", title);
				1127	for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree(TRUE);}
				1128	}
				1129	#endif
				1130
				1131
				1132
				1133
				1134	//------------------------------------------------------------------------------
				1135	//
				1136	// pushNewNode create a new RBBINode of the specified type and push it
				1137	// onto the stack of nodes.
				1138	//
				1139	//------------------------------------------------------------------------------
				1140	RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) {
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	1141	if (U_FAILURE(*fRB->fStatus)) {
				1142	return NULL;
				1143	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1144	fNodeStackPtr++;
				1145	if (fNodeStackPtr >= kStackSize) {
				1146	error(U_BRK_INTERNAL_ERROR);
				1147	RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");
				1148	*fRB->fStatus = U_BRK_INTERNAL_ERROR;
				1149	return NULL;
				1150	}
				1151	fNodeStack[fNodeStackPtr] = new RBBINode(t);
				1152	if (fNodeStack[fNodeStackPtr] == NULL) {
				1153	*fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
				1154	}
				1155	return fNodeStack[fNodeStackPtr];
				1156	}
				1157
				1158
				1159
				1160	//------------------------------------------------------------------------------
				1161	//
				1162	// scanSet Construct a UnicodeSet from the text at the current scan
				1163	// position. Advance the scan position to the first character
				1164	// after the set.
				1165	//
				1166	// A new RBBI setref node referring to the set is pushed onto the node
				1167	// stack.
				1168	//
				1169	// The scan position is normally under the control of the state machine
				1170	// that controls rule parsing. UnicodeSets, however, are parsed by
				1171	// the UnicodeSet constructor, not by the RBBI rule parser.
				1172	//
				1173	//------------------------------------------------------------------------------
				1174	void RBBIRuleScanner::scanSet() {
				1175	UnicodeSet *uset;
				1176	ParsePosition pos;
				1177	int startPos;
				1178	int i;
				1179
				1180	if (U_FAILURE(*fRB->fStatus)) {
				1181	return;
				1182	}
				1183
				1184	pos.setIndex(fScanIndex);
				1185	startPos = fScanIndex;
				1186	UErrorCode localStatus = U_ZERO_ERROR;
				1187	uset = new UnicodeSet();
				1188	if (uset == NULL) {
				1189	localStatus = U_MEMORY_ALLOCATION_ERROR;
				1190	} else {
				1191	uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
				1192	}
				1193	if (U_FAILURE(localStatus)) {
				1194	// TODO: Get more accurate position of the error from UnicodeSet's return info.
				1195	// UnicodeSet appears to not be reporting correctly at this time.
				1196	#ifdef RBBI_DEBUG
				1197	RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
				1198	#endif
				1199	error(localStatus);
				1200	delete uset;
				1201	return;
				1202	}
				1203
				1204	// Verify that the set contains at least one code point.
				1205	//
				1206	U_ASSERT(uset!=NULL);
				1207	if (uset->isEmpty()) {
				1208	// This set is empty.
				1209	// Make it an error, because it almost certainly is not what the user wanted.
				1210	// Also, avoids having to think about corner cases in the tree manipulation code
				1211	// that occurs later on.
				1212	error(U_BRK_RULE_EMPTY_SET);
				1213	delete uset;
				1214	return;
				1215	}
				1216
				1217
				1218	// Advance the RBBI parse postion over the UnicodeSet pattern.
				1219	// Don't just set fScanIndex because the line/char positions maintained
				1220	// for error reporting would be thrown off.
				1221	i = pos.getIndex();
				1222	for (;;) {
				1223	if (fNextIndex >= i) {
				1224	break;
				1225	}
				1226	nextCharLL();
				1227	}
				1228
				1229	if (U_SUCCESS(*fRB->fStatus)) {
				1230	RBBINode *n;
				1231
				1232	n = pushNewNode(RBBINode::setRef);
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	1233	if (U_FAILURE(*fRB->fStatus)) {
				1234	return;
				1235	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1236	n->fFirstPos = startPos;
				1237	n->fLastPos = fNextIndex;
				1238	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
				1239	// findSetFor() serves several purposes here:
				1240	// - Adopts storage for the UnicodeSet, will be responsible for deleting.
				1241	// - Mantains collection of all sets in use, needed later for establishing
				1242	// character categories for run time engine.
				1243	// - Eliminates mulitiple instances of the same set.
				1244	// - Creates a new uset node if necessary (if this isn't a duplicate.)
				1245	findSetFor(n->fText, n, uset);
				1246	}
				1247
				1248	}
				1249
				1250	U_NAMESPACE_END
				1251
				1252	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */