Blame - source/i18n/collationruleparser.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 7fb95c0b2b597d422f13f52c3f9596d1be6833dc [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	3	/*
				4	*******************************************************************************
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	5	* Copyright (C) 2013-2015, International Business Machines
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	6	* Corporation and others. All Rights Reserved.
				7	*******************************************************************************
				8	* collationruleparser.cpp
				9	*
				10	* (replaced the former ucol_tok.cpp)
				11	*
				12	* created on: 2013apr10
				13	* created by: Markus W. Scherer
				14	*/
				15
				16	#include "unicode/utypes.h"
				17
				18	#if !UCONFIG_NO_COLLATION
				19
				20	#include "unicode/normalizer2.h"
				21	#include "unicode/parseerr.h"
				22	#include "unicode/uchar.h"
				23	#include "unicode/ucol.h"
				24	#include "unicode/uloc.h"
				25	#include "unicode/unistr.h"
				26	#include "unicode/utf16.h"
				27	#include "charstr.h"
				28	#include "cmemory.h"
				29	#include "collation.h"
				30	#include "collationdata.h"
				31	#include "collationruleparser.h"
				32	#include "collationsettings.h"
				33	#include "collationtailoring.h"
				34	#include "cstring.h"
				35	#include "patternprops.h"
				36	#include "uassert.h"
				37	#include "uvectr32.h"
				38
				39	U_NAMESPACE_BEGIN
				40
				41	namespace {
				42
				43	static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before"
				44	const int32_t BEFORE_LENGTH = 7;
				45
				46	} // namespace
				47
				48	CollationRuleParser::Sink::~Sink() {}
				49
				50	void
				51	CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
				52
				53	void
				54	CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
				55
				56	CollationRuleParser::Importer::~Importer() {}
				57
				58	CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
				59	: nfd(*Normalizer2::getNFDInstance(errorCode)),
				60	nfc(*Normalizer2::getNFCInstance(errorCode)),
				61	rules(NULL), baseData(base), settings(NULL),
				62	parseError(NULL), errorReason(NULL),
				63	sink(NULL), importer(NULL),
				64	ruleIndex(0) {
				65	}
				66
				67	CollationRuleParser::~CollationRuleParser() {
				68	}
				69
				70	void
				71	CollationRuleParser::parse(const UnicodeString &ruleString,
				72	CollationSettings &outSettings,
				73	UParseError *outParseError,
				74	UErrorCode &errorCode) {
				75	if(U_FAILURE(errorCode)) { return; }
				76	settings = &outSettings;
				77	parseError = outParseError;
				78	if(parseError != NULL) {
				79	parseError->line = 0;
				80	parseError->offset = -1;
				81	parseError->preContext[0] = 0;
				82	parseError->postContext[0] = 0;
				83	}
				84	errorReason = NULL;
				85	parse(ruleString, errorCode);
				86	}
				87
				88	void
				89	CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
				90	if(U_FAILURE(errorCode)) { return; }
				91	rules = &ruleString;
				92	ruleIndex = 0;
				93
				94	while(ruleIndex < rules->length()) {
				95	UChar c = rules->charAt(ruleIndex);
				96	if(PatternProps::isWhiteSpace(c)) {
				97	++ruleIndex;
				98	continue;
				99	}
				100	switch(c) {
				101	case 0x26: // '&'
				102	parseRuleChain(errorCode);
				103	break;
				104	case 0x5b: // '['
				105	parseSetting(errorCode);
				106	break;
				107	case 0x23: // '#' starts a comment, until the end of the line
				108	ruleIndex = skipComment(ruleIndex + 1);
				109	break;
				110	case 0x40: // '@' is equivalent to [backwards 2]
				111	settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
				112	UCOL_ON, 0, errorCode);
				113	++ruleIndex;
				114	break;
				115	case 0x21: // '!' used to turn on Thai/Lao character reversal
				116	// Accept but ignore. The root collator has contractions
				117	// that are equivalent to the character reversal, where appropriate.
				118	++ruleIndex;
				119	break;
				120	default:
				121	setParseError("expected a reset or setting or comment", errorCode);
				122	break;
				123	}
				124	if(U_FAILURE(errorCode)) { return; }
				125	}
				126	}
				127
				128	void
				129	CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
				130	int32_t resetStrength = parseResetAndPosition(errorCode);
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	131	UBool isFirstRelation = true;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	132	for(;;) {
				133	int32_t result = parseRelationOperator(errorCode);
				134	if(U_FAILURE(errorCode)) { return; }
				135	if(result < 0) {
				136	if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
				137	// '#' starts a comment, until the end of the line
				138	ruleIndex = skipComment(ruleIndex + 1);
				139	continue;
				140	}
				141	if(isFirstRelation) {
				142	setParseError("reset not followed by a relation", errorCode);
				143	}
				144	return;
				145	}
				146	int32_t strength = result & STRENGTH_MASK;
				147	if(resetStrength < UCOL_IDENTICAL) {
				148	// reset-before rule chain
				149	if(isFirstRelation) {
				150	if(strength != resetStrength) {
				151	setParseError("reset-before strength differs from its first relation", errorCode);
				152	return;
				153	}
				154	} else {
				155	if(strength < resetStrength) {
				156	setParseError("reset-before strength followed by a stronger relation", errorCode);
				157	return;
				158	}
				159	}
				160	}
				161	int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator
				162	if((result & STARRED_FLAG) == 0) {
				163	parseRelationStrings(strength, i, errorCode);
				164	} else {
				165	parseStarredCharacters(strength, i, errorCode);
				166	}
				167	if(U_FAILURE(errorCode)) { return; }
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	168	isFirstRelation = false;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	169	}
				170	}
				171
				172	int32_t
				173	CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
				174	if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
				175	int32_t i = skipWhiteSpace(ruleIndex + 1);
				176	int32_t j;
				177	UChar c;
				178	int32_t resetStrength;
				179	if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
				180	(j = i + BEFORE_LENGTH) < rules->length() &&
				181	PatternProps::isWhiteSpace(rules->charAt(j)) &&
				182	((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
				183	0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
				184	rules->charAt(j + 1) == 0x5d) {
				185	// &[before n] with n=1 or 2 or 3
				186	resetStrength = UCOL_PRIMARY + (c - 0x31);
				187	i = skipWhiteSpace(j + 2);
				188	} else {
				189	resetStrength = UCOL_IDENTICAL;
				190	}
				191	if(i >= rules->length()) {
				192	setParseError("reset without position", errorCode);
				193	return UCOL_DEFAULT;
				194	}
				195	UnicodeString str;
				196	if(rules->charAt(i) == 0x5b) { // '['
				197	i = parseSpecialPosition(i, str, errorCode);
				198	} else {
				199	i = parseTailoringString(i, str, errorCode);
				200	}
				201	sink->addReset(resetStrength, str, errorReason, errorCode);
				202	if(U_FAILURE(errorCode)) { setErrorContext(); }
				203	ruleIndex = i;
				204	return resetStrength;
				205	}
				206
				207	int32_t
				208	CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
				209	if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
				210	ruleIndex = skipWhiteSpace(ruleIndex);
				211	if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
				212	int32_t strength;
				213	int32_t i = ruleIndex;
				214	UChar c = rules->charAt(i++);
				215	switch(c) {
				216	case 0x3c: // '<'
				217	if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<
				218	++i;
				219	if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<
				220	++i;
				221	if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<<
				222	++i;
				223	strength = UCOL_QUATERNARY;
				224	} else {
				225	strength = UCOL_TERTIARY;
				226	}
				227	} else {
				228	strength = UCOL_SECONDARY;
				229	}
				230	} else {
				231	strength = UCOL_PRIMARY;
				232	}
				233	if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
				234	++i;
				235	strength \|= STARRED_FLAG;
				236	}
				237	break;
				238	case 0x3b: // ';' same as <<
				239	strength = UCOL_SECONDARY;
				240	break;
				241	case 0x2c: // ',' same as <<<
				242	strength = UCOL_TERTIARY;
				243	break;
				244	case 0x3d: // '='
				245	strength = UCOL_IDENTICAL;
				246	if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
				247	++i;
				248	strength \|= STARRED_FLAG;
				249	}
				250	break;
				251	default:
				252	return UCOL_DEFAULT;
				253	}
				254	return ((i - ruleIndex) << OFFSET_SHIFT) \| strength;
				255	}
				256
				257	void
				258	CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
				259	// Parse
				260	// prefix \| str / extension
				261	// where prefix and extension are optional.
				262	UnicodeString prefix, str, extension;
				263	i = parseTailoringString(i, str, errorCode);
				264	if(U_FAILURE(errorCode)) { return; }
				265	UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
				266	if(next == 0x7c) { // '\|' separates the context prefix from the string.
				267	prefix = str;
				268	i = parseTailoringString(i + 1, str, errorCode);
				269	if(U_FAILURE(errorCode)) { return; }
				270	next = (i < rules->length()) ? rules->charAt(i) : 0;
				271	}
				272	if(next == 0x2f) { // '/' separates the string from the extension.
				273	i = parseTailoringString(i + 1, extension, errorCode);
				274	}
				275	if(!prefix.isEmpty()) {
				276	UChar32 prefix0 = prefix.char32At(0);
				277	UChar32 c = str.char32At(0);
				278	if(!nfc.hasBoundaryBefore(prefix0) \|\| !nfc.hasBoundaryBefore(c)) {
				279	setParseError("in 'prefix\|str', prefix and str must each start with an NFC boundary",
				280	errorCode);
				281	return;
				282	}
				283	}
				284	sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
				285	if(U_FAILURE(errorCode)) { setErrorContext(); }
				286	ruleIndex = i;
				287	}
				288
				289	void
				290	CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
				291	UnicodeString empty, raw;
				292	i = parseString(skipWhiteSpace(i), raw, errorCode);
				293	if(U_FAILURE(errorCode)) { return; }
				294	if(raw.isEmpty()) {
				295	setParseError("missing starred-relation string", errorCode);
				296	return;
				297	}
				298	UChar32 prev = -1;
				299	int32_t j = 0;
				300	for(;;) {
				301	while(j < raw.length()) {
				302	UChar32 c = raw.char32At(j);
				303	if(!nfd.isInert(c)) {
				304	setParseError("starred-relation string is not all NFD-inert", errorCode);
				305	return;
				306	}
				307	sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
				308	if(U_FAILURE(errorCode)) {
				309	setErrorContext();
				310	return;
				311	}
				312	j += U16_LENGTH(c);
				313	prev = c;
				314	}
				315	if(i >= rules->length() \|\| rules->charAt(i) != 0x2d) { // '-'
				316	break;
				317	}
				318	if(prev < 0) {
				319	setParseError("range without start in starred-relation string", errorCode);
				320	return;
				321	}
				322	i = parseString(i + 1, raw, errorCode);
				323	if(U_FAILURE(errorCode)) { return; }
				324	if(raw.isEmpty()) {
				325	setParseError("range without end in starred-relation string", errorCode);
				326	return;
				327	}
				328	UChar32 c = raw.char32At(0);
				329	if(c < prev) {
				330	setParseError("range start greater than end in starred-relation string", errorCode);
				331	return;
				332	}
				333	// range prev-c
				334	UnicodeString s;
				335	while(++prev <= c) {
				336	if(!nfd.isInert(prev)) {
				337	setParseError("starred-relation string range is not all NFD-inert", errorCode);
				338	return;
				339	}
				340	if(U_IS_SURROGATE(prev)) {
				341	setParseError("starred-relation string range contains a surrogate", errorCode);
				342	return;
				343	}
				344	if(0xfffd <= prev && prev <= 0xffff) {
				345	setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
				346	return;
				347	}
				348	s.setTo(prev);
				349	sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
				350	if(U_FAILURE(errorCode)) {
				351	setErrorContext();
				352	return;
				353	}
				354	}
				355	prev = -1;
				356	j = U16_LENGTH(c);
				357	}
				358	ruleIndex = skipWhiteSpace(i);
				359	}
				360
				361	int32_t
				362	CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
				363	i = parseString(skipWhiteSpace(i), raw, errorCode);
				364	if(U_SUCCESS(errorCode) && raw.isEmpty()) {
				365	setParseError("missing relation string", errorCode);
				366	}
				367	return skipWhiteSpace(i);
				368	}
				369
				370	int32_t
				371	CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
				372	if(U_FAILURE(errorCode)) { return i; }
				373	raw.remove();
				374	while(i < rules->length()) {
				375	UChar32 c = rules->charAt(i++);
				376	if(isSyntaxChar(c)) {
				377	if(c == 0x27) { // apostrophe
				378	if(i < rules->length() && rules->charAt(i) == 0x27) {
				379	// Double apostrophe, encodes a single one.
				380	raw.append((UChar)0x27);
				381	++i;
				382	continue;
				383	}
				384	// Quote literal text until the next single apostrophe.
				385	for(;;) {
				386	if(i == rules->length()) {
				387	setParseError("quoted literal text missing terminating apostrophe", errorCode);
				388	return i;
				389	}
				390	c = rules->charAt(i++);
				391	if(c == 0x27) {
				392	if(i < rules->length() && rules->charAt(i) == 0x27) {
				393	// Double apostrophe inside quoted literal text,
				394	// still encodes a single apostrophe.
				395	++i;
				396	} else {
				397	break;
				398	}
				399	}
				400	raw.append((UChar)c);
				401	}
				402	} else if(c == 0x5c) { // backslash
				403	if(i == rules->length()) {
				404	setParseError("backslash escape at the end of the rule string", errorCode);
				405	return i;
				406	}
				407	c = rules->char32At(i);
				408	raw.append(c);
				409	i += U16_LENGTH(c);
				410	} else {
				411	// Any other syntax character terminates a string.
				412	--i;
				413	break;
				414	}
				415	} else if(PatternProps::isWhiteSpace(c)) {
				416	// Unquoted white space terminates a string.
				417	--i;
				418	break;
				419	} else {
				420	raw.append((UChar)c);
				421	}
				422	}
				423	for(int32_t j = 0; j < raw.length();) {
				424	UChar32 c = raw.char32At(j);
				425	if(U_IS_SURROGATE(c)) {
				426	setParseError("string contains an unpaired surrogate", errorCode);
				427	return i;
				428	}
				429	if(0xfffd <= c && c <= 0xffff) {
				430	setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
				431	return i;
				432	}
				433	j += U16_LENGTH(c);
				434	}
				435	return i;
				436	}
				437
				438	namespace {
				439
				440	static const char *const positions[] = {
				441	"first tertiary ignorable",
				442	"last tertiary ignorable",
				443	"first secondary ignorable",
				444	"last secondary ignorable",
				445	"first primary ignorable",
				446	"last primary ignorable",
				447	"first variable",
				448	"last variable",
				449	"first regular",
				450	"last regular",
				451	"first implicit",
				452	"last implicit",
				453	"first trailing",
				454	"last trailing"
				455	};
				456
				457	} // namespace
				458
				459	int32_t
				460	CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
				461	if(U_FAILURE(errorCode)) { return 0; }
				462	UnicodeString raw;
				463	int32_t j = readWords(i + 1, raw);
				464	if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with ]
				465	++j;
				466	for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
				467	if(raw == UnicodeString(positions[pos], -1, US_INV)) {
				468	str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
				469	return j;
				470	}
				471	}
				472	if(raw == UNICODE_STRING_SIMPLE("top")) {
				473	str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
				474	return j;
				475	}
				476	if(raw == UNICODE_STRING_SIMPLE("variable top")) {
				477	str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
				478	return j;
				479	}
				480	}
				481	setParseError("not a valid special reset position", errorCode);
				482	return i;
				483	}
				484
				485	void
				486	CollationRuleParser::parseSetting(UErrorCode &errorCode) {
				487	if(U_FAILURE(errorCode)) { return; }
				488	UnicodeString raw;
				489	int32_t i = ruleIndex + 1;
				490	int32_t j = readWords(i, raw);
				491	if(j <= i \|\| raw.isEmpty()) {
				492	setParseError("expected a setting/option at '['", errorCode);
				493	}
				494	if(rules->charAt(j) == 0x5d) { // words end with ]
				495	++j;
				496	if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
				497	(raw.length() == 7 \|\| raw.charAt(7) == 0x20)) {
				498	parseReordering(raw, errorCode);
				499	ruleIndex = j;
				500	return;
				501	}
				502	if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
				503	settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
				504	UCOL_ON, 0, errorCode);
				505	ruleIndex = j;
				506	return;
				507	}
				508	UnicodeString v;
				509	int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
				510	if(valueIndex >= 0) {
				511	v.setTo(raw, valueIndex + 1);
				512	raw.truncate(valueIndex);
				513	}
				514	if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
				515	int32_t value = UCOL_DEFAULT;
				516	UChar c = v.charAt(0);
				517	if(0x31 <= c && c <= 0x34) { // 1..4
				518	value = UCOL_PRIMARY + (c - 0x31);
				519	} else if(c == 0x49) { // 'I'
				520	value = UCOL_IDENTICAL;
				521	}
				522	if(value != UCOL_DEFAULT) {
				523	settings->setStrength(value, 0, errorCode);
				524	ruleIndex = j;
				525	return;
				526	}
				527	} else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
				528	UColAttributeValue value = UCOL_DEFAULT;
				529	if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
				530	value = UCOL_NON_IGNORABLE;
				531	} else if(v == UNICODE_STRING_SIMPLE("shifted")) {
				532	value = UCOL_SHIFTED;
				533	}
				534	if(value != UCOL_DEFAULT) {
				535	settings->setAlternateHandling(value, 0, errorCode);
				536	ruleIndex = j;
				537	return;
				538	}
				539	} else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
				540	int32_t value = UCOL_DEFAULT;
				541	if(v == UNICODE_STRING_SIMPLE("space")) {
				542	value = CollationSettings::MAX_VAR_SPACE;
				543	} else if(v == UNICODE_STRING_SIMPLE("punct")) {
				544	value = CollationSettings::MAX_VAR_PUNCT;
				545	} else if(v == UNICODE_STRING_SIMPLE("symbol")) {
				546	value = CollationSettings::MAX_VAR_SYMBOL;
				547	} else if(v == UNICODE_STRING_SIMPLE("currency")) {
				548	value = CollationSettings::MAX_VAR_CURRENCY;
				549	}
				550	if(value != UCOL_DEFAULT) {
				551	settings->setMaxVariable(value, 0, errorCode);
				552	settings->variableTop = baseData->getLastPrimaryForGroup(
				553	UCOL_REORDER_CODE_FIRST + value);
				554	U_ASSERT(settings->variableTop != 0);
				555	ruleIndex = j;
				556	return;
				557	}
				558	} else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
				559	UColAttributeValue value = UCOL_DEFAULT;
				560	if(v == UNICODE_STRING_SIMPLE("off")) {
				561	value = UCOL_OFF;
				562	} else if(v == UNICODE_STRING_SIMPLE("lower")) {
				563	value = UCOL_LOWER_FIRST;
				564	} else if(v == UNICODE_STRING_SIMPLE("upper")) {
				565	value = UCOL_UPPER_FIRST;
				566	}
				567	if(value != UCOL_DEFAULT) {
				568	settings->setCaseFirst(value, 0, errorCode);
				569	ruleIndex = j;
				570	return;
				571	}
				572	} else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
				573	UColAttributeValue value = getOnOffValue(v);
				574	if(value != UCOL_DEFAULT) {
				575	settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
				576	ruleIndex = j;
				577	return;
				578	}
				579	} else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
				580	UColAttributeValue value = getOnOffValue(v);
				581	if(value != UCOL_DEFAULT) {
				582	settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
				583	ruleIndex = j;
				584	return;
				585	}
				586	} else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
				587	UColAttributeValue value = getOnOffValue(v);
				588	if(value != UCOL_DEFAULT) {
				589	settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
				590	ruleIndex = j;
				591	return;
				592	}
				593	} else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
				594	UColAttributeValue value = getOnOffValue(v);
				595	if(value != UCOL_DEFAULT) {
				596	if(value == UCOL_ON) {
				597	setParseError("[hiraganaQ on] is not supported", errorCode);
				598	}
				599	ruleIndex = j;
				600	return;
				601	}
				602	} else if(raw == UNICODE_STRING_SIMPLE("import")) {
				603	CharString lang;
				604	lang.appendInvariantChars(v, errorCode);
				605	if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
				606	// BCP 47 language tag -> ICU locale ID
				607	char localeID[ULOC_FULLNAME_CAPACITY];
				608	int32_t parsedLength;
				609	int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
				610	&parsedLength, &errorCode);
				611	if(U_FAILURE(errorCode) \|\|
				612	parsedLength != lang.length() \|\| length >= ULOC_FULLNAME_CAPACITY) {
				613	errorCode = U_ZERO_ERROR;
				614	setParseError("expected language tag in [import langTag]", errorCode);
				615	return;
				616	}
				617	// localeID minus all keywords
				618	char baseID[ULOC_FULLNAME_CAPACITY];
				619	length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
				620	if(U_FAILURE(errorCode) \|\| length >= ULOC_KEYWORDS_CAPACITY) {
				621	errorCode = U_ZERO_ERROR;
				622	setParseError("expected language tag in [import langTag]", errorCode);
				623	return;
				624	}
Frank Tang	69c72a6	2019-04-03 21:41:21 -0700	[diff] [blame]	625	if(length == 0) {
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	626	uprv_strcpy(baseID, "root");
Frank Tang	69c72a6	2019-04-03 21:41:21 -0700	[diff] [blame]	627	} else if(*baseID == '_') {
				628	uprv_memmove(baseID + 3, baseID, length + 1);
				629	uprv_memcpy(baseID, "und", 3);
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	630	}
				631	// @collation=type, or length=0 if not specified
				632	char collationType[ULOC_KEYWORDS_CAPACITY];
				633	length = uloc_getKeywordValue(localeID, "collation",
				634	collationType, ULOC_KEYWORDS_CAPACITY,
				635	&errorCode);
				636	if(U_FAILURE(errorCode) \|\| length >= ULOC_KEYWORDS_CAPACITY) {
				637	errorCode = U_ZERO_ERROR;
				638	setParseError("expected language tag in [import langTag]", errorCode);
				639	return;
				640	}
				641	if(importer == NULL) {
				642	setParseError("[import langTag] is not supported", errorCode);
				643	} else {
				644	UnicodeString importedRules;
				645	importer->getRules(baseID, length > 0 ? collationType : "standard",
				646	importedRules, errorReason, errorCode);
				647	if(U_FAILURE(errorCode)) {
				648	if(errorReason == NULL) {
				649	errorReason = "[import langTag] failed";
				650	}
				651	setErrorContext();
				652	return;
				653	}
				654	const UnicodeString *outerRules = rules;
				655	int32_t outerRuleIndex = ruleIndex;
				656	parse(importedRules, errorCode);
				657	if(U_FAILURE(errorCode)) {
				658	if(parseError != NULL) {
				659	parseError->offset = outerRuleIndex;
				660	}
				661	}
				662	rules = outerRules;
				663	ruleIndex = j;
				664	}
				665	return;
				666	}
				667	} else if(rules->charAt(j) == 0x5b) { // words end with [
				668	UnicodeSet set;
				669	j = parseUnicodeSet(j, set, errorCode);
				670	if(U_FAILURE(errorCode)) { return; }
				671	if(raw == UNICODE_STRING_SIMPLE("optimize")) {
				672	sink->optimize(set, errorReason, errorCode);
				673	if(U_FAILURE(errorCode)) { setErrorContext(); }
				674	ruleIndex = j;
				675	return;
				676	} else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
				677	sink->suppressContractions(set, errorReason, errorCode);
				678	if(U_FAILURE(errorCode)) { setErrorContext(); }
				679	ruleIndex = j;
				680	return;
				681	}
				682	}
				683	setParseError("not a valid setting/option", errorCode);
				684	}
				685
				686	void
				687	CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
				688	if(U_FAILURE(errorCode)) { return; }
				689	int32_t i = 7; // after "reorder"
				690	if(i == raw.length()) {
				691	// empty [reorder] with no codes
				692	settings->resetReordering();
				693	return;
				694	}
				695	// Parse the codes in [reorder aa bb cc].
				696	UVector32 reorderCodes(errorCode);
				697	if(U_FAILURE(errorCode)) { return; }
				698	CharString word;
				699	while(i < raw.length()) {
				700	++i; // skip the word-separating space
				701	int32_t limit = raw.indexOf((UChar)0x20, i);
				702	if(limit < 0) { limit = raw.length(); }
				703	word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
				704	if(U_FAILURE(errorCode)) { return; }
				705	int32_t code = getReorderCode(word.data());
				706	if(code < 0) {
				707	setParseError("unknown script or reorder code", errorCode);
				708	return;
				709	}
				710	reorderCodes.addElement(code, errorCode);
				711	if(U_FAILURE(errorCode)) { return; }
				712	i = limit;
				713	}
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	714	settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	715	}
				716
				717	static const char *const gSpecialReorderCodes[] = {
				718	"space", "punct", "symbol", "currency", "digit"
				719	};
				720
				721	int32_t
				722	CollationRuleParser::getReorderCode(const char *word) {
				723	for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
				724	if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
				725	return UCOL_REORDER_CODE_FIRST + i;
				726	}
				727	}
				728	int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
				729	if(script >= 0) {
				730	return script;
				731	}
				732	if(uprv_stricmp(word, "others") == 0) {
				733	return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN
				734	}
				735	return -1;
				736	}
				737
				738	UColAttributeValue
				739	CollationRuleParser::getOnOffValue(const UnicodeString &s) {
				740	if(s == UNICODE_STRING_SIMPLE("on")) {
				741	return UCOL_ON;
				742	} else if(s == UNICODE_STRING_SIMPLE("off")) {
				743	return UCOL_OFF;
				744	} else {
				745	return UCOL_DEFAULT;
				746	}
				747	}
				748
				749	int32_t
				750	CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
				751	// Collect a UnicodeSet pattern between a balanced pair of [brackets].
				752	int32_t level = 0;
				753	int32_t j = i;
				754	for(;;) {
				755	if(j == rules->length()) {
				756	setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
				757	return j;
				758	}
				759	UChar c = rules->charAt(j++);
				760	if(c == 0x5b) { // '['
				761	++level;
				762	} else if(c == 0x5d) { // ']'
				763	if(--level == 0) { break; }
				764	}
				765	}
				766	set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
				767	if(U_FAILURE(errorCode)) {
				768	errorCode = U_ZERO_ERROR;
				769	setParseError("not a valid UnicodeSet pattern", errorCode);
				770	return j;
				771	}
				772	j = skipWhiteSpace(j);
				773	if(j == rules->length() \|\| rules->charAt(j) != 0x5d) {
				774	setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
				775	return j;
				776	}
				777	return ++j;
				778	}
				779
				780	int32_t
				781	CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
				782	static const UChar sp = 0x20;
				783	raw.remove();
				784	i = skipWhiteSpace(i);
				785	for(;;) {
				786	if(i >= rules->length()) { return 0; }
				787	UChar c = rules->charAt(i);
				788	if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_
				789	if(raw.isEmpty()) { return i; }
				790	if(raw.endsWith(&sp, 1)) { // remove trailing space
				791	raw.truncate(raw.length() - 1);
				792	}
				793	return i;
				794	}
				795	if(PatternProps::isWhiteSpace(c)) {
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	796	raw.append(sp);
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	797	i = skipWhiteSpace(i + 1);
				798	} else {
				799	raw.append(c);
				800	++i;
				801	}
				802	}
				803	}
				804
				805	int32_t
				806	CollationRuleParser::skipComment(int32_t i) const {
				807	// skip to past the newline
				808	while(i < rules->length()) {
				809	UChar c = rules->charAt(i++);
				810	// LF or FF or CR or NEL or LS or PS
				811	if(c == 0xa \|\| c == 0xc \|\| c == 0xd \|\| c == 0x85 \|\| c == 0x2028 \|\| c == 0x2029) {
				812	// Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
				813	// NLF (new line function) = CR or LF or CR+LF or NEL.
				814	// No need to collect all of CR+LF because a following LF will be ignored anyway.
				815	break;
				816	}
				817	}
				818	return i;
				819	}
				820
				821	void
				822	CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
				823	if(U_FAILURE(errorCode)) { return; }
				824	// Error code consistent with the old parser (from ca. 2001),
				825	// rather than U_PARSE_ERROR;
				826	errorCode = U_INVALID_FORMAT_ERROR;
				827	errorReason = reason;
				828	if(parseError != NULL) { setErrorContext(); }
				829	}
				830
				831	void
				832	CollationRuleParser::setErrorContext() {
				833	if(parseError == NULL) { return; }
				834
				835	// Note: This relies on the calling code maintaining the ruleIndex
				836	// at a position that is useful for debugging.
				837	// For example, at the beginning of a reset or relation etc.
				838	parseError->offset = ruleIndex;
				839	parseError->line = 0; // We are not counting line numbers.
				840
				841	// before ruleIndex
				842	int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
				843	if(start < 0) {
				844	start = 0;
				845	} else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
				846	++start;
				847	}
				848	int32_t length = ruleIndex - start;
				849	rules->extract(start, length, parseError->preContext);
				850	parseError->preContext[length] = 0;
				851
				852	// starting from ruleIndex
				853	length = rules->length() - ruleIndex;
				854	if(length >= U_PARSE_CONTEXT_LEN) {
				855	length = U_PARSE_CONTEXT_LEN - 1;
				856	if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
				857	--length;
				858	}
				859	}
				860	rules->extract(ruleIndex, length, parseError->postContext);
				861	parseError->postContext[length] = 0;
				862	}
				863
				864	UBool
				865	CollationRuleParser::isSyntaxChar(UChar32 c) {
				866	return 0x21 <= c && c <= 0x7e &&
				867	(c <= 0x2f \|\| (0x3a <= c && c <= 0x40) \|\|
				868	(0x5b <= c && c <= 0x60) \|\| (0x7b <= c));
				869	}
				870
				871	int32_t
				872	CollationRuleParser::skipWhiteSpace(int32_t i) const {
				873	while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
				874	++i;
				875	}
				876	return i;
				877	}
				878
				879	U_NAMESPACE_END
				880
				881	#endif // !UCONFIG_NO_COLLATION