Blame - source/i18n/collationiterator.cpp - chromium.googlesource.com/chromium/deps/icu

blob: a47b3d86bea99a46e5b36d08444412c8ff9d0395 [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	3	/*
				4	*******************************************************************************
				5	* Copyright (C) 2010-2014, International Business Machines
				6	* Corporation and others. All Rights Reserved.
				7	*******************************************************************************
				8	* collationiterator.cpp
				9	*
				10	* created on: 2010oct27
				11	* created by: Markus W. Scherer
				12	*/
				13
				14	#include "utypeinfo.h" // for 'typeid' to work
				15
				16	#include "unicode/utypes.h"
				17
				18	#if !UCONFIG_NO_COLLATION
				19
				20	#include "unicode/ucharstrie.h"
				21	#include "unicode/ustringtrie.h"
				22	#include "charstr.h"
				23	#include "cmemory.h"
				24	#include "collation.h"
				25	#include "collationdata.h"
				26	#include "collationfcd.h"
				27	#include "collationiterator.h"
				28	#include "normalizer2impl.h"
				29	#include "uassert.h"
				30	#include "uvectr32.h"
				31
				32	U_NAMESPACE_BEGIN
				33
				34	CollationIterator::CEBuffer::~CEBuffer() {}
				35
				36	UBool
				37	CollationIterator::CEBuffer::ensureAppendCapacity(int32_t appCap, UErrorCode &errorCode) {
				38	int32_t capacity = buffer.getCapacity();
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	39	if((length + appCap) <= capacity) { return true; }
				40	if(U_FAILURE(errorCode)) { return false; }
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	41	do {
				42	if(capacity < 1000) {
				43	capacity *= 4;
				44	} else {
				45	capacity *= 2;
				46	}
				47	} while(capacity < (length + appCap));
				48	int64_t *p = buffer.resize(capacity, length);
				49	if(p == NULL) {
				50	errorCode = U_MEMORY_ALLOCATION_ERROR;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	51	return false;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	52	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	53	return true;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	54	}
				55
				56	// State of combining marks skipped in discontiguous contraction.
				57	// We create a state object on first use and keep it around deactivated between uses.
				58	class SkippedState : public UMemory {
				59	public:
				60	// Born active but empty.
				61	SkippedState() : pos(0), skipLengthAtMatch(0) {}
				62	void clear() {
				63	oldBuffer.remove();
				64	pos = 0;
				65	// The newBuffer is reset by setFirstSkipped().
				66	}
				67
				68	UBool isEmpty() const { return oldBuffer.isEmpty(); }
				69
				70	UBool hasNext() const { return pos < oldBuffer.length(); }
				71
				72	// Requires hasNext().
				73	UChar32 next() {
				74	UChar32 c = oldBuffer.char32At(pos);
				75	pos += U16_LENGTH(c);
				76	return c;
				77	}
				78
				79	// Accounts for one more input code point read beyond the end of the marks buffer.
				80	void incBeyond() {
				81	U_ASSERT(!hasNext());
				82	++pos;
				83	}
				84
				85	// Goes backward through the skipped-marks buffer.
				86	// Returns the number of code points read beyond the skipped marks
				87	// that need to be backtracked through normal input.
				88	int32_t backwardNumCodePoints(int32_t n) {
				89	int32_t length = oldBuffer.length();
				90	int32_t beyond = pos - length;
				91	if(beyond > 0) {
				92	if(beyond >= n) {
				93	// Not back far enough to re-enter the oldBuffer.
				94	pos -= n;
				95	return n;
				96	} else {
				97	// Back out all beyond-oldBuffer code points and re-enter the buffer.
				98	pos = oldBuffer.moveIndex32(length, beyond - n);
				99	return beyond;
				100	}
				101	} else {
				102	// Go backwards from inside the oldBuffer.
				103	pos = oldBuffer.moveIndex32(pos, -n);
				104	return 0;
				105	}
				106	}
				107
				108	void setFirstSkipped(UChar32 c) {
				109	skipLengthAtMatch = 0;
				110	newBuffer.setTo(c);
				111	}
				112
				113	void skip(UChar32 c) {
				114	newBuffer.append(c);
				115	}
				116
				117	void recordMatch() { skipLengthAtMatch = newBuffer.length(); }
				118
				119	// Replaces the characters we consumed with the newly skipped ones.
				120	void replaceMatch() {
				121	// Note: UnicodeString.replace() pins pos to at most length().
				122	oldBuffer.replace(0, pos, newBuffer, 0, skipLengthAtMatch);
				123	pos = 0;
				124	}
				125
				126	void saveTrieState(const UCharsTrie &trie) { trie.saveState(state); }
				127	void resetToTrieState(UCharsTrie &trie) const { trie.resetToState(state); }
				128
				129	private:
				130	// Combining marks skipped in previous discontiguous-contraction matching.
				131	// After that discontiguous contraction was completed, we start reading them from here.
				132	UnicodeString oldBuffer;
				133	// Combining marks newly skipped in current discontiguous-contraction matching.
				134	// These might have been read from the normal text or from the oldBuffer.
				135	UnicodeString newBuffer;
				136	// Reading index in oldBuffer,
				137	// or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()).
				138	int32_t pos;
				139	// newBuffer.length() at the time of the last matching character.
				140	// When a partial match fails, we back out skipped and partial-matching input characters.
				141	int32_t skipLengthAtMatch;
				142	// We save the trie state before we attempt to match a character,
				143	// so that we can skip it and try the next one.
				144	UCharsTrie::State state;
				145	};
				146
				147	CollationIterator::CollationIterator(const CollationIterator &other)
				148	: UObject(other),
				149	trie(other.trie),
				150	data(other.data),
				151	cesIndex(other.cesIndex),
				152	skipped(NULL),
				153	numCpFwd(other.numCpFwd),
				154	isNumeric(other.isNumeric) {
				155	UErrorCode errorCode = U_ZERO_ERROR;
				156	int32_t length = other.ceBuffer.length;
				157	if(length > 0 && ceBuffer.ensureAppendCapacity(length, errorCode)) {
				158	for(int32_t i = 0; i < length; ++i) {
				159	ceBuffer.set(i, other.ceBuffer.get(i));
				160	}
				161	ceBuffer.length = length;
				162	} else {
				163	cesIndex = 0;
				164	}
				165	}
				166
				167	CollationIterator::~CollationIterator() {
				168	delete skipped;
				169	}
				170
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	171	bool
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	172	CollationIterator::operator==(const CollationIterator &other) const {
				173	// Subclasses: Call this method and then add more specific checks.
				174	// Compare the iterator state but not the collation data (trie & data fields):
				175	// Assume that the caller compares the data.
				176	// Ignore skipped since that should be unused between calls to nextCE().
				177	// (It only stays around to avoid another memory allocation.)
				178	if(!(typeid(*this) == typeid(other) &&
				179	ceBuffer.length == other.ceBuffer.length &&
				180	cesIndex == other.cesIndex &&
				181	numCpFwd == other.numCpFwd &&
				182	isNumeric == other.isNumeric)) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	183	return false;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	184	}
				185	for(int32_t i = 0; i < ceBuffer.length; ++i) {
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	186	if(ceBuffer.get(i) != other.ceBuffer.get(i)) { return false; }
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	187	}
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	188	return true;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	189	}
				190
				191	void
				192	CollationIterator::reset() {
				193	cesIndex = ceBuffer.length = 0;
				194	if(skipped != NULL) { skipped->clear(); }
				195	}
				196
				197	int32_t
				198	CollationIterator::fetchCEs(UErrorCode &errorCode) {
				199	while(U_SUCCESS(errorCode) && nextCE(errorCode) != Collation::NO_CE) {
				200	// No need to loop for each expansion CE.
				201	cesIndex = ceBuffer.length;
				202	}
				203	return ceBuffer.length;
				204	}
				205
				206	uint32_t
				207	CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
				208	c = nextCodePoint(errorCode);
				209	return (c < 0) ? Collation::FALLBACK_CE32 : data->getCE32(c);
				210	}
				211
				212	UChar
				213	CollationIterator::handleGetTrailSurrogate() {
				214	return 0;
				215	}
				216
				217	UBool
				218	CollationIterator::foundNULTerminator() {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	219	return false;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	220	}
				221
				222	UBool
				223	CollationIterator::forbidSurrogateCodePoints() const {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	224	return false;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	225	}
				226
				227	uint32_t
				228	CollationIterator::getDataCE32(UChar32 c) const {
				229	return data->getCE32(c);
				230	}
				231
				232	uint32_t
				233	CollationIterator::getCE32FromBuilderData(uint32_t /ce32/, UErrorCode &errorCode) {
				234	if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }
				235	return 0;
				236	}
				237
				238	int64_t
				239	CollationIterator::nextCEFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,
				240	UErrorCode &errorCode) {
				241	--ceBuffer.length; // Undo ceBuffer.incLength().
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	242	appendCEsFromCE32(d, c, ce32, true, errorCode);
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	243	if(U_SUCCESS(errorCode)) {
				244	return ceBuffer.get(cesIndex++);
				245	} else {
				246	return Collation::NO_CE_PRIMARY;
				247	}
				248	}
				249
				250	void
				251	CollationIterator::appendCEsFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,
				252	UBool forward, UErrorCode &errorCode) {
				253	while(Collation::isSpecialCE32(ce32)) {
				254	switch(Collation::tagFromCE32(ce32)) {
				255	case Collation::FALLBACK_TAG:
				256	case Collation::RESERVED_TAG_3:
				257	if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }
				258	return;
				259	case Collation::LONG_PRIMARY_TAG:
				260	ceBuffer.append(Collation::ceFromLongPrimaryCE32(ce32), errorCode);
				261	return;
				262	case Collation::LONG_SECONDARY_TAG:
				263	ceBuffer.append(Collation::ceFromLongSecondaryCE32(ce32), errorCode);
				264	return;
				265	case Collation::LATIN_EXPANSION_TAG:
				266	if(ceBuffer.ensureAppendCapacity(2, errorCode)) {
				267	ceBuffer.set(ceBuffer.length, Collation::latinCE0FromCE32(ce32));
				268	ceBuffer.set(ceBuffer.length + 1, Collation::latinCE1FromCE32(ce32));
				269	ceBuffer.length += 2;
				270	}
				271	return;
				272	case Collation::EXPANSION32_TAG: {
				273	const uint32_t *ce32s = d->ce32s + Collation::indexFromCE32(ce32);
				274	int32_t length = Collation::lengthFromCE32(ce32);
				275	if(ceBuffer.ensureAppendCapacity(length, errorCode)) {
				276	do {
				277	ceBuffer.appendUnsafe(Collation::ceFromCE32(*ce32s++));
				278	} while(--length > 0);
				279	}
				280	return;
				281	}
				282	case Collation::EXPANSION_TAG: {
				283	const int64_t *ces = d->ces + Collation::indexFromCE32(ce32);
				284	int32_t length = Collation::lengthFromCE32(ce32);
				285	if(ceBuffer.ensureAppendCapacity(length, errorCode)) {
				286	do {
				287	ceBuffer.appendUnsafe(*ces++);
				288	} while(--length > 0);
				289	}
				290	return;
				291	}
				292	case Collation::BUILDER_DATA_TAG:
				293	ce32 = getCE32FromBuilderData(ce32, errorCode);
				294	if(U_FAILURE(errorCode)) { return; }
				295	if(ce32 == Collation::FALLBACK_CE32) {
				296	d = data->base;
				297	ce32 = d->getCE32(c);
				298	}
				299	break;
				300	case Collation::PREFIX_TAG:
				301	if(forward) { backwardNumCodePoints(1, errorCode); }
				302	ce32 = getCE32FromPrefix(d, ce32, errorCode);
				303	if(forward) { forwardNumCodePoints(1, errorCode); }
				304	break;
				305	case Collation::CONTRACTION_TAG: {
				306	const UChar *p = d->contexts + Collation::indexFromCE32(ce32);
				307	uint32_t defaultCE32 = CollationData::readCE32(p); // Default if no suffix match.
				308	if(!forward) {
				309	// Backward contractions are handled by previousCEUnsafe().
				310	// c has contractions but they were not found.
				311	ce32 = defaultCE32;
				312	break;
				313	}
				314	UChar32 nextCp;
				315	if(skipped == NULL && numCpFwd < 0) {
				316	// Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path,
				317	// avoiding the function call and the nextSkippedCodePoint() overhead.
				318	nextCp = nextCodePoint(errorCode);
				319	if(nextCp < 0) {
				320	// No more text.
				321	ce32 = defaultCE32;
				322	break;
				323	} else if((ce32 & Collation::CONTRACT_NEXT_CCC) != 0 &&
				324	!CollationFCD::mayHaveLccc(nextCp)) {
				325	// All contraction suffixes start with characters with lccc!=0
				326	// but the next code point has lccc==0.
				327	backwardNumCodePoints(1, errorCode);
				328	ce32 = defaultCE32;
				329	break;
				330	}
				331	} else {
				332	nextCp = nextSkippedCodePoint(errorCode);
				333	if(nextCp < 0) {
				334	// No more text.
				335	ce32 = defaultCE32;
				336	break;
				337	} else if((ce32 & Collation::CONTRACT_NEXT_CCC) != 0 &&
				338	!CollationFCD::mayHaveLccc(nextCp)) {
				339	// All contraction suffixes start with characters with lccc!=0
				340	// but the next code point has lccc==0.
				341	backwardNumSkipped(1, errorCode);
				342	ce32 = defaultCE32;
				343	break;
				344	}
				345	}
				346	ce32 = nextCE32FromContraction(d, ce32, p + 2, defaultCE32, nextCp, errorCode);
				347	if(ce32 == Collation::NO_CE32) {
				348	// CEs from a discontiguous contraction plus the skipped combining marks
				349	// have been appended already.
				350	return;
				351	}
				352	break;
				353	}
				354	case Collation::DIGIT_TAG:
				355	if(isNumeric) {
				356	appendNumericCEs(ce32, forward, errorCode);
				357	return;
				358	} else {
				359	// Fetch the non-numeric-collation CE32 and continue.
				360	ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
				361	break;
				362	}
				363	case Collation::U0000_TAG:
				364	U_ASSERT(c == 0);
				365	if(forward && foundNULTerminator()) {
				366	// Handle NUL-termination. (Not needed in Java.)
				367	ceBuffer.append(Collation::NO_CE, errorCode);
				368	return;
				369	} else {
				370	// Fetch the normal ce32 for U+0000 and continue.
				371	ce32 = d->ce32s[0];
				372	break;
				373	}
				374	case Collation::HANGUL_TAG: {
				375	const uint32_t *jamoCE32s = d->jamoCE32s;
				376	c -= Hangul::HANGUL_BASE;
				377	UChar32 t = c % Hangul::JAMO_T_COUNT;
				378	c /= Hangul::JAMO_T_COUNT;
				379	UChar32 v = c % Hangul::JAMO_V_COUNT;
				380	c /= Hangul::JAMO_V_COUNT;
				381	if((ce32 & Collation::HANGUL_NO_SPECIAL_JAMO) != 0) {
				382	// None of the Jamo CE32s are isSpecialCE32().
				383	// Avoid recursive function calls and per-Jamo tests.
				384	if(ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3, errorCode)) {
				385	ceBuffer.set(ceBuffer.length, Collation::ceFromCE32(jamoCE32s[c]));
				386	ceBuffer.set(ceBuffer.length + 1, Collation::ceFromCE32(jamoCE32s[19 + v]));
				387	ceBuffer.length += 2;
				388	if(t != 0) {
				389	ceBuffer.appendUnsafe(Collation::ceFromCE32(jamoCE32s[39 + t]));
				390	}
				391	}
				392	return;
				393	} else {
				394	// We should not need to compute each Jamo code point.
				395	// In particular, there should be no offset or implicit ce32.
				396	appendCEsFromCE32(d, U_SENTINEL, jamoCE32s[c], forward, errorCode);
				397	appendCEsFromCE32(d, U_SENTINEL, jamoCE32s[19 + v], forward, errorCode);
				398	if(t == 0) { return; }
				399	// offset 39 = 19 + 21 - 1:
				400	// 19 = JAMO_L_COUNT
				401	// 21 = JAMO_T_COUNT
				402	// -1 = omit t==0
				403	ce32 = jamoCE32s[39 + t];
				404	c = U_SENTINEL;
				405	break;
				406	}
				407	}
				408	case Collation::LEAD_SURROGATE_TAG: {
				409	U_ASSERT(forward); // Backward iteration should never see lead surrogate code _unit_ data.
				410	U_ASSERT(U16_IS_LEAD(c));
				411	UChar trail;
				412	if(U16_IS_TRAIL(trail = handleGetTrailSurrogate())) {
				413	c = U16_GET_SUPPLEMENTARY(c, trail);
				414	ce32 &= Collation::LEAD_TYPE_MASK;
				415	if(ce32 == Collation::LEAD_ALL_UNASSIGNED) {
				416	ce32 = Collation::UNASSIGNED_CE32; // unassigned-implicit
				417	} else if(ce32 == Collation::LEAD_ALL_FALLBACK \|\|
				418	(ce32 = d->getCE32FromSupplementary(c)) == Collation::FALLBACK_CE32) {
				419	// fall back to the base data
				420	d = d->base;
				421	ce32 = d->getCE32FromSupplementary(c);
				422	}
				423	} else {
				424	// c is an unpaired surrogate.
				425	ce32 = Collation::UNASSIGNED_CE32;
				426	}
				427	break;
				428	}
				429	case Collation::OFFSET_TAG:
				430	U_ASSERT(c >= 0);
				431	ceBuffer.append(d->getCEFromOffsetCE32(c, ce32), errorCode);
				432	return;
				433	case Collation::IMPLICIT_TAG:
				434	U_ASSERT(c >= 0);
				435	if(U_IS_SURROGATE(c) && forbidSurrogateCodePoints()) {
				436	ce32 = Collation::FFFD_CE32;
				437	break;
				438	} else {
				439	ceBuffer.append(Collation::unassignedCEFromCodePoint(c), errorCode);
				440	return;
				441	}
				442	}
				443	}
				444	ceBuffer.append(Collation::ceFromSimpleCE32(ce32), errorCode);
				445	}
				446
				447	uint32_t
				448	CollationIterator::getCE32FromPrefix(const CollationData *d, uint32_t ce32,
				449	UErrorCode &errorCode) {
				450	const UChar *p = d->contexts + Collation::indexFromCE32(ce32);
				451	ce32 = CollationData::readCE32(p); // Default if no prefix match.
				452	p += 2;
				453	// Number of code points read before the original code point.
				454	int32_t lookBehind = 0;
				455	UCharsTrie prefixes(p);
				456	for(;;) {
				457	UChar32 c = previousCodePoint(errorCode);
				458	if(c < 0) { break; }
				459	++lookBehind;
				460	UStringTrieResult match = prefixes.nextForCodePoint(c);
				461	if(USTRINGTRIE_HAS_VALUE(match)) {
				462	ce32 = (uint32_t)prefixes.getValue();
				463	}
				464	if(!USTRINGTRIE_HAS_NEXT(match)) { break; }
				465	}
				466	forwardNumCodePoints(lookBehind, errorCode);
				467	return ce32;
				468	}
				469
				470	UChar32
				471	CollationIterator::nextSkippedCodePoint(UErrorCode &errorCode) {
				472	if(skipped != NULL && skipped->hasNext()) { return skipped->next(); }
				473	if(numCpFwd == 0) { return U_SENTINEL; }
				474	UChar32 c = nextCodePoint(errorCode);
				475	if(skipped != NULL && !skipped->isEmpty() && c >= 0) { skipped->incBeyond(); }
				476	if(numCpFwd > 0 && c >= 0) { --numCpFwd; }
				477	return c;
				478	}
				479
				480	void
				481	CollationIterator::backwardNumSkipped(int32_t n, UErrorCode &errorCode) {
				482	if(skipped != NULL && !skipped->isEmpty()) {
				483	n = skipped->backwardNumCodePoints(n);
				484	}
				485	backwardNumCodePoints(n, errorCode);
				486	if(numCpFwd >= 0) { numCpFwd += n; }
				487	}
				488
				489	uint32_t
				490	CollationIterator::nextCE32FromContraction(const CollationData *d, uint32_t contractionCE32,
				491	const UChar *p, uint32_t ce32, UChar32 c,
				492	UErrorCode &errorCode) {
				493	// c: next code point after the original one
				494
				495	// Number of code points read beyond the original code point.
				496	// Needed for discontiguous contraction matching.
				497	int32_t lookAhead = 1;
				498	// Number of code points read since the last match (initially only c).
				499	int32_t sinceMatch = 1;
				500	// Normally we only need a contiguous match,
				501	// and therefore need not remember the suffixes state from before a mismatch for retrying.
				502	// If we are already processing skipped combining marks, then we do track the state.
				503	UCharsTrie suffixes(p);
				504	if(skipped != NULL && !skipped->isEmpty()) { skipped->saveTrieState(suffixes); }
				505	UStringTrieResult match = suffixes.firstForCodePoint(c);
				506	for(;;) {
				507	UChar32 nextCp;
				508	if(USTRINGTRIE_HAS_VALUE(match)) {
				509	ce32 = (uint32_t)suffixes.getValue();
				510	if(!USTRINGTRIE_HAS_NEXT(match) \|\| (c = nextSkippedCodePoint(errorCode)) < 0) {
				511	return ce32;
				512	}
				513	if(skipped != NULL && !skipped->isEmpty()) { skipped->saveTrieState(suffixes); }
				514	sinceMatch = 1;
				515	} else if(match == USTRINGTRIE_NO_MATCH \|\| (nextCp = nextSkippedCodePoint(errorCode)) < 0) {
				516	// No match for c, or partial match (USTRINGTRIE_NO_VALUE) and no further text.
				517	// Back up if necessary, and try a discontiguous contraction.
				518	if((contractionCE32 & Collation::CONTRACT_TRAILING_CCC) != 0 &&
				519	// Discontiguous contraction matching extends an existing match.
				520	// If there is no match yet, then there is nothing to do.
				521	((contractionCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) == 0 \|\|
				522	sinceMatch < lookAhead)) {
				523	// The last character of at least one suffix has lccc!=0,
				524	// allowing for discontiguous contractions.
				525	// UCA S2.1.1 only processes non-starters immediately following
				526	// "a match in the table" (sinceMatch=1).
				527	if(sinceMatch > 1) {
				528	// Return to the state after the last match.
				529	// (Return to sinceMatch=0 and re-fetch the first partially-matched character.)
				530	backwardNumSkipped(sinceMatch, errorCode);
				531	c = nextSkippedCodePoint(errorCode);
				532	lookAhead -= sinceMatch - 1;
				533	sinceMatch = 1;
				534	}
				535	if(d->getFCD16(c) > 0xff) {
				536	return nextCE32FromDiscontiguousContraction(
				537	d, suffixes, ce32, lookAhead, c, errorCode);
				538	}
				539	}
				540	break;
				541	} else {
				542	// Continue after partial match (USTRINGTRIE_NO_VALUE) for c.
				543	// It does not have a result value, therefore it is not itself "a match in the table".
				544	// If a partially-matched c has ccc!=0 then
				545	// it might be skipped in discontiguous contraction.
				546	c = nextCp;
				547	++sinceMatch;
				548	}
				549	++lookAhead;
				550	match = suffixes.nextForCodePoint(c);
				551	}
				552	backwardNumSkipped(sinceMatch, errorCode);
				553	return ce32;
				554	}
				555
				556	uint32_t
				557	CollationIterator::nextCE32FromDiscontiguousContraction(
				558	const CollationData *d, UCharsTrie &suffixes, uint32_t ce32,
				559	int32_t lookAhead, UChar32 c,
				560	UErrorCode &errorCode) {
				561	if(U_FAILURE(errorCode)) { return 0; }
				562
				563	// UCA section 3.3.2 Contractions:
				564	// Contractions that end with non-starter characters
				565	// are known as discontiguous contractions.
				566	// ... discontiguous contractions must be detected in input text
				567	// whenever the final sequence of non-starter characters could be rearranged
				568	// so as to make a contiguous matching sequence that is canonically equivalent.
				569
				570	// UCA: http://www.unicode.org/reports/tr10/#S2.1
				571	// S2.1 Find the longest initial substring S at each point that has a match in the table.
				572	// S2.1.1 If there are any non-starters following S, process each non-starter C.
				573	// S2.1.2 If C is not blocked from S, find if S + C has a match in the table.
				574	// Note: A non-starter in a string is called blocked
				575	// if there is another non-starter of the same canonical combining class or zero
				576	// between it and the last character of canonical combining class 0.
				577	// S2.1.3 If there is a match, replace S by S + C, and remove C.
				578
				579	// First: Is a discontiguous contraction even possible?
				580	uint16_t fcd16 = d->getFCD16(c);
				581	U_ASSERT(fcd16 > 0xff); // The caller checked this already, as a shortcut.
				582	UChar32 nextCp = nextSkippedCodePoint(errorCode);
				583	if(nextCp < 0) {
				584	// No further text.
				585	backwardNumSkipped(1, errorCode);
				586	return ce32;
				587	}
				588	++lookAhead;
				589	uint8_t prevCC = (uint8_t)fcd16;
				590	fcd16 = d->getFCD16(nextCp);
				591	if(fcd16 <= 0xff) {
				592	// The next code point after c is a starter (S2.1.1 "process each non-starter").
				593	backwardNumSkipped(2, errorCode);
				594	return ce32;
				595	}
				596
				597	// We have read and matched (lookAhead-2) code points,
				598	// read non-matching c and peeked ahead at nextCp.
				599	// Return to the state before the mismatch and continue matching with nextCp.
				600	if(skipped == NULL \|\| skipped->isEmpty()) {
				601	if(skipped == NULL) {
				602	skipped = new SkippedState();
				603	if(skipped == NULL) {
				604	errorCode = U_MEMORY_ALLOCATION_ERROR;
				605	return 0;
				606	}
				607	}
				608	suffixes.reset();
				609	if(lookAhead > 2) {
				610	// Replay the partial match so far.
				611	backwardNumCodePoints(lookAhead, errorCode);
				612	suffixes.firstForCodePoint(nextCodePoint(errorCode));
				613	for(int32_t i = 3; i < lookAhead; ++i) {
				614	suffixes.nextForCodePoint(nextCodePoint(errorCode));
				615	}
				616	// Skip c (which did not match) and nextCp (which we will try now).
				617	forwardNumCodePoints(2, errorCode);
				618	}
				619	skipped->saveTrieState(suffixes);
				620	} else {
				621	// Reset to the trie state before the failed match of c.
				622	skipped->resetToTrieState(suffixes);
				623	}
				624
				625	skipped->setFirstSkipped(c);
				626	// Number of code points read since the last match (at this point: c and nextCp).
				627	int32_t sinceMatch = 2;
				628	c = nextCp;
				629	for(;;) {
				630	UStringTrieResult match;
				631	// "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2)
				632	if(prevCC < (fcd16 >> 8) && USTRINGTRIE_HAS_VALUE(match = suffixes.nextForCodePoint(c))) {
				633	// "If there is a match, replace S by S + C, and remove C." (S2.1.3)
				634	// Keep prevCC unchanged.
				635	ce32 = (uint32_t)suffixes.getValue();
				636	sinceMatch = 0;
				637	skipped->recordMatch();
				638	if(!USTRINGTRIE_HAS_NEXT(match)) { break; }
				639	skipped->saveTrieState(suffixes);
				640	} else {
				641	// No match for "S + C", skip C.
				642	skipped->skip(c);
				643	skipped->resetToTrieState(suffixes);
				644	prevCC = (uint8_t)fcd16;
				645	}
				646	if((c = nextSkippedCodePoint(errorCode)) < 0) { break; }
				647	++sinceMatch;
				648	fcd16 = d->getFCD16(c);
				649	if(fcd16 <= 0xff) {
				650	// The next code point after c is a starter (S2.1.1 "process each non-starter").
				651	break;
				652	}
				653	}
				654	backwardNumSkipped(sinceMatch, errorCode);
				655	UBool isTopDiscontiguous = skipped->isEmpty();
				656	skipped->replaceMatch();
				657	if(isTopDiscontiguous && !skipped->isEmpty()) {
				658	// We did get a match after skipping one or more combining marks,
				659	// and we are not in a recursive discontiguous contraction.
				660	// Append CEs from the contraction ce32
				661	// and then from the combining marks that we skipped before the match.
				662	c = U_SENTINEL;
				663	for(;;) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	664	appendCEsFromCE32(d, c, ce32, true, errorCode);
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	665	// Fetch CE32s for skipped combining marks from the normal data, with fallback,
				666	// rather than from the CollationData where we found the contraction.
				667	if(!skipped->hasNext()) { break; }
				668	c = skipped->next();
				669	ce32 = getDataCE32(c);
				670	if(ce32 == Collation::FALLBACK_CE32) {
				671	d = data->base;
				672	ce32 = d->getCE32(c);
				673	} else {
				674	d = data;
				675	}
				676	// Note: A nested discontiguous-contraction match
				677	// replaces consumed combining marks with newly skipped ones
				678	// and resets the reading position to the beginning.
				679	}
				680	skipped->clear();
				681	ce32 = Collation::NO_CE32; // Signal to the caller that the result is in the ceBuffer.
				682	}
				683	return ce32;
				684	}
				685
				686	void
				687	CollationIterator::appendNumericCEs(uint32_t ce32, UBool forward, UErrorCode &errorCode) {
				688	// Collect digits.
				689	CharString digits;
				690	if(forward) {
				691	for(;;) {
				692	char digit = Collation::digitFromCE32(ce32);
				693	digits.append(digit, errorCode);
				694	if(numCpFwd == 0) { break; }
				695	UChar32 c = nextCodePoint(errorCode);
				696	if(c < 0) { break; }
				697	ce32 = data->getCE32(c);
				698	if(ce32 == Collation::FALLBACK_CE32) {
				699	ce32 = data->base->getCE32(c);
				700	}
				701	if(!Collation::hasCE32Tag(ce32, Collation::DIGIT_TAG)) {
				702	backwardNumCodePoints(1, errorCode);
				703	break;
				704	}
				705	if(numCpFwd > 0) { --numCpFwd; }
				706	}
				707	} else {
				708	for(;;) {
				709	char digit = Collation::digitFromCE32(ce32);
				710	digits.append(digit, errorCode);
				711	UChar32 c = previousCodePoint(errorCode);
				712	if(c < 0) { break; }
				713	ce32 = data->getCE32(c);
				714	if(ce32 == Collation::FALLBACK_CE32) {
				715	ce32 = data->base->getCE32(c);
				716	}
				717	if(!Collation::hasCE32Tag(ce32, Collation::DIGIT_TAG)) {
				718	forwardNumCodePoints(1, errorCode);
				719	break;
				720	}
				721	}
				722	// Reverse the digit string.
				723	char *p = digits.data();
				724	char *q = p + digits.length() - 1;
				725	while(p < q) {
				726	char digit = *p;
				727	p++ = q;
				728	*q-- = digit;
				729	}
				730	}
				731	if(U_FAILURE(errorCode)) { return; }
				732	int32_t pos = 0;
				733	do {
				734	// Skip leading zeros.
				735	while(pos < (digits.length() - 1) && digits[pos] == 0) { ++pos; }
				736	// Write a sequence of CEs for at most 254 digits at a time.
				737	int32_t segmentLength = digits.length() - pos;
				738	if(segmentLength > 254) { segmentLength = 254; }
				739	appendNumericSegmentCEs(digits.data() + pos, segmentLength, errorCode);
				740	pos += segmentLength;
				741	} while(U_SUCCESS(errorCode) && pos < digits.length());
				742	}
				743
				744	void
				745	CollationIterator::appendNumericSegmentCEs(const char *digits, int32_t length, UErrorCode &errorCode) {
				746	U_ASSERT(1 <= length && length <= 254);
				747	U_ASSERT(length == 1 \|\| digits[0] != 0);
				748	uint32_t numericPrimary = data->numericPrimary;
				749	// Note: We use primary byte values 2..255: digits are not compressible.
				750	if(length <= 7) {
				751	// Very dense encoding for small numbers.
				752	int32_t value = digits[0];
				753	for(int32_t i = 1; i < length; ++i) {
				754	value = value * 10 + digits[i];
				755	}
				756	// Primary weight second byte values:
				757	// 74 byte values 2.. 75 for small numbers in two-byte primary weights.
				758	// 40 byte values 76..115 for medium numbers in three-byte primary weights.
				759	// 16 byte values 116..131 for large numbers in four-byte primary weights.
				760	// 124 byte values 132..255 for very large numbers with 4..127 digit pairs.
				761	int32_t firstByte = 2;
				762	int32_t numBytes = 74;
				763	if(value < numBytes) {
				764	// Two-byte primary for 0..73, good for day & month numbers etc.
				765	uint32_t primary = numericPrimary \| ((firstByte + value) << 16);
				766	ceBuffer.append(Collation::makeCE(primary), errorCode);
				767	return;
				768	}
				769	value -= numBytes;
				770	firstByte += numBytes;
				771	numBytes = 40;
				772	if(value < numBytes * 254) {
				773	// Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more.
				774	uint32_t primary = numericPrimary \|
				775	((firstByte + value / 254) << 16) \| ((2 + value % 254) << 8);
				776	ceBuffer.append(Collation::makeCE(primary), errorCode);
				777	return;
				778	}
				779	value -= numBytes * 254;
				780	firstByte += numBytes;
				781	numBytes = 16;
				782	if(value < numBytes * 254 * 254) {
				783	// Four-byte primary for 10234..1042489=10234+16254254-1.
				784	uint32_t primary = numericPrimary \| (2 + value % 254);
				785	value /= 254;
				786	primary \|= (2 + value % 254) << 8;
				787	value /= 254;
				788	primary \|= (firstByte + value % 254) << 16;
				789	ceBuffer.append(Collation::makeCE(primary), errorCode);
				790	return;
				791	}
				792	// original value > 1042489
				793	}
				794	U_ASSERT(length >= 7);
				795
				796	// The second primary byte value 132..255 indicates the number of digit pairs (4..127),
				797	// then we generate primary bytes with those pairs.
				798	// Omit trailing 00 pairs.
				799	// Decrement the value for the last pair.
				800
				801	// Set the exponent. 4 pairs->132, 5 pairs->133, ..., 127 pairs->255.
				802	int32_t numPairs = (length + 1) / 2;
				803	uint32_t primary = numericPrimary \| ((132 - 4 + numPairs) << 16);
				804	// Find the length without trailing 00 pairs.
				805	while(digits[length - 1] == 0 && digits[length - 2] == 0) {
				806	length -= 2;
				807	}
				808	// Read the first pair.
				809	uint32_t pair;
				810	int32_t pos;
				811	if(length & 1) {
				812	// Only "half a pair" if we have an odd number of digits.
				813	pair = digits[0];
				814	pos = 1;
				815	} else {
				816	pair = digits[0] * 10 + digits[1];
				817	pos = 2;
				818	}
				819	pair = 11 + 2 * pair;
				820	// Add the pairs of digits between pos and length.
				821	int32_t shift = 8;
				822	while(pos < length) {
				823	if(shift == 0) {
				824	// Every three pairs/bytes we need to store a 4-byte-primary CE
				825	// and start with a new CE with the '0' primary lead byte.
				826	primary \|= pair;
				827	ceBuffer.append(Collation::makeCE(primary), errorCode);
				828	primary = numericPrimary;
				829	shift = 16;
				830	} else {
				831	primary \|= pair << shift;
				832	shift -= 8;
				833	}
				834	pair = 11 + 2 * (digits[pos] * 10 + digits[pos + 1]);
				835	pos += 2;
				836	}
				837	primary \|= (pair - 1) << shift;
				838	ceBuffer.append(Collation::makeCE(primary), errorCode);
				839	}
				840
				841	int64_t
				842	CollationIterator::previousCE(UVector32 &offsets, UErrorCode &errorCode) {
				843	if(ceBuffer.length > 0) {
				844	// Return the previous buffered CE.
				845	return ceBuffer.get(--ceBuffer.length);
				846	}
				847	offsets.removeAllElements();
				848	int32_t limitOffset = getOffset();
				849	UChar32 c = previousCodePoint(errorCode);
				850	if(c < 0) { return Collation::NO_CE; }
				851	if(data->isUnsafeBackward(c, isNumeric)) {
				852	return previousCEUnsafe(c, offsets, errorCode);
				853	}
				854	// Simple, safe-backwards iteration:
				855	// Get a CE going backwards, handle prefixes but no contractions.
				856	uint32_t ce32 = data->getCE32(c);
				857	const CollationData *d;
				858	if(ce32 == Collation::FALLBACK_CE32) {
				859	d = data->base;
				860	ce32 = d->getCE32(c);
				861	} else {
				862	d = data;
				863	}
				864	if(Collation::isSimpleOrLongCE32(ce32)) {
				865	return Collation::ceFromCE32(ce32);
				866	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	867	appendCEsFromCE32(d, c, ce32, false, errorCode);
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	868	if(U_SUCCESS(errorCode)) {
				869	if(ceBuffer.length > 1) {
				870	offsets.addElement(getOffset(), errorCode);
				871	// For an expansion, the offset of each non-initial CE is the limit offset,
				872	// consistent with forward iteration.
				873	while(offsets.size() <= ceBuffer.length) {
				874	offsets.addElement(limitOffset, errorCode);
Frank Tang	b869661	2019-10-25 14:58:21 -0700	[diff] [blame]	875	}
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	876	}
				877	return ceBuffer.get(--ceBuffer.length);
				878	} else {
				879	return Collation::NO_CE_PRIMARY;
				880	}
				881	}
				882
				883	int64_t
				884	CollationIterator::previousCEUnsafe(UChar32 c, UVector32 &offsets, UErrorCode &errorCode) {
				885	// We just move through the input counting safe and unsafe code points
				886	// without collecting the unsafe-backward substring into a buffer and
				887	// switching to it.
				888	// This is to keep the logic simple. Otherwise we would have to handle
				889	// prefix matching going before the backward buffer, switching
				890	// to iteration and back, etc.
				891	// In the most important case of iterating over a normal string,
				892	// reading from the string itself is already maximally fast.
				893	// The only drawback there is that after getting the CEs we always
				894	// skip backward to the safe character rather than switching out
				895	// of a backwardBuffer.
				896	// But this should not be the common case for previousCE(),
				897	// and correctness and maintainability are more important than
				898	// complex optimizations.
				899	// Find the first safe character before c.
				900	int32_t numBackward = 1;
				901	while((c = previousCodePoint(errorCode)) >= 0) {
				902	++numBackward;
				903	if(!data->isUnsafeBackward(c, isNumeric)) {
				904	break;
				905	}
				906	}
				907	// Set the forward iteration limit.
				908	// Note: This counts code points.
				909	// We cannot enforce a limit in the middle of a surrogate pair or similar.
				910	numCpFwd = numBackward;
				911	// Reset the forward iterator.
				912	cesIndex = 0;
				913	U_ASSERT(ceBuffer.length == 0);
				914	// Go forward and collect the CEs.
				915	int32_t offset = getOffset();
				916	while(numCpFwd > 0) {
				917	// nextCE() normally reads one code point.
				918	// Contraction matching and digit specials read more and check numCpFwd.
				919	--numCpFwd;
				920	// Append one or more CEs to the ceBuffer.
				921	(void)nextCE(errorCode);
				922	U_ASSERT(U_FAILURE(errorCode) \|\| ceBuffer.get(ceBuffer.length - 1) != Collation::NO_CE);
				923	// No need to loop for getting each expansion CE from nextCE().
				924	cesIndex = ceBuffer.length;
				925	// However, we need to write an offset for each CE.
				926	// This is for CollationElementIterator::getOffset() to return
				927	// intermediate offsets from the unsafe-backwards segment.
				928	U_ASSERT(offsets.size() < ceBuffer.length);
				929	offsets.addElement(offset, errorCode);
				930	// For an expansion, the offset of each non-initial CE is the limit offset,
				931	// consistent with forward iteration.
				932	offset = getOffset();
				933	while(offsets.size() < ceBuffer.length) {
				934	offsets.addElement(offset, errorCode);
Frank Tang	b869661	2019-10-25 14:58:21 -0700	[diff] [blame]	935	}
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	936	}
				937	U_ASSERT(offsets.size() == ceBuffer.length);
				938	// End offset corresponding to just after the unsafe-backwards segment.
				939	offsets.addElement(offset, errorCode);
				940	// Reset the forward iteration limit
				941	// and move backward to before the segment for which we fetched CEs.
				942	numCpFwd = -1;
				943	backwardNumCodePoints(numBackward, errorCode);
				944	// Use the collected CEs and return the last one.
				945	cesIndex = 0; // Avoid cesIndex > ceBuffer.length when that gets decremented.
				946	if(U_SUCCESS(errorCode)) {
				947	return ceBuffer.get(--ceBuffer.length);
				948	} else {
				949	return Collation::NO_CE_PRIMARY;
				950	}
				951	}
				952
				953	U_NAMESPACE_END
				954
				955	#endif // !UCONFIG_NO_COLLATION