Blame - source/i18n/collationfastlatinbuilder.cpp - chromium.googlesource.com/chromium/deps/icu

blob: fefed8600e7fae1be2b731966131c22acd4f5bbc [file] [log] [blame]

Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	1	/*
				2	*******************************************************************************
				3	* Copyright (C) 2013-2014, International Business Machines
				4	* Corporation and others. All Rights Reserved.
				5	*******************************************************************************
				6	* collationfastlatinbuilder.cpp
				7	*
				8	* created on: 2013aug09
				9	* created by: Markus W. Scherer
				10	*/
				11
				12	#define DEBUG_COLLATION_FAST_LATIN_BUILDER 0 // 0 or 1 or 2
				13	#if DEBUG_COLLATION_FAST_LATIN_BUILDER
				14	#include <stdio.h>
				15	#include <string>
				16	#endif
				17
				18	#include "unicode/utypes.h"
				19
				20	#if !UCONFIG_NO_COLLATION
				21
				22	#include "unicode/ucol.h"
				23	#include "unicode/ucharstrie.h"
				24	#include "unicode/unistr.h"
				25	#include "unicode/uobject.h"
				26	#include "unicode/uscript.h"
				27	#include "cmemory.h"
				28	#include "collation.h"
				29	#include "collationdata.h"
				30	#include "collationfastlatin.h"
				31	#include "collationfastlatinbuilder.h"
				32	#include "uassert.h"
				33	#include "uvectr64.h"
				34
				35	U_NAMESPACE_BEGIN
				36
				37	struct CollationData;
				38
				39	namespace {
				40
				41	/**
				42	* Compare two signed int64_t values as if they were unsigned.
				43	*/
				44	int32_t
				45	compareInt64AsUnsigned(int64_t a, int64_t b) {
				46	if((uint64_t)a < (uint64_t)b) {
				47	return -1;
				48	} else if((uint64_t)a > (uint64_t)b) {
				49	return 1;
				50	} else {
				51	return 0;
				52	}
				53	}
				54
				55	// TODO: Merge this with the near-identical version in collationbasedatabuilder.cpp
				56	/**
				57	* Like Java Collections.binarySearch(List, String, Comparator).
				58	*
				59	* @return the index>=0 where the item was found,
				60	* or the index<0 for inserting the string at ~index in sorted order
				61	*/
				62	int32_t
				63	binarySearch(const int64_t list[], int32_t limit, int64_t ce) {
				64	if (limit == 0) { return ~0; }
				65	int32_t start = 0;
				66	for (;;) {
				67	int32_t i = (start + limit) / 2;
				68	int32_t cmp = compareInt64AsUnsigned(ce, list[i]);
				69	if (cmp == 0) {
				70	return i;
				71	} else if (cmp < 0) {
				72	if (i == start) {
				73	return ~start; // insert ce before i
				74	}
				75	limit = i;
				76	} else {
				77	if (i == start) {
				78	return ~(start + 1); // insert ce after i
				79	}
				80	start = i;
				81	}
				82	}
				83	}
				84
				85	} // namespace
				86
				87	CollationFastLatinBuilder::CollationFastLatinBuilder(UErrorCode &errorCode)
				88	: ce0(0), ce1(0),
				89	contractionCEs(errorCode), uniqueCEs(errorCode),
				90	miniCEs(NULL),
				91	firstDigitPrimary(0), firstLatinPrimary(0), lastLatinPrimary(0),
				92	firstShortPrimary(0), shortPrimaryOverflow(FALSE),
				93	headerLength(0) {
				94	}
				95
				96	CollationFastLatinBuilder::~CollationFastLatinBuilder() {
				97	uprv_free(miniCEs);
				98	}
				99
				100	UBool
				101	CollationFastLatinBuilder::forData(const CollationData &data, UErrorCode &errorCode) {
				102	if(U_FAILURE(errorCode)) { return FALSE; }
				103	if(!result.isEmpty()) { // This builder is not reusable.
				104	errorCode = U_INVALID_STATE_ERROR;
				105	return FALSE;
				106	}
				107	if(!loadGroups(data, errorCode)) { return FALSE; }
				108
				109	// Fast handling of digits.
				110	firstShortPrimary = firstDigitPrimary;
				111	getCEs(data, errorCode);
				112	if(!encodeUniqueCEs(errorCode)) { return FALSE; }
				113	if(shortPrimaryOverflow) {
				114	// Give digits long mini primaries,
				115	// so that there are more short primaries for letters.
				116	firstShortPrimary = firstLatinPrimary;
				117	resetCEs();
				118	getCEs(data, errorCode);
				119	if(!encodeUniqueCEs(errorCode)) { return FALSE; }
				120	}
				121	// Note: If we still have a short-primary overflow but not a long-primary overflow,
				122	// then we could calculate how many more long primaries would fit,
				123	// and set the firstShortPrimary to that many after the current firstShortPrimary,
				124	// and try again.
				125	// However, this might only benefit the en_US_POSIX tailoring,
				126	// and it is simpler to suppress building fast Latin data for it in genrb,
				127	// or by returning FALSE here if shortPrimaryOverflow.
				128
				129	UBool ok = !shortPrimaryOverflow &&
				130	encodeCharCEs(errorCode) && encodeContractions(errorCode);
				131	contractionCEs.removeAllElements(); // might reduce heap memory usage
				132	uniqueCEs.removeAllElements();
				133	return ok;
				134	}
				135
				136	UBool
				137	CollationFastLatinBuilder::loadGroups(const CollationData &data, UErrorCode &errorCode) {
				138	if(U_FAILURE(errorCode)) { return FALSE; }
				139	result.append(0); // reserved for version & headerLength
				140	// The first few reordering groups should be special groups
				141	// (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
				142	for(int32_t i = 0;;) {
				143	if(i >= data.scriptsLength) {
				144	// no Latn script
				145	errorCode = U_INTERNAL_PROGRAM_ERROR;
				146	return FALSE;
				147	}
				148	uint32_t head = data.scripts[i];
				149	uint32_t lastByte = head & 0xff; // last primary byte in the group
				150	int32_t group = data.scripts[i + 2];
				151	if(group == UCOL_REORDER_CODE_DIGIT) {
				152	firstDigitPrimary = (head & 0xff00) << 16;
				153	headerLength = result.length();
				154	uint32_t r0 = (CollationFastLatin::VERSION << 8) \| headerLength;
				155	result.setCharAt(0, (UChar)r0);
				156	} else if(group == USCRIPT_LATIN) {
				157	if(firstDigitPrimary == 0) {
				158	// no digit group
				159	errorCode = U_INTERNAL_PROGRAM_ERROR;
				160	return FALSE;
				161	}
				162	firstLatinPrimary = (head & 0xff00) << 16;
				163	lastLatinPrimary = (lastByte << 24) \| 0xffffff;
				164	break;
				165	} else if(firstDigitPrimary == 0) {
				166	// a group below digits
				167	if(lastByte > 0x7f) {
				168	// We only use 7 bits for the last byte of a below-digits group.
				169	// This does not warrant an errorCode, but we do not build a fast Latin table.
				170	return FALSE;
				171	}
				172	result.append((UChar)lastByte);
				173	}
				174	i = i + 2 + data.scripts[i + 1];
				175	}
				176	return TRUE;
				177	}
				178
				179	UBool
				180	CollationFastLatinBuilder::inSameGroup(uint32_t p, uint32_t q) const {
				181	// Both or neither need to be encoded as short primaries,
				182	// so that we can test only one and use the same bit mask.
				183	if(p >= firstShortPrimary) {
				184	return q >= firstShortPrimary;
				185	} else if(q >= firstShortPrimary) {
				186	return FALSE;
				187	}
				188	// Both or neither must be potentially-variable,
				189	// so that we can test only one and determine if both are variable.
				190	if(p >= firstDigitPrimary) {
				191	return q >= firstDigitPrimary;
				192	} else if(q >= firstDigitPrimary) {
				193	return FALSE;
				194	}
				195	// Both will be encoded with long mini primaries.
				196	// They must be in the same special reordering group,
				197	// so that we can test only one and determine if both are variable.
				198	p >>= 24; // first primary byte
				199	q >>= 24;
				200	U_ASSERT(p != 0 && q != 0);
				201	U_ASSERT(p <= result[headerLength - 1]); // the loop will terminate
				202	for(int32_t i = 1;; ++i) {
				203	uint32_t lastByte = result[i];
				204	if(p <= lastByte) {
				205	return q <= lastByte;
				206	} else if(q <= lastByte) {
				207	return FALSE;
				208	}
				209	}
				210	}
				211
				212	void
				213	CollationFastLatinBuilder::resetCEs() {
				214	contractionCEs.removeAllElements();
				215	uniqueCEs.removeAllElements();
				216	shortPrimaryOverflow = FALSE;
				217	result.truncate(headerLength);
				218	}
				219
				220	void
				221	CollationFastLatinBuilder::getCEs(const CollationData &data, UErrorCode &errorCode) {
				222	if(U_FAILURE(errorCode)) { return; }
				223	int32_t i = 0;
				224	for(UChar c = 0;; ++i, ++c) {
				225	if(c == CollationFastLatin::LATIN_LIMIT) {
				226	c = CollationFastLatin::PUNCT_START;
				227	} else if(c == CollationFastLatin::PUNCT_LIMIT) {
				228	break;
				229	}
				230	const CollationData *d;
				231	uint32_t ce32 = data.getCE32(c);
				232	if(ce32 == Collation::FALLBACK_CE32) {
				233	d = data.base;
				234	ce32 = d->getCE32(c);
				235	} else {
				236	d = &data;
				237	}
				238	if(getCEsFromCE32(*d, c, ce32, errorCode)) {
				239	charCEs[i][0] = ce0;
				240	charCEs[i][1] = ce1;
				241	addUniqueCE(ce0, errorCode);
				242	addUniqueCE(ce1, errorCode);
				243	} else {
				244	// bail out for c
				245	charCEs[i][0] = ce0 = Collation::NO_CE;
				246	charCEs[i][1] = ce1 = 0;
				247	}
				248	if(c == 0 && !isContractionCharCE(ce0)) {
				249	// Always map U+0000 to a contraction.
				250	// Write a contraction list with only a default value if there is no real contraction.
				251	U_ASSERT(contractionCEs.isEmpty());
				252	addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, ce0, ce1, errorCode);
				253	charCEs[0][0] = ((int64_t)Collation::NO_CE_PRIMARY << 32) \| CONTRACTION_FLAG;
				254	charCEs[0][1] = 0;
				255	}
				256	}
				257	// Terminate the last contraction list.
				258	contractionCEs.addElement(CollationFastLatin::CONTR_CHAR_MASK, errorCode);
				259	}
				260
				261	UBool
				262	CollationFastLatinBuilder::getCEsFromCE32(const CollationData &data, UChar32 c, uint32_t ce32,
				263	UErrorCode &errorCode) {
				264	if(U_FAILURE(errorCode)) { return FALSE; }
				265	ce32 = data.getFinalCE32(ce32);
				266	ce1 = 0;
				267	if(Collation::isSimpleOrLongCE32(ce32)) {
				268	ce0 = Collation::ceFromCE32(ce32);
				269	} else {
				270	switch(Collation::tagFromCE32(ce32)) {
				271	case Collation::LATIN_EXPANSION_TAG:
				272	ce0 = Collation::latinCE0FromCE32(ce32);
				273	ce1 = Collation::latinCE1FromCE32(ce32);
				274	break;
				275	case Collation::EXPANSION32_TAG: {
				276	const uint32_t *ce32s = data.ce32s + Collation::indexFromCE32(ce32);
				277	int32_t length = Collation::lengthFromCE32(ce32);
				278	if(length <= 2) {
				279	ce0 = Collation::ceFromCE32(ce32s[0]);
				280	if(length == 2) {
				281	ce1 = Collation::ceFromCE32(ce32s[1]);
				282	}
				283	break;
				284	} else {
				285	return FALSE;
				286	}
				287	}
				288	case Collation::EXPANSION_TAG: {
				289	const int64_t *ces = data.ces + Collation::indexFromCE32(ce32);
				290	int32_t length = Collation::lengthFromCE32(ce32);
				291	if(length <= 2) {
				292	ce0 = ces[0];
				293	if(length == 2) {
				294	ce1 = ces[1];
				295	}
				296	break;
				297	} else {
				298	return FALSE;
				299	}
				300	}
				301	// Note: We could support PREFIX_TAG (assert c>=0)
				302	// by recursing on its default CE32 and checking that none of the prefixes starts
				303	// with a fast Latin character.
				304	// However, currently (2013) there are only the L-before-middle-dot
				305	// prefix mappings in the Latin range, and those would be rejected anyway.
				306	case Collation::CONTRACTION_TAG:
				307	U_ASSERT(c >= 0);
				308	return getCEsFromContractionCE32(data, ce32, errorCode);
				309	case Collation::OFFSET_TAG:
				310	U_ASSERT(c >= 0);
				311	ce0 = data.getCEFromOffsetCE32(c, ce32);
				312	break;
				313	default:
				314	return FALSE;
				315	}
				316	}
				317	// A mapping can be completely ignorable.
				318	if(ce0 == 0) { return ce1 == 0; }
				319	// We do not support an ignorable ce0 unless it is completely ignorable.
				320	uint32_t p0 = (uint32_t)(ce0 >> 32);
				321	if(p0 == 0) { return FALSE; }
				322	// We only support primaries up to the Latin script.
				323	if(p0 > lastLatinPrimary) { return FALSE; }
				324	// We support non-common secondary and case weights only together with short primaries.
				325	uint32_t lower32_0 = (uint32_t)ce0;
				326	if(p0 < firstShortPrimary) {
				327	uint32_t sc0 = lower32_0 & Collation::SECONDARY_AND_CASE_MASK;
				328	if(sc0 != Collation::COMMON_SECONDARY_CE) { return FALSE; }
				329	}
				330	// No below-common tertiary weights.
				331	if((lower32_0 & Collation::ONLY_TERTIARY_MASK) < Collation::COMMON_WEIGHT16) { return FALSE; }
				332	if(ce1 != 0) {
				333	// Both primaries must be in the same group,
				334	// or both must get short mini primaries,
				335	// or a short-primary CE is followed by a secondary CE.
				336	// This is so that we can test the first primary and use the same mask for both,
				337	// and determine for both whether they are variable.
				338	uint32_t p1 = (uint32_t)(ce1 >> 32);
				339	if(p1 == 0 ? p0 < firstShortPrimary : !inSameGroup(p0, p1)) { return FALSE; }
				340	uint32_t lower32_1 = (uint32_t)ce1;
				341	// No tertiary CEs.
				342	if((lower32_1 >> 16) == 0) { return FALSE; }
				343	// We support non-common secondary and case weights
				344	// only for secondary CEs or together with short primaries.
				345	if(p1 != 0 && p1 < firstShortPrimary) {
				346	uint32_t sc1 = lower32_1 & Collation::SECONDARY_AND_CASE_MASK;
				347	if(sc1 != Collation::COMMON_SECONDARY_CE) { return FALSE; }
				348	}
				349	// No below-common tertiary weights.
				350	if((lower32_1 & Collation::ONLY_TERTIARY_MASK) < Collation::COMMON_WEIGHT16) { return FALSE; }
				351	}
				352	// No quaternary weights.
				353	if(((ce0 \| ce1) & Collation::QUATERNARY_MASK) != 0) { return FALSE; }
				354	return TRUE;
				355	}
				356
				357	UBool
				358	CollationFastLatinBuilder::getCEsFromContractionCE32(const CollationData &data, uint32_t ce32,
				359	UErrorCode &errorCode) {
				360	if(U_FAILURE(errorCode)) { return FALSE; }
				361	const UChar *p = data.contexts + Collation::indexFromCE32(ce32);
				362	ce32 = CollationData::readCE32(p); // Default if no suffix match.
				363	// Since the original ce32 is not a prefix mapping,
				364	// the default ce32 must not be another contraction.
				365	U_ASSERT(!Collation::isContractionCE32(ce32));
				366	int32_t contractionIndex = contractionCEs.size();
				367	if(getCEsFromCE32(data, U_SENTINEL, ce32, errorCode)) {
				368	addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, ce0, ce1, errorCode);
				369	} else {
				370	// Bail out for c-without-contraction.
				371	addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, Collation::NO_CE, 0, errorCode);
				372	}
				373	// Handle an encodable contraction unless the next contraction is too long
				374	// and starts with the same character.
				375	int32_t prevX = -1;
				376	UBool addContraction = FALSE;
				377	UCharsTrie::Iterator suffixes(p + 2, 0, errorCode);
				378	while(suffixes.next(errorCode)) {
				379	const UnicodeString &suffix = suffixes.getString();
				380	int32_t x = CollationFastLatin::getCharIndex(suffix.charAt(0));
				381	if(x < 0) { continue; } // ignore anything but fast Latin text
				382	if(x == prevX) {
				383	if(addContraction) {
				384	// Bail out for all contractions starting with this character.
				385	addContractionEntry(x, Collation::NO_CE, 0, errorCode);
				386	addContraction = FALSE;
				387	}
				388	continue;
				389	}
				390	if(addContraction) {
				391	addContractionEntry(prevX, ce0, ce1, errorCode);
				392	}
				393	ce32 = (uint32_t)suffixes.getValue();
				394	if(suffix.length() == 1 && getCEsFromCE32(data, U_SENTINEL, ce32, errorCode)) {
				395	addContraction = TRUE;
				396	} else {
				397	addContractionEntry(x, Collation::NO_CE, 0, errorCode);
				398	addContraction = FALSE;
				399	}
				400	prevX = x;
				401	}
				402	if(addContraction) {
				403	addContractionEntry(prevX, ce0, ce1, errorCode);
				404	}
				405	if(U_FAILURE(errorCode)) { return FALSE; }
				406	// Note: There might not be any fast Latin contractions, but
				407	// we need to enter contraction handling anyway so that we can bail out
				408	// when there is a non-fast-Latin character following.
				409	// For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the
				410	// following umlaut and bail out, rather than return the difference of Y vs. u.
				411	ce0 = ((int64_t)Collation::NO_CE_PRIMARY << 32) \| CONTRACTION_FLAG \| contractionIndex;
				412	ce1 = 0;
				413	return TRUE;
				414	}
				415
				416	void
				417	CollationFastLatinBuilder::addContractionEntry(int32_t x, int64_t cce0, int64_t cce1,
				418	UErrorCode &errorCode) {
				419	contractionCEs.addElement(x, errorCode);
				420	contractionCEs.addElement(cce0, errorCode);
				421	contractionCEs.addElement(cce1, errorCode);
				422	addUniqueCE(cce0, errorCode);
				423	addUniqueCE(cce1, errorCode);
				424	}
				425
				426	void
				427	CollationFastLatinBuilder::addUniqueCE(int64_t ce, UErrorCode &errorCode) {
				428	if(U_FAILURE(errorCode)) { return; }
				429	if(ce == 0 \|\| (uint32_t)(ce >> 32) == Collation::NO_CE_PRIMARY) { return; }
				430	ce &= ~(int64_t)Collation::CASE_MASK; // blank out case bits
				431	int32_t i = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);
				432	if(i < 0) {
				433	uniqueCEs.insertElementAt(ce, ~i, errorCode);
				434	}
				435	}
				436
				437	uint32_t
				438	CollationFastLatinBuilder::getMiniCE(int64_t ce) const {
				439	ce &= ~(int64_t)Collation::CASE_MASK; // blank out case bits
				440	int32_t index = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);
				441	U_ASSERT(index >= 0);
				442	return miniCEs[index];
				443	}
				444
				445	UBool
				446	CollationFastLatinBuilder::encodeUniqueCEs(UErrorCode &errorCode) {
				447	if(U_FAILURE(errorCode)) { return FALSE; }
				448	uprv_free(miniCEs);
				449	miniCEs = (uint16_t )uprv_malloc(uniqueCEs.size() 2);
				450	if(miniCEs == NULL) {
				451	errorCode = U_MEMORY_ALLOCATION_ERROR;
				452	return FALSE;
				453	}
				454	int32_t group = 1;
				455	uint32_t lastGroupByte = result[group];
				456	// The lowest unique CE must be at least a secondary CE.
				457	U_ASSERT(((uint32_t)uniqueCEs.elementAti(0) >> 16) != 0);
				458	uint32_t prevPrimary = 0;
				459	uint32_t prevSecondary = 0;
				460	uint32_t pri = 0;
				461	uint32_t sec = 0;
				462	uint32_t ter = CollationFastLatin::COMMON_TER;
				463	for(int32_t i = 0; i < uniqueCEs.size(); ++i) {
				464	int64_t ce = uniqueCEs.elementAti(i);
				465	// Note: At least one of the p/s/t weights changes from one unique CE to the next.
				466	// (uniqueCEs does not store case bits.)
				467	uint32_t p = (uint32_t)(ce >> 32);
				468	if(p != prevPrimary) {
				469	uint32_t p1 = p >> 24;
				470	while(p1 > lastGroupByte) {
				471	U_ASSERT(pri <= CollationFastLatin::MAX_LONG);
				472	// Add the last "long primary" in or before the group
				473	// into the upper 9 bits of the group entry.
				474	result.setCharAt(group, (UChar)((pri << 4) \| lastGroupByte));
				475	if(++group < headerLength) { // group is 1-based
				476	lastGroupByte = result[group];
				477	} else {
				478	lastGroupByte = 0xff;
				479	break;
				480	}
				481	}
				482	if(p < firstShortPrimary) {
				483	if(pri == 0) {
				484	pri = CollationFastLatin::MIN_LONG;
				485	} else if(pri < CollationFastLatin::MAX_LONG) {
				486	pri += CollationFastLatin::LONG_INC;
				487	} else {
				488	#if DEBUG_COLLATION_FAST_LATIN_BUILDER
				489	printf("long-primary overflow for %08x\n", p);
				490	#endif
				491	miniCEs[i] = CollationFastLatin::BAIL_OUT;
				492	continue;
				493	}
				494	} else {
				495	if(pri < CollationFastLatin::MIN_SHORT) {
				496	pri = CollationFastLatin::MIN_SHORT;
				497	} else if(pri < (CollationFastLatin::MAX_SHORT - CollationFastLatin::SHORT_INC)) {
				498	// Reserve the highest primary weight for U+FFFF.
				499	pri += CollationFastLatin::SHORT_INC;
				500	} else {
				501	#if DEBUG_COLLATION_FAST_LATIN_BUILDER
				502	printf("short-primary overflow for %08x\n", p);
				503	#endif
				504	shortPrimaryOverflow = TRUE;
				505	miniCEs[i] = CollationFastLatin::BAIL_OUT;
				506	continue;
				507	}
				508	}
				509	prevPrimary = p;
				510	prevSecondary = Collation::COMMON_WEIGHT16;
				511	sec = CollationFastLatin::COMMON_SEC;
				512	ter = CollationFastLatin::COMMON_TER;
				513	}
				514	uint32_t lower32 = (uint32_t)ce;
				515	uint32_t s = lower32 >> 16;
				516	if(s != prevSecondary) {
				517	if(pri == 0) {
				518	if(sec == 0) {
				519	sec = CollationFastLatin::MIN_SEC_HIGH;
				520	} else if(sec < CollationFastLatin::MAX_SEC_HIGH) {
				521	sec += CollationFastLatin::SEC_INC;
				522	} else {
				523	miniCEs[i] = CollationFastLatin::BAIL_OUT;
				524	continue;
				525	}
				526	prevSecondary = s;
				527	ter = CollationFastLatin::COMMON_TER;
				528	} else if(s < Collation::COMMON_WEIGHT16) {
				529	if(sec == CollationFastLatin::COMMON_SEC) {
				530	sec = CollationFastLatin::MIN_SEC_BEFORE;
				531	} else if(sec < CollationFastLatin::MAX_SEC_BEFORE) {
				532	sec += CollationFastLatin::SEC_INC;
				533	} else {
				534	miniCEs[i] = CollationFastLatin::BAIL_OUT;
				535	continue;
				536	}
				537	} else if(s == Collation::COMMON_WEIGHT16) {
				538	sec = CollationFastLatin::COMMON_SEC;
				539	} else {
				540	if(sec < CollationFastLatin::MIN_SEC_AFTER) {
				541	sec = CollationFastLatin::MIN_SEC_AFTER;
				542	} else if(sec < CollationFastLatin::MAX_SEC_AFTER) {
				543	sec += CollationFastLatin::SEC_INC;
				544	} else {
				545	miniCEs[i] = CollationFastLatin::BAIL_OUT;
				546	continue;
				547	}
				548	}
				549	prevSecondary = s;
				550	ter = CollationFastLatin::COMMON_TER;
				551	}
				552	U_ASSERT((lower32 & Collation::CASE_MASK) == 0); // blanked out in uniqueCEs
				553	uint32_t t = lower32 & Collation::ONLY_TERTIARY_MASK;
				554	if(t > Collation::COMMON_WEIGHT16) {
				555	if(ter < CollationFastLatin::MAX_TER_AFTER) {
				556	++ter;
				557	} else {
				558	miniCEs[i] = CollationFastLatin::BAIL_OUT;
				559	continue;
				560	}
				561	}
				562	if(CollationFastLatin::MIN_LONG <= pri && pri <= CollationFastLatin::MAX_LONG) {
				563	U_ASSERT(sec == CollationFastLatin::COMMON_SEC);
				564	miniCEs[i] = (uint16_t)(pri \| ter);
				565	} else {
				566	miniCEs[i] = (uint16_t)(pri \| sec \| ter);
				567	}
				568	}
				569	#if DEBUG_COLLATION_FAST_LATIN_BUILDER
				570	printf("last mini primary: %04x\n", pri);
				571	#endif
				572	#if DEBUG_COLLATION_FAST_LATIN_BUILDER >= 2
				573	for(int32_t i = 0; i < uniqueCEs.size(); ++i) {
				574	int64_t ce = uniqueCEs.elementAti(i);
				575	printf("unique CE 0x%016lx -> 0x%04x\n", ce, miniCEs[i]);
				576	}
				577	#endif
				578	return U_SUCCESS(errorCode);
				579	}
				580
				581	UBool
				582	CollationFastLatinBuilder::encodeCharCEs(UErrorCode &errorCode) {
				583	if(U_FAILURE(errorCode)) { return FALSE; }
				584	int32_t miniCEsStart = result.length();
				585	for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) {
				586	result.append(0); // initialize to completely ignorable
				587	}
				588	int32_t indexBase = result.length();
				589	for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) {
				590	int64_t ce = charCEs[i][0];
				591	if(isContractionCharCE(ce)) { continue; } // defer contraction
				592	uint32_t miniCE = encodeTwoCEs(ce, charCEs[i][1]);
				593	if(miniCE > 0xffff) {
				594	// Note: There is a chance that this new expansion is the same as a previous one,
				595	// and if so, then we could reuse the other expansion.
				596	// However, that seems unlikely.
				597	int32_t expansionIndex = result.length() - indexBase;
				598	if(expansionIndex > (int32_t)CollationFastLatin::INDEX_MASK) {
				599	miniCE = CollationFastLatin::BAIL_OUT;
				600	} else {
				601	result.append((UChar)(miniCE >> 16)).append((UChar)miniCE);
				602	miniCE = CollationFastLatin::EXPANSION \| expansionIndex;
				603	}
				604	}
				605	result.setCharAt(miniCEsStart + i, (UChar)miniCE);
				606	}
				607	return U_SUCCESS(errorCode);
				608	}
				609
				610	UBool
				611	CollationFastLatinBuilder::encodeContractions(UErrorCode &errorCode) {
				612	// We encode all contraction lists so that the first word of a list
				613	// terminates the previous list, and we only need one additional terminator at the end.
				614	if(U_FAILURE(errorCode)) { return FALSE; }
				615	int32_t indexBase = headerLength + CollationFastLatin::NUM_FAST_CHARS;
				616	int32_t firstContractionIndex = result.length();
				617	for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) {
				618	int64_t ce = charCEs[i][0];
				619	if(!isContractionCharCE(ce)) { continue; }
				620	int32_t contractionIndex = result.length() - indexBase;
				621	if(contractionIndex > (int32_t)CollationFastLatin::INDEX_MASK) {
				622	result.setCharAt(headerLength + i, CollationFastLatin::BAIL_OUT);
				623	continue;
				624	}
				625	UBool firstTriple = TRUE;
				626	for(int32_t index = (int32_t)ce & 0x7fffffff;; index += 3) {
				627	int32_t x = contractionCEs.elementAti(index);
				628	if((uint32_t)x == CollationFastLatin::CONTR_CHAR_MASK && !firstTriple) { break; }
				629	int64_t cce0 = contractionCEs.elementAti(index + 1);
				630	int64_t cce1 = contractionCEs.elementAti(index + 2);
				631	uint32_t miniCE = encodeTwoCEs(cce0, cce1);
				632	if(miniCE == CollationFastLatin::BAIL_OUT) {
				633	result.append((UChar)(x \| (1 << CollationFastLatin::CONTR_LENGTH_SHIFT)));
				634	} else if(miniCE <= 0xffff) {
				635	result.append((UChar)(x \| (2 << CollationFastLatin::CONTR_LENGTH_SHIFT)));
				636	result.append((UChar)miniCE);
				637	} else {
				638	result.append((UChar)(x \| (3 << CollationFastLatin::CONTR_LENGTH_SHIFT)));
				639	result.append((UChar)(miniCE >> 16)).append((UChar)miniCE);
				640	}
				641	firstTriple = FALSE;
				642	}
				643	// Note: There is a chance that this new contraction list is the same as a previous one,
				644	// and if so, then we could truncate the result and reuse the other list.
				645	// However, that seems unlikely.
				646	result.setCharAt(headerLength + i,
				647	(UChar)(CollationFastLatin::CONTRACTION \| contractionIndex));
				648	}
				649	if(result.length() > firstContractionIndex) {
				650	// Terminate the last contraction list.
				651	result.append((UChar)CollationFastLatin::CONTR_CHAR_MASK);
				652	}
				653	if(result.isBogus()) {
				654	errorCode = U_MEMORY_ALLOCATION_ERROR;
				655	return FALSE;
				656	}
				657	#if DEBUG_COLLATION_FAST_LATIN_BUILDER
				658	printf("** fast Latin %d * 2 = %d bytes\n", result.length(), result.length() * 2);
				659	puts(" header & below-digit groups map");
				660	int32_t i = 0;
				661	for(; i < headerLength; ++i) {
				662	printf(" %04x", result[i]);
				663	}
				664	printf("\n char mini CEs");
				665	U_ASSERT(CollationFastLatin::NUM_FAST_CHARS % 16 == 0);
				666	for(; i < indexBase; i += 16) {
				667	UChar32 c = i - headerLength;
				668	if(c >= CollationFastLatin::LATIN_LIMIT) {
				669	c = CollationFastLatin::PUNCT_START + c - CollationFastLatin::LATIN_LIMIT;
				670	}
				671	printf("\n %04x:", c);
				672	for(int32_t j = 0; j < 16; ++j) {
				673	printf(" %04x", result[i + j]);
				674	}
				675	}
				676	printf("\n expansions & contractions");
				677	for(; i < result.length(); ++i) {
				678	if((i - indexBase) % 16 == 0) { puts(""); }
				679	printf(" %04x", result[i]);
				680	}
				681	puts("");
				682	#endif
				683	return TRUE;
				684	}
				685
				686	uint32_t
				687	CollationFastLatinBuilder::encodeTwoCEs(int64_t first, int64_t second) const {
				688	if(first == 0) {
				689	return 0; // completely ignorable
				690	}
				691	if(first == Collation::NO_CE) {
				692	return CollationFastLatin::BAIL_OUT;
				693	}
				694	U_ASSERT((uint32_t)(first >> 32) != Collation::NO_CE_PRIMARY);
				695
				696	uint32_t miniCE = getMiniCE(first);
				697	if(miniCE == CollationFastLatin::BAIL_OUT) { return miniCE; }
				698	if(miniCE >= CollationFastLatin::MIN_SHORT) {
				699	// Extract & copy the case bits.
				700	// Shift them from normal CE bits 15..14 to mini CE bits 4..3.
				701	uint32_t c = (((uint32_t)first & Collation::CASE_MASK) >> (14 - 3));
				702	// Only in mini CEs: Ignorable case bits = 0, lowercase = 1.
				703	c += CollationFastLatin::LOWER_CASE;
				704	miniCE \|= c;
				705	}
				706	if(second == 0) { return miniCE; }
				707
				708	uint32_t miniCE1 = getMiniCE(second);
				709	if(miniCE1 == CollationFastLatin::BAIL_OUT) { return miniCE1; }
				710
				711	uint32_t case1 = (uint32_t)second & Collation::CASE_MASK;
				712	if(miniCE >= CollationFastLatin::MIN_SHORT &&
				713	(miniCE & CollationFastLatin::SECONDARY_MASK) == CollationFastLatin::COMMON_SEC) {
				714	// Try to combine the two mini CEs into one.
				715	uint32_t sec1 = miniCE1 & CollationFastLatin::SECONDARY_MASK;
				716	uint32_t ter1 = miniCE1 & CollationFastLatin::TERTIARY_MASK;
				717	if(sec1 >= CollationFastLatin::MIN_SEC_HIGH && case1 == 0 &&
				718	ter1 == CollationFastLatin::COMMON_TER) {
				719	// sec1>=sec_high implies pri1==0.
				720	return (miniCE & ~CollationFastLatin::SECONDARY_MASK) \| sec1;
				721	}
				722	}
				723
				724	if(miniCE1 <= CollationFastLatin::SECONDARY_MASK \|\| CollationFastLatin::MIN_SHORT <= miniCE1) {
				725	// Secondary CE, or a CE with a short primary, copy the case bits.
				726	case1 = (case1 >> (14 - 3)) + CollationFastLatin::LOWER_CASE;
				727	miniCE1 \|= case1;
				728	}
				729	return (miniCE << 16) \| miniCE1;
				730	}
				731
				732	U_NAMESPACE_END
				733
				734	#endif // !UCONFIG_NO_COLLATION