Blame - source/common/normalizer2impl.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 9c00c1c818d12aff921fe62a82a14b6517ff527b [file] [log] [blame]

jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame^]	1	/*
				2	*******************************************************************************
				3	*
				4	* Copyright (C) 2009-2013, International Business Machines
				5	* Corporation and others. All Rights Reserved.
				6	*
				7	*******************************************************************************
				8	* file name: normalizer2impl.cpp
				9	* encoding: US-ASCII
				10	* tab size: 8 (not used)
				11	* indentation:4
				12	*
				13	* created on: 2009nov22
				14	* created by: Markus W. Scherer
				15	*/
				16
				17	#include "unicode/utypes.h"
				18
				19	#if !UCONFIG_NO_NORMALIZATION
				20
				21	#include "unicode/normalizer2.h"
				22	#include "unicode/udata.h"
				23	#include "unicode/ustring.h"
				24	#include "unicode/utf16.h"
				25	#include "cmemory.h"
				26	#include "mutex.h"
				27	#include "normalizer2impl.h"
				28	#include "putilimp.h"
				29	#include "uassert.h"
				30	#include "uset_imp.h"
				31	#include "utrie2.h"
				32	#include "uvector.h"
				33
				34	U_NAMESPACE_BEGIN
				35
				36	// ReorderingBuffer -------------------------------------------------------- ***
				37
				38	UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
				39	int32_t length=str.length();
				40	start=str.getBuffer(destCapacity);
				41	if(start==NULL) {
				42	// getBuffer() already did str.setToBogus()
				43	errorCode=U_MEMORY_ALLOCATION_ERROR;
				44	return FALSE;
				45	}
				46	limit=start+length;
				47	remainingCapacity=str.getCapacity()-length;
				48	reorderStart=start;
				49	if(start==limit) {
				50	lastCC=0;
				51	} else {
				52	setIterator();
				53	lastCC=previousCC();
				54	// Set reorderStart after the last code point with cc<=1 if there is one.
				55	if(lastCC>1) {
				56	while(previousCC()>1) {}
				57	}
				58	reorderStart=codePointLimit;
				59	}
				60	return TRUE;
				61	}
				62
				63	UBool ReorderingBuffer::equals(const UChar otherStart, const UChar otherLimit) const {
				64	int32_t length=(int32_t)(limit-start);
				65	return
				66	length==(int32_t)(otherLimit-otherStart) &&
				67	0==u_memcmp(start, otherStart, length);
				68	}
				69
				70	UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
				71	if(remainingCapacity<2 && !resize(2, errorCode)) {
				72	return FALSE;
				73	}
				74	if(lastCC<=cc \|\| cc==0) {
				75	limit[0]=U16_LEAD(c);
				76	limit[1]=U16_TRAIL(c);
				77	limit+=2;
				78	lastCC=cc;
				79	if(cc<=1) {
				80	reorderStart=limit;
				81	}
				82	} else {
				83	insert(c, cc);
				84	}
				85	remainingCapacity-=2;
				86	return TRUE;
				87	}
				88
				89	UBool ReorderingBuffer::append(const UChar *s, int32_t length,
				90	uint8_t leadCC, uint8_t trailCC,
				91	UErrorCode &errorCode) {
				92	if(length==0) {
				93	return TRUE;
				94	}
				95	if(remainingCapacity<length && !resize(length, errorCode)) {
				96	return FALSE;
				97	}
				98	remainingCapacity-=length;
				99	if(lastCC<=leadCC \|\| leadCC==0) {
				100	if(trailCC<=1) {
				101	reorderStart=limit+length;
				102	} else if(leadCC<=1) {
				103	reorderStart=limit+1; // Ok if not a code point boundary.
				104	}
				105	const UChar *sLimit=s+length;
				106	do { limit++=s++; } while(s!=sLimit);
				107	lastCC=trailCC;
				108	} else {
				109	int32_t i=0;
				110	UChar32 c;
				111	U16_NEXT(s, i, length, c);
				112	insert(c, leadCC); // insert first code point
				113	while(i<length) {
				114	U16_NEXT(s, i, length, c);
				115	if(i<length) {
				116	// s must be in NFD, otherwise we need to use getCC().
				117	leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
				118	} else {
				119	leadCC=trailCC;
				120	}
				121	append(c, leadCC, errorCode);
				122	}
				123	}
				124	return TRUE;
				125	}
				126
				127	UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
				128	int32_t cpLength=U16_LENGTH(c);
				129	if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
				130	return FALSE;
				131	}
				132	remainingCapacity-=cpLength;
				133	if(cpLength==1) {
				134	*limit++=(UChar)c;
				135	} else {
				136	limit[0]=U16_LEAD(c);
				137	limit[1]=U16_TRAIL(c);
				138	limit+=2;
				139	}
				140	lastCC=0;
				141	reorderStart=limit;
				142	return TRUE;
				143	}
				144
				145	UBool ReorderingBuffer::appendZeroCC(const UChar s, const UChar sLimit, UErrorCode &errorCode) {
				146	if(s==sLimit) {
				147	return TRUE;
				148	}
				149	int32_t length=(int32_t)(sLimit-s);
				150	if(remainingCapacity<length && !resize(length, errorCode)) {
				151	return FALSE;
				152	}
				153	u_memcpy(limit, s, length);
				154	limit+=length;
				155	remainingCapacity-=length;
				156	lastCC=0;
				157	reorderStart=limit;
				158	return TRUE;
				159	}
				160
				161	void ReorderingBuffer::remove() {
				162	reorderStart=limit=start;
				163	remainingCapacity=str.getCapacity();
				164	lastCC=0;
				165	}
				166
				167	void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
				168	if(suffixLength<(limit-start)) {
				169	limit-=suffixLength;
				170	remainingCapacity+=suffixLength;
				171	} else {
				172	limit=start;
				173	remainingCapacity=str.getCapacity();
				174	}
				175	lastCC=0;
				176	reorderStart=limit;
				177	}
				178
				179	UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
				180	int32_t reorderStartIndex=(int32_t)(reorderStart-start);
				181	int32_t length=(int32_t)(limit-start);
				182	str.releaseBuffer(length);
				183	int32_t newCapacity=length+appendLength;
				184	int32_t doubleCapacity=2*str.getCapacity();
				185	if(newCapacity<doubleCapacity) {
				186	newCapacity=doubleCapacity;
				187	}
				188	if(newCapacity<256) {
				189	newCapacity=256;
				190	}
				191	start=str.getBuffer(newCapacity);
				192	if(start==NULL) {
				193	// getBuffer() already did str.setToBogus()
				194	errorCode=U_MEMORY_ALLOCATION_ERROR;
				195	return FALSE;
				196	}
				197	reorderStart=start+reorderStartIndex;
				198	limit=start+length;
				199	remainingCapacity=str.getCapacity()-length;
				200	return TRUE;
				201	}
				202
				203	void ReorderingBuffer::skipPrevious() {
				204	codePointLimit=codePointStart;
				205	UChar c=*--codePointStart;
				206	if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
				207	--codePointStart;
				208	}
				209	}
				210
				211	uint8_t ReorderingBuffer::previousCC() {
				212	codePointLimit=codePointStart;
				213	if(reorderStart>=codePointStart) {
				214	return 0;
				215	}
				216	UChar32 c=*--codePointStart;
				217	if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
				218	return 0;
				219	}
				220
				221	UChar c2;
				222	if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
				223	--codePointStart;
				224	c=U16_GET_SUPPLEMENTARY(c2, c);
				225	}
				226	return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
				227	}
				228
				229	// Inserts c somewhere before the last character.
				230	// Requires 0<cc<lastCC which implies reorderStart<limit.
				231	void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
				232	for(setIterator(), skipPrevious(); previousCC()>cc;) {}
				233	// insert c at codePointLimit, after the character with prevCC<=cc
				234	UChar *q=limit;
				235	UChar *r=limit+=U16_LENGTH(c);
				236	do {
				237	--r=--q;
				238	} while(codePointLimit!=q);
				239	writeCodePoint(q, c);
				240	if(cc<=1) {
				241	reorderStart=r;
				242	}
				243	}
				244
				245	// Normalizer2Impl --------------------------------------------------------- ***
				246
				247	struct CanonIterData : public UMemory {
				248	CanonIterData(UErrorCode &errorCode);
				249	~CanonIterData();
				250	void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
				251	UTrie2 *trie;
				252	UVector canonStartSets; // contains UnicodeSet *
				253	};
				254
				255	Normalizer2Impl::~Normalizer2Impl() {
				256	udata_close(memory);
				257	utrie2_close(normTrie);
				258	delete fCanonIterData;
				259	}
				260
				261	UBool U_CALLCONV
				262	Normalizer2Impl::isAcceptable(void *context,
				263	const char * /* type /, const char /name/,
				264	const UDataInfo *pInfo) {
				265	if(
				266	pInfo->size>=20 &&
				267	pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
				268	pInfo->charsetFamily==U_CHARSET_FAMILY &&
				269	pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
				270	pInfo->dataFormat[1]==0x72 &&
				271	pInfo->dataFormat[2]==0x6d &&
				272	pInfo->dataFormat[3]==0x32 &&
				273	pInfo->formatVersion[0]==2
				274	) {
				275	Normalizer2Impl me=(Normalizer2Impl )context;
				276	uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
				277	return TRUE;
				278	} else {
				279	return FALSE;
				280	}
				281	}
				282
				283	void
				284	Normalizer2Impl::load(const char packageName, const char name, UErrorCode &errorCode) {
				285	if(U_FAILURE(errorCode)) {
				286	return;
				287	}
				288	memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
				289	if(U_FAILURE(errorCode)) {
				290	return;
				291	}
				292	const uint8_t inBytes=(const uint8_t )udata_getMemory(memory);
				293	const int32_t inIndexes=(const int32_t )inBytes;
				294	int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
				295	if(indexesLength<=IX_MIN_MAYBE_YES) {
				296	errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes.
				297	return;
				298	}
				299
				300	minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
				301	minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
				302
				303	minYesNo=inIndexes[IX_MIN_YES_NO];
				304	minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
				305	minNoNo=inIndexes[IX_MIN_NO_NO];
				306	limitNoNo=inIndexes[IX_LIMIT_NO_NO];
				307	minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
				308
				309	int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
				310	int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
				311	normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
				312	inBytes+offset, nextOffset-offset, NULL,
				313	&errorCode);
				314	if(U_FAILURE(errorCode)) {
				315	return;
				316	}
				317
				318	offset=nextOffset;
				319	nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
				320	maybeYesCompositions=(const uint16_t *)(inBytes+offset);
				321	extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
				322
				323	// smallFCD: new in formatVersion 2
				324	offset=nextOffset;
				325	smallFCD=inBytes+offset;
				326
				327	// Build tccc180[].
				328	// gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
				329	uint8_t bits=0;
				330	for(UChar c=0; c<0x180; bits>>=1) {
				331	if((c&0xff)==0) {
				332	bits=smallFCD[c>>8]; // one byte per 0x100 code points
				333	}
				334	if(bits&1) {
				335	for(int i=0; i<0x20; ++i, ++c) {
				336	tccc180[c]=(uint8_t)getFCD16FromNormData(c);
				337	}
				338	} else {
				339	uprv_memset(tccc180+c, 0, 0x20);
				340	c+=0x20;
				341	}
				342	}
				343	}
				344
				345	uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar cpStart, const UChar cpLimit) const {
				346	UChar32 c;
				347	if(cpStart==(cpLimit-1)) {
				348	c=*cpStart;
				349	} else {
				350	c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
				351	}
				352	uint16_t prevNorm16=getNorm16(c);
				353	if(prevNorm16<=minYesNo) {
				354	return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
				355	} else {
				356	return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo
				357	}
				358	}
				359
				360	U_CDECL_BEGIN
				361
				362	static UBool U_CALLCONV
				363	enumPropertyStartsRange(const void context, UChar32 start, UChar32 /end/, uint32_t /value*/) {
				364	/* add the start code point to the USet */
				365	const USetAdder sa=(const USetAdder )context;
				366	sa->add(sa->set, start);
				367	return TRUE;
				368	}
				369
				370	static uint32_t U_CALLCONV
				371	segmentStarterMapper(const void * /context/, uint32_t value) {
				372	return value&CANON_NOT_SEGMENT_STARTER;
				373	}
				374
				375	U_CDECL_END
				376
				377	void
				378	Normalizer2Impl::addPropertyStarts(const USetAdder sa, UErrorCode & /errorCode*/) const {
				379	/* add the start code point of each same-value range of each trie */
				380	utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);
				381
				382	/* add Hangul LV syllables and LV+1 because of skippables */
				383	for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
				384	sa->add(sa->set, c);
				385	sa->add(sa->set, c+1);
				386	}
				387	sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
				388	}
				389
				390	void
				391	Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
				392	/* add the start code point of each same-value range of the canonical iterator data trie */
				393	if(ensureCanonIterData(errorCode)) {
				394	// currently only used for the SEGMENT_STARTER property
				395	utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
				396	}
				397	}
				398
				399	const UChar *
				400	Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
				401	UChar32 minNeedDataCP,
				402	ReorderingBuffer *buffer,
				403	UErrorCode &errorCode) const {
				404	// Make some effort to support NUL-terminated strings reasonably.
				405	// Take the part of the fast quick check loop that does not look up
				406	// data and check the first part of the string.
				407	// After this prefix, determine the string length to simplify the rest
				408	// of the code.
				409	const UChar *prevSrc=src;
				410	UChar c;
				411	while((c=*src++)<minNeedDataCP && c!=0) {}
				412	// Back out the last character for full processing.
				413	// Copy this prefix.
				414	if(--src!=prevSrc) {
				415	if(buffer!=NULL) {
				416	buffer->appendZeroCC(prevSrc, src, errorCode);
				417	}
				418	}
				419	return src;
				420	}
				421
				422	// Dual functionality:
				423	// buffer!=NULL: normalize
				424	// buffer==NULL: isNormalized/spanQuickCheckYes
				425	const UChar *
				426	Normalizer2Impl::decompose(const UChar src, const UChar limit,
				427	ReorderingBuffer *buffer,
				428	UErrorCode &errorCode) const {
				429	UChar32 minNoCP=minDecompNoCP;
				430	if(limit==NULL) {
				431	src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
				432	if(U_FAILURE(errorCode)) {
				433	return src;
				434	}
				435	limit=u_strchr(src, 0);
				436	}
				437
				438	const UChar *prevSrc;
				439	UChar32 c=0;
				440	uint16_t norm16=0;
				441
				442	// only for quick check
				443	const UChar *prevBoundary=src;
				444	uint8_t prevCC=0;
				445
				446	for(;;) {
				447	// count code units below the minimum or with irrelevant data for the quick check
				448	for(prevSrc=src; src!=limit;) {
				449	if( (c=*src)<minNoCP \|\|
				450	isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
				451	) {
				452	++src;
				453	} else if(!U16_IS_SURROGATE(c)) {
				454	break;
				455	} else {
				456	UChar c2;
				457	if(U16_IS_SURROGATE_LEAD(c)) {
				458	if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
				459	c=U16_GET_SUPPLEMENTARY(c, c2);
				460	}
				461	} else /* trail surrogate */ {
				462	if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
				463	--src;
				464	c=U16_GET_SUPPLEMENTARY(c2, c);
				465	}
				466	}
				467	if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
				468	src+=U16_LENGTH(c);
				469	} else {
				470	break;
				471	}
				472	}
				473	}
				474	// copy these code units all at once
				475	if(src!=prevSrc) {
				476	if(buffer!=NULL) {
				477	if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
				478	break;
				479	}
				480	} else {
				481	prevCC=0;
				482	prevBoundary=src;
				483	}
				484	}
				485	if(src==limit) {
				486	break;
				487	}
				488
				489	// Check one above-minimum, relevant code point.
				490	src+=U16_LENGTH(c);
				491	if(buffer!=NULL) {
				492	if(!decompose(c, norm16, *buffer, errorCode)) {
				493	break;
				494	}
				495	} else {
				496	if(isDecompYes(norm16)) {
				497	uint8_t cc=getCCFromYesOrMaybe(norm16);
				498	if(prevCC<=cc \|\| cc==0) {
				499	prevCC=cc;
				500	if(cc<=1) {
				501	prevBoundary=src;
				502	}
				503	continue;
				504	}
				505	}
				506	return prevBoundary; // "no" or cc out of order
				507	}
				508	}
				509	return src;
				510	}
				511
				512	// Decompose a short piece of text which is likely to contain characters that
				513	// fail the quick check loop and/or where the quick check loop's overhead
				514	// is unlikely to be amortized.
				515	// Called by the compose() and makeFCD() implementations.
				516	UBool Normalizer2Impl::decomposeShort(const UChar src, const UChar limit,
				517	ReorderingBuffer &buffer,
				518	UErrorCode &errorCode) const {
				519	while(src<limit) {
				520	UChar32 c;
				521	uint16_t norm16;
				522	UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
				523	if(!decompose(c, norm16, buffer, errorCode)) {
				524	return FALSE;
				525	}
				526	}
				527	return TRUE;
				528	}
				529
				530	UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
				531	ReorderingBuffer &buffer,
				532	UErrorCode &errorCode) const {
				533	// Only loops for 1:1 algorithmic mappings.
				534	for(;;) {
				535	// get the decomposition and the lead and trail cc's
				536	if(isDecompYes(norm16)) {
				537	// c does not decompose
				538	return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
				539	} else if(isHangul(norm16)) {
				540	// Hangul syllable: decompose algorithmically
				541	UChar jamos[3];
				542	return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
				543	} else if(isDecompNoAlgorithmic(norm16)) {
				544	c=mapAlgorithmic(c, norm16);
				545	norm16=getNorm16(c);
				546	} else {
				547	// c decomposes, get everything from the variable-length extra data
				548	const uint16_t *mapping=getMapping(norm16);
				549	uint16_t firstUnit=*mapping;
				550	int32_t length=firstUnit&MAPPING_LENGTH_MASK;
				551	uint8_t leadCC, trailCC;
				552	trailCC=(uint8_t)(firstUnit>>8);
				553	if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
				554	leadCC=(uint8_t)(*(mapping-1)>>8);
				555	} else {
				556	leadCC=0;
				557	}
				558	return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
				559	}
				560	}
				561	}
				562
				563	const UChar *
				564	Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
				565	const UChar *decomp=NULL;
				566	uint16_t norm16;
				567	for(;;) {
				568	if(c<minDecompNoCP \|\| isDecompYes(norm16=getNorm16(c))) {
				569	// c does not decompose
				570	return decomp;
				571	} else if(isHangul(norm16)) {
				572	// Hangul syllable: decompose algorithmically
				573	length=Hangul::decompose(c, buffer);
				574	return buffer;
				575	} else if(isDecompNoAlgorithmic(norm16)) {
				576	c=mapAlgorithmic(c, norm16);
				577	decomp=buffer;
				578	length=0;
				579	U16_APPEND_UNSAFE(buffer, length, c);
				580	} else {
				581	// c decomposes, get everything from the variable-length extra data
				582	const uint16_t *mapping=getMapping(norm16);
				583	length=*mapping&MAPPING_LENGTH_MASK;
				584	return (const UChar *)mapping+1;
				585	}
				586	}
				587	}
				588
				589	// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
				590	// so that a raw mapping fits that consists of one unit ("rm0")
				591	// plus all but the first two code units of the normal mapping.
				592	// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
				593	const UChar *
				594	Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
				595	// We do not loop in this method because an algorithmic mapping itself
				596	// becomes a final result rather than having to be decomposed recursively.
				597	uint16_t norm16;
				598	if(c<minDecompNoCP \|\| isDecompYes(norm16=getNorm16(c))) {
				599	// c does not decompose
				600	return NULL;
				601	} else if(isHangul(norm16)) {
				602	// Hangul syllable: decompose algorithmically
				603	Hangul::getRawDecomposition(c, buffer);
				604	length=2;
				605	return buffer;
				606	} else if(isDecompNoAlgorithmic(norm16)) {
				607	c=mapAlgorithmic(c, norm16);
				608	length=0;
				609	U16_APPEND_UNSAFE(buffer, length, c);
				610	return buffer;
				611	} else {
				612	// c decomposes, get everything from the variable-length extra data
				613	const uint16_t *mapping=getMapping(norm16);
				614	uint16_t firstUnit=*mapping;
				615	int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping
				616	if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
				617	// Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
				618	// Bit 7=MAPPING_HAS_CCC_LCCC_WORD
				619	const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
				620	uint16_t rm0=*rawMapping;
				621	if(rm0<=MAPPING_LENGTH_MASK) {
				622	length=rm0;
				623	return (const UChar *)rawMapping-rm0;
				624	} else {
				625	// Copy the normal mapping and replace its first two code units with rm0.
				626	buffer[0]=(UChar)rm0;
				627	u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
				628	length=mLength-1;
				629	return buffer;
				630	}
				631	} else {
				632	length=mLength;
				633	return (const UChar *)mapping+1;
				634	}
				635	}
				636	}
				637
				638	void Normalizer2Impl::decomposeAndAppend(const UChar src, const UChar limit,
				639	UBool doDecompose,
				640	UnicodeString &safeMiddle,
				641	ReorderingBuffer &buffer,
				642	UErrorCode &errorCode) const {
				643	buffer.copyReorderableSuffixTo(safeMiddle);
				644	if(doDecompose) {
				645	decompose(src, limit, &buffer, errorCode);
				646	return;
				647	}
				648	// Just merge the strings at the boundary.
				649	ForwardUTrie2StringIterator iter(normTrie, src, limit);
				650	uint8_t firstCC, prevCC, cc;
				651	firstCC=prevCC=cc=getCC(iter.next16());
				652	while(cc!=0) {
				653	prevCC=cc;
				654	cc=getCC(iter.next16());
				655	};
				656	if(limit==NULL) { // appendZeroCC() needs limit!=NULL
				657	limit=u_strchr(iter.codePointStart, 0);
				658	}
				659
				660	if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
				661	buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
				662	}
				663	}
				664
				665	// Note: hasDecompBoundary() could be implemented as aliases to
				666	// hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
				667	// at the cost of building the FCD trie for a decomposition normalizer.
				668	UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
				669	for(;;) {
				670	if(c<minDecompNoCP) {
				671	return TRUE;
				672	}
				673	uint16_t norm16=getNorm16(c);
				674	if(isHangul(norm16) \|\| isDecompYesAndZeroCC(norm16)) {
				675	return TRUE;
				676	} else if(norm16>MIN_NORMAL_MAYBE_YES) {
				677	return FALSE; // ccc!=0
				678	} else if(isDecompNoAlgorithmic(norm16)) {
				679	c=mapAlgorithmic(c, norm16);
				680	} else {
				681	// c decomposes, get everything from the variable-length extra data
				682	const uint16_t *mapping=getMapping(norm16);
				683	uint16_t firstUnit=*mapping;
				684	if((firstUnit&MAPPING_LENGTH_MASK)==0) {
				685	return FALSE;
				686	}
				687	if(!before) {
				688	// decomp after-boundary: same as hasFCDBoundaryAfter(),
				689	// fcd16<=1 \|\| trailCC==0
				690	if(firstUnit>0x1ff) {
				691	return FALSE; // trailCC>1
				692	}
				693	if(firstUnit<=0xff) {
				694	return TRUE; // trailCC==0
				695	}
				696	// if(trailCC==1) test leadCC==0, same as checking for before-boundary
				697	}
				698	// TRUE if leadCC==0 (hasFCDBoundaryBefore())
				699	return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 \|\| (*(mapping-1)&0xff00)==0;
				700	}
				701	}
				702	}
				703
				704	/*
				705	* Finds the recomposition result for
				706	* a forward-combining "lead" character,
				707	* specified with a pointer to its compositions list,
				708	* and a backward-combining "trail" character.
				709	*
				710	* If the lead and trail characters combine, then this function returns
				711	* the following "compositeAndFwd" value:
				712	* Bits 21..1 composite character
				713	* Bit 0 set if the composite is a forward-combining starter
				714	* otherwise it returns -1.
				715	*
				716	* The compositions list has (trail, compositeAndFwd) pair entries,
				717	* encoded as either pairs or triples of 16-bit units.
				718	* The last entry has the high bit of its first unit set.
				719	*
				720	* The list is sorted by ascending trail characters (there are no duplicates).
				721	* A linear search is used.
				722	*
				723	* See normalizer2impl.h for a more detailed description
				724	* of the compositions list format.
				725	*/
				726	int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
				727	uint16_t key1, firstUnit;
				728	if(trail<COMP_1_TRAIL_LIMIT) {
				729	// trail character is 0..33FF
				730	// result entry may have 2 or 3 units
				731	key1=(uint16_t)(trail<<1);
				732	while(key1>(firstUnit=*list)) {
				733	list+=2+(firstUnit&COMP_1_TRIPLE);
				734	}
				735	if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
				736	if(firstUnit&COMP_1_TRIPLE) {
				737	return ((int32_t)list[1]<<16)\|list[2];
				738	} else {
				739	return list[1];
				740	}
				741	}
				742	} else {
				743	// trail character is 3400..10FFFF
				744	// result entry has 3 units
				745	key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
				746	(((trail>>COMP_1_TRAIL_SHIFT))&
				747	~COMP_1_TRIPLE));
				748	uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
				749	uint16_t secondUnit;
				750	for(;;) {
				751	if(key1>(firstUnit=*list)) {
				752	list+=2+(firstUnit&COMP_1_TRIPLE);
				753	} else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
				754	if(key2>(secondUnit=list[1])) {
				755	if(firstUnit&COMP_1_LAST_TUPLE) {
				756	break;
				757	} else {
				758	list+=3;
				759	}
				760	} else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
				761	return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)\|list[2];
				762	} else {
				763	break;
				764	}
				765	} else {
				766	break;
				767	}
				768	}
				769	}
				770	return -1;
				771	}
				772
				773	/**
				774	* @param list some character's compositions list
				775	* @param set recursively receives the composites from these compositions
				776	*/
				777	void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
				778	uint16_t firstUnit;
				779	int32_t compositeAndFwd;
				780	do {
				781	firstUnit=*list;
				782	if((firstUnit&COMP_1_TRIPLE)==0) {
				783	compositeAndFwd=list[1];
				784	list+=2;
				785	} else {
				786	compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)\|list[2];
				787	list+=3;
				788	}
				789	UChar32 composite=compositeAndFwd>>1;
				790	if((compositeAndFwd&1)!=0) {
				791	addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
				792	}
				793	set.add(composite);
				794	} while((firstUnit&COMP_1_LAST_TUPLE)==0);
				795	}
				796
				797	/*
				798	* Recomposes the buffer text starting at recomposeStartIndex
				799	* (which is in NFD - decomposed and canonically ordered),
				800	* and truncates the buffer contents.
				801	*
				802	* Note that recomposition never lengthens the text:
				803	* Any character consists of either one or two code units;
				804	* a composition may contain at most one more code unit than the original starter,
				805	* while the combining mark that is removed has at least one code unit.
				806	*/
				807	void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
				808	UBool onlyContiguous) const {
				809	UChar *p=buffer.getStart()+recomposeStartIndex;
				810	UChar *limit=buffer.getLimit();
				811	if(p==limit) {
				812	return;
				813	}
				814
				815	UChar starter, pRemove, q, r;
				816	const uint16_t *compositionsList;
				817	UChar32 c, compositeAndFwd;
				818	uint16_t norm16;
				819	uint8_t cc, prevCC;
				820	UBool starterIsSupplementary;
				821
				822	// Some of the following variables are not used until we have a forward-combining starter
				823	// and are only initialized now to avoid compiler warnings.
				824	compositionsList=NULL; // used as indicator for whether we have a forward-combining starter
				825	starter=NULL;
				826	starterIsSupplementary=FALSE;
				827	prevCC=0;
				828
				829	for(;;) {
				830	UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
				831	cc=getCCFromYesOrMaybe(norm16);
				832	if( // this character combines backward and
				833	isMaybe(norm16) &&
				834	// we have seen a starter that combines forward and
				835	compositionsList!=NULL &&
				836	// the backward-combining character is not blocked
				837	(prevCC<cc \|\| prevCC==0)
				838	) {
				839	if(isJamoVT(norm16)) {
				840	// c is a Jamo V/T, see if we can compose it with the previous character.
				841	if(c<Hangul::JAMO_T_BASE) {
				842	// c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
				843	UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
				844	if(prev<Hangul::JAMO_L_COUNT) {
				845	pRemove=p-1;
				846	UChar syllable=(UChar)
				847	(Hangul::HANGUL_BASE+
				848	(prevHangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))
				849	Hangul::JAMO_T_COUNT);
				850	UChar t;
				851	if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
				852	++p;
				853	syllable+=t; // The next character was a Jamo T.
				854	}
				855	*starter=syllable;
				856	// remove the Jamo V/T
				857	q=pRemove;
				858	r=p;
				859	while(r<limit) {
				860	q++=r++;
				861	}
				862	limit=q;
				863	p=pRemove;
				864	}
				865	}
				866	/*
				867	* No "else" for Jamo T:
				868	* Since the input is in NFD, there are no Hangul LV syllables that
				869	* a Jamo T could combine with.
				870	* All Jamo Ts are combined above when handling Jamo Vs.
				871	*/
				872	if(p==limit) {
				873	break;
				874	}
				875	compositionsList=NULL;
				876	continue;
				877	} else if((compositeAndFwd=combine(compositionsList, c))>=0) {
				878	// The starter and the combining mark (c) do combine.
				879	UChar32 composite=compositeAndFwd>>1;
				880
				881	// Replace the starter with the composite, remove the combining mark.
				882	pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark
				883	if(starterIsSupplementary) {
				884	if(U_IS_SUPPLEMENTARY(composite)) {
				885	// both are supplementary
				886	starter[0]=U16_LEAD(composite);
				887	starter[1]=U16_TRAIL(composite);
				888	} else {
				889	*starter=(UChar)composite;
				890	// The composite is shorter than the starter,
				891	// move the intermediate characters forward one.
				892	starterIsSupplementary=FALSE;
				893	q=starter+1;
				894	r=q+1;
				895	while(r<pRemove) {
				896	q++=r++;
				897	}
				898	--pRemove;
				899	}
				900	} else if(U_IS_SUPPLEMENTARY(composite)) {
				901	// The composite is longer than the starter,
				902	// move the intermediate characters back one.
				903	starterIsSupplementary=TRUE;
				904	++starter; // temporarily increment for the loop boundary
				905	q=pRemove;
				906	r=++pRemove;
				907	while(starter<q) {
				908	--r=--q;
				909	}
				910	*starter=U16_TRAIL(composite);
				911	*--starter=U16_LEAD(composite); // undo the temporary increment
				912	} else {
				913	// both are on the BMP
				914	*starter=(UChar)composite;
				915	}
				916
				917	/* remove the combining mark by moving the following text over it */
				918	if(pRemove<p) {
				919	q=pRemove;
				920	r=p;
				921	while(r<limit) {
				922	q++=r++;
				923	}
				924	limit=q;
				925	p=pRemove;
				926	}
				927	// Keep prevCC because we removed the combining mark.
				928
				929	if(p==limit) {
				930	break;
				931	}
				932	// Is the composite a starter that combines forward?
				933	if(compositeAndFwd&1) {
				934	compositionsList=
				935	getCompositionsListForComposite(getNorm16(composite));
				936	} else {
				937	compositionsList=NULL;
				938	}
				939
				940	// We combined; continue with looking for compositions.
				941	continue;
				942	}
				943	}
				944
				945	// no combination this time
				946	prevCC=cc;
				947	if(p==limit) {
				948	break;
				949	}
				950
				951	// If c did not combine, then check if it is a starter.
				952	if(cc==0) {
				953	// Found a new starter.
				954	if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
				955	// It may combine with something, prepare for it.
				956	if(U_IS_BMP(c)) {
				957	starterIsSupplementary=FALSE;
				958	starter=p-1;
				959	} else {
				960	starterIsSupplementary=TRUE;
				961	starter=p-2;
				962	}
				963	}
				964	} else if(onlyContiguous) {
				965	// FCC: no discontiguous compositions; any intervening character blocks.
				966	compositionsList=NULL;
				967	}
				968	}
				969	buffer.setReorderingLimit(limit);
				970	}
				971
				972	UChar32
				973	Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
				974	uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0
				975	const uint16_t *list;
				976	if(isInert(norm16)) {
				977	return U_SENTINEL;
				978	} else if(norm16<minYesNoMappingsOnly) {
				979	if(isJamoL(norm16)) {
				980	b-=Hangul::JAMO_V_BASE;
				981	if(0<=b && b<Hangul::JAMO_V_COUNT) {
				982	return
				983	(Hangul::HANGUL_BASE+
				984	((a-Hangul::JAMO_L_BASE)Hangul::JAMO_V_COUNT+b)
				985	Hangul::JAMO_T_COUNT);
				986	} else {
				987	return U_SENTINEL;
				988	}
				989	} else if(isHangul(norm16)) {
				990	b-=Hangul::JAMO_T_BASE;
				991	if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) { // not b==0!
				992	return a+b;
				993	} else {
				994	return U_SENTINEL;
				995	}
				996	} else {
				997	// 'a' has a compositions list in extraData
				998	list=extraData+norm16;
				999	if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list
				1000	list+= // mapping pointer
				1001	1+ // +1 to skip the first unit with the mapping lenth
				1002	(*list&MAPPING_LENGTH_MASK); // + mapping length
				1003	}
				1004	}
				1005	} else if(norm16<minMaybeYes \|\| MIN_NORMAL_MAYBE_YES<=norm16) {
				1006	return U_SENTINEL;
				1007	} else {
				1008	list=maybeYesCompositions+norm16-minMaybeYes;
				1009	}
				1010	if(b<0 \|\| 0x10ffff<b) { // combine(list, b) requires a valid code point b
				1011	return U_SENTINEL;
				1012	}
				1013	#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
				1014	return combine(list, b)>>1;
				1015	#else
				1016	int32_t compositeAndFwd=combine(list, b);
				1017	return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
				1018	#endif
				1019	}
				1020
				1021	// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
				1022	// doCompose: normalize
				1023	// !doCompose: isNormalized (buffer must be empty and initialized)
				1024	UBool
				1025	Normalizer2Impl::compose(const UChar src, const UChar limit,
				1026	UBool onlyContiguous,
				1027	UBool doCompose,
				1028	ReorderingBuffer &buffer,
				1029	UErrorCode &errorCode) const {
				1030	/*
				1031	* prevBoundary points to the last character before the current one
				1032	* that has a composition boundary before it with ccc==0 and quick check "yes".
				1033	* Keeping track of prevBoundary saves us looking for a composition boundary
				1034	* when we find a "no" or "maybe".
				1035	*
				1036	* When we back out from prevSrc back to prevBoundary,
				1037	* then we also remove those same characters (which had been simply copied
				1038	* or canonically-order-inserted) from the ReorderingBuffer.
				1039	* Therefore, at all times, the [prevBoundary..prevSrc[ source units
				1040	* must correspond 1:1 to destination units at the end of the destination buffer.
				1041	*/
				1042	const UChar *prevBoundary=src;
				1043	UChar32 minNoMaybeCP=minCompNoMaybeCP;
				1044	if(limit==NULL) {
				1045	src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
				1046	doCompose ? &buffer : NULL,
				1047	errorCode);
				1048	if(U_FAILURE(errorCode)) {
				1049	return FALSE;
				1050	}
				1051	if(prevBoundary<src) {
				1052	// Set prevBoundary to the last character in the prefix.
				1053	prevBoundary=src-1;
				1054	}
				1055	limit=u_strchr(src, 0);
				1056	}
				1057
				1058	const UChar *prevSrc;
				1059	UChar32 c=0;
				1060	uint16_t norm16=0;
				1061
				1062	// only for isNormalized
				1063	uint8_t prevCC=0;
				1064
				1065	for(;;) {
				1066	// count code units below the minimum or with irrelevant data for the quick check
				1067	for(prevSrc=src; src!=limit;) {
				1068	if( (c=*src)<minNoMaybeCP \|\|
				1069	isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
				1070	) {
				1071	++src;
				1072	} else if(!U16_IS_SURROGATE(c)) {
				1073	break;
				1074	} else {
				1075	UChar c2;
				1076	if(U16_IS_SURROGATE_LEAD(c)) {
				1077	if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
				1078	c=U16_GET_SUPPLEMENTARY(c, c2);
				1079	}
				1080	} else /* trail surrogate */ {
				1081	if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
				1082	--src;
				1083	c=U16_GET_SUPPLEMENTARY(c2, c);
				1084	}
				1085	}
				1086	if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
				1087	src+=U16_LENGTH(c);
				1088	} else {
				1089	break;
				1090	}
				1091	}
				1092	}
				1093	// copy these code units all at once
				1094	if(src!=prevSrc) {
				1095	if(doCompose) {
				1096	if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
				1097	break;
				1098	}
				1099	} else {
				1100	prevCC=0;
				1101	}
				1102	if(src==limit) {
				1103	break;
				1104	}
				1105	// Set prevBoundary to the last character in the quick check loop.
				1106	prevBoundary=src-1;
				1107	if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
				1108	U16_IS_LEAD(*(prevBoundary-1))
				1109	) {
				1110	--prevBoundary;
				1111	}
				1112	// The start of the current character (c).
				1113	prevSrc=src;
				1114	} else if(src==limit) {
				1115	break;
				1116	}
				1117
				1118	src+=U16_LENGTH(c);
				1119	/*
				1120	* isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
				1121	* c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
				1122	* or has ccc!=0.
				1123	* Check for Jamo V/T, then for regular characters.
				1124	* c is not a Hangul syllable or Jamo L because those have "yes" properties.
				1125	*/
				1126	if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
				1127	UChar prev=*(prevSrc-1);
				1128	UBool needToDecompose=FALSE;
				1129	if(c<Hangul::JAMO_T_BASE) {
				1130	// c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
				1131	prev=(UChar)(prev-Hangul::JAMO_L_BASE);
				1132	if(prev<Hangul::JAMO_L_COUNT) {
				1133	if(!doCompose) {
				1134	return FALSE;
				1135	}
				1136	UChar syllable=(UChar)
				1137	(Hangul::HANGUL_BASE+
				1138	(prevHangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))
				1139	Hangul::JAMO_T_COUNT);
				1140	UChar t;
				1141	if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
				1142	++src;
				1143	syllable+=t; // The next character was a Jamo T.
				1144	prevBoundary=src;
				1145	buffer.setLastChar(syllable);
				1146	continue;
				1147	}
				1148	// If we see L+V+x where x!=T then we drop to the slow path,
				1149	// decompose and recompose.
				1150	// This is to deal with NFKC finding normal L and V but a
				1151	// compatibility variant of a T. We need to either fully compose that
				1152	// combination here (which would complicate the code and may not work
				1153	// with strange custom data) or use the slow path -- or else our replacing
				1154	// two input characters (L+V) with one output character (LV syllable)
				1155	// would violate the invariant that [prevBoundary..prevSrc[ has the same
				1156	// length as what we appended to the buffer since prevBoundary.
				1157	needToDecompose=TRUE;
				1158	}
				1159	} else if(Hangul::isHangulWithoutJamoT(prev)) {
				1160	// c is a Jamo Trailing consonant,
				1161	// compose with previous Hangul LV that does not contain a Jamo T.
				1162	if(!doCompose) {
				1163	return FALSE;
				1164	}
				1165	buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
				1166	prevBoundary=src;
				1167	continue;
				1168	}
				1169	if(!needToDecompose) {
				1170	// The Jamo V/T did not compose into a Hangul syllable.
				1171	if(doCompose) {
				1172	if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
				1173	break;
				1174	}
				1175	} else {
				1176	prevCC=0;
				1177	}
				1178	continue;
				1179	}
				1180	}
				1181	/*
				1182	* Source buffer pointers:
				1183	*
				1184	* all done quick check current char not yet
				1185	* "yes" but (c) processed
				1186	* may combine
				1187	* forward
				1188	* [-------------[-------------[-------------[-------------[
				1189	* \| \| \| \| \|
				1190	* orig. src prevBoundary prevSrc src limit
				1191	*
				1192	*
				1193	* Destination buffer pointers inside the ReorderingBuffer:
				1194	*
				1195	* all done might take not filled yet
				1196	* characters for
				1197	* reordering
				1198	* [-------------[-------------[-------------[
				1199	* \| \| \| \|
				1200	* start reorderStart limit \|
				1201	* +remainingCap.+
				1202	*/
				1203	if(norm16>=MIN_YES_YES_WITH_CC) {
				1204	uint8_t cc=(uint8_t)norm16; // cc!=0
				1205	if( onlyContiguous && // FCC
				1206	(doCompose ? buffer.getLastCC() : prevCC)==0 &&
				1207	prevBoundary<prevSrc &&
				1208	// buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
				1209	// [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
				1210	// passed the quick check "yes && ccc==0" test.
				1211	// Check whether the last character was a "yesYes" or a "yesNo".
				1212	// If a "yesNo", then we get its trailing ccc from its
				1213	// mapping and check for canonical order.
				1214	// All other cases are ok.
				1215	getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
				1216	) {
				1217	// Fails FCD test, need to decompose and contiguously recompose.
				1218	if(!doCompose) {
				1219	return FALSE;
				1220	}
				1221	} else if(doCompose) {
				1222	if(!buffer.append(c, cc, errorCode)) {
				1223	break;
				1224	}
				1225	continue;
				1226	} else if(prevCC<=cc) {
				1227	prevCC=cc;
				1228	continue;
				1229	} else {
				1230	return FALSE;
				1231	}
				1232	} else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
				1233	return FALSE;
				1234	}
				1235
				1236	/*
				1237	* Find appropriate boundaries around this character,
				1238	* decompose the source text from between the boundaries,
				1239	* and recompose it.
				1240	*
				1241	* We may need to remove the last few characters from the ReorderingBuffer
				1242	* to account for source text that was copied or appended
				1243	* but needs to take part in the recomposition.
				1244	*/
				1245
				1246	/*
				1247	* Find the last composition boundary in [prevBoundary..src[.
				1248	* It is either the decomposition of the current character (at prevSrc),
				1249	* or prevBoundary.
				1250	*/
				1251	if(hasCompBoundaryBefore(c, norm16)) {
				1252	prevBoundary=prevSrc;
				1253	} else if(doCompose) {
				1254	buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
				1255	}
				1256
				1257	// Find the next composition boundary in [src..limit[ -
				1258	// modifies src to point to the next starter.
				1259	src=(UChar *)findNextCompBoundary(src, limit);
				1260
				1261	// Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
				1262	int32_t recomposeStartIndex=buffer.length();
				1263	if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
				1264	break;
				1265	}
				1266	recompose(buffer, recomposeStartIndex, onlyContiguous);
				1267	if(!doCompose) {
				1268	if(!buffer.equals(prevBoundary, src)) {
				1269	return FALSE;
				1270	}
				1271	buffer.remove();
				1272	prevCC=0;
				1273	}
				1274
				1275	// Move to the next starter. We never need to look back before this point again.
				1276	prevBoundary=src;
				1277	}
				1278	return TRUE;
				1279	}
				1280
				1281	// Very similar to compose(): Make the same changes in both places if relevant.
				1282	// pQCResult==NULL: spanQuickCheckYes
				1283	// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
				1284	const UChar *
				1285	Normalizer2Impl::composeQuickCheck(const UChar src, const UChar limit,
				1286	UBool onlyContiguous,
				1287	UNormalizationCheckResult *pQCResult) const {
				1288	/*
				1289	* prevBoundary points to the last character before the current one
				1290	* that has a composition boundary before it with ccc==0 and quick check "yes".
				1291	*/
				1292	const UChar *prevBoundary=src;
				1293	UChar32 minNoMaybeCP=minCompNoMaybeCP;
				1294	if(limit==NULL) {
				1295	UErrorCode errorCode=U_ZERO_ERROR;
				1296	src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
				1297	if(prevBoundary<src) {
				1298	// Set prevBoundary to the last character in the prefix.
				1299	prevBoundary=src-1;
				1300	}
				1301	limit=u_strchr(src, 0);
				1302	}
				1303
				1304	const UChar *prevSrc;
				1305	UChar32 c=0;
				1306	uint16_t norm16=0;
				1307	uint8_t prevCC=0;
				1308
				1309	for(;;) {
				1310	// count code units below the minimum or with irrelevant data for the quick check
				1311	for(prevSrc=src;;) {
				1312	if(src==limit) {
				1313	return src;
				1314	}
				1315	if( (c=*src)<minNoMaybeCP \|\|
				1316	isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
				1317	) {
				1318	++src;
				1319	} else if(!U16_IS_SURROGATE(c)) {
				1320	break;
				1321	} else {
				1322	UChar c2;
				1323	if(U16_IS_SURROGATE_LEAD(c)) {
				1324	if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
				1325	c=U16_GET_SUPPLEMENTARY(c, c2);
				1326	}
				1327	} else /* trail surrogate */ {
				1328	if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
				1329	--src;
				1330	c=U16_GET_SUPPLEMENTARY(c2, c);
				1331	}
				1332	}
				1333	if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
				1334	src+=U16_LENGTH(c);
				1335	} else {
				1336	break;
				1337	}
				1338	}
				1339	}
				1340	if(src!=prevSrc) {
				1341	// Set prevBoundary to the last character in the quick check loop.
				1342	prevBoundary=src-1;
				1343	if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
				1344	U16_IS_LEAD(*(prevBoundary-1))
				1345	) {
				1346	--prevBoundary;
				1347	}
				1348	prevCC=0;
				1349	// The start of the current character (c).
				1350	prevSrc=src;
				1351	}
				1352
				1353	src+=U16_LENGTH(c);
				1354	/*
				1355	* isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
				1356	* c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
				1357	* or has ccc!=0.
				1358	*/
				1359	if(isMaybeOrNonZeroCC(norm16)) {
				1360	uint8_t cc=getCCFromYesOrMaybe(norm16);
				1361	if( onlyContiguous && // FCC
				1362	cc!=0 &&
				1363	prevCC==0 &&
				1364	prevBoundary<prevSrc &&
				1365	// prevCC==0 && prevBoundary<prevSrc tell us that
				1366	// [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
				1367	// passed the quick check "yes && ccc==0" test.
				1368	// Check whether the last character was a "yesYes" or a "yesNo".
				1369	// If a "yesNo", then we get its trailing ccc from its
				1370	// mapping and check for canonical order.
				1371	// All other cases are ok.
				1372	getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
				1373	) {
				1374	// Fails FCD test.
				1375	} else if(prevCC<=cc \|\| cc==0) {
				1376	prevCC=cc;
				1377	if(norm16<MIN_YES_YES_WITH_CC) {
				1378	if(pQCResult!=NULL) {
				1379	*pQCResult=UNORM_MAYBE;
				1380	} else {
				1381	return prevBoundary;
				1382	}
				1383	}
				1384	continue;
				1385	}
				1386	}
				1387	if(pQCResult!=NULL) {
				1388	*pQCResult=UNORM_NO;
				1389	}
				1390	return prevBoundary;
				1391	}
				1392	}
				1393
				1394	void Normalizer2Impl::composeAndAppend(const UChar src, const UChar limit,
				1395	UBool doCompose,
				1396	UBool onlyContiguous,
				1397	UnicodeString &safeMiddle,
				1398	ReorderingBuffer &buffer,
				1399	UErrorCode &errorCode) const {
				1400	if(!buffer.isEmpty()) {
				1401	const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
				1402	if(src!=firstStarterInSrc) {
				1403	const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
				1404	buffer.getLimit());
				1405	int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
				1406	UnicodeString middle(lastStarterInDest, destSuffixLength);
				1407	buffer.removeSuffix(destSuffixLength);
				1408	safeMiddle=middle;
				1409	middle.append(src, (int32_t)(firstStarterInSrc-src));
				1410	const UChar *middleStart=middle.getBuffer();
				1411	compose(middleStart, middleStart+middle.length(), onlyContiguous,
				1412	TRUE, buffer, errorCode);
				1413	if(U_FAILURE(errorCode)) {
				1414	return;
				1415	}
				1416	src=firstStarterInSrc;
				1417	}
				1418	}
				1419	if(doCompose) {
				1420	compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
				1421	} else {
				1422	if(limit==NULL) { // appendZeroCC() needs limit!=NULL
				1423	limit=u_strchr(src, 0);
				1424	}
				1425	buffer.appendZeroCC(src, limit, errorCode);
				1426	}
				1427	}
				1428
				1429	/**
				1430	* Does c have a composition boundary before it?
				1431	* True if its decomposition begins with a character that has
				1432	* ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
				1433	* As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
				1434	* (isCompYesAndZeroCC()) so we need not decompose.
				1435	*/
				1436	UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
				1437	for(;;) {
				1438	if(isCompYesAndZeroCC(norm16)) {
				1439	return TRUE;
				1440	} else if(isMaybeOrNonZeroCC(norm16)) {
				1441	return FALSE;
				1442	} else if(isDecompNoAlgorithmic(norm16)) {
				1443	c=mapAlgorithmic(c, norm16);
				1444	norm16=getNorm16(c);
				1445	} else {
				1446	// c decomposes, get everything from the variable-length extra data
				1447	const uint16_t *mapping=getMapping(norm16);
				1448	uint16_t firstUnit=*mapping;
				1449	if((firstUnit&MAPPING_LENGTH_MASK)==0) {
				1450	return FALSE;
				1451	}
				1452	if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
				1453	return FALSE; // non-zero leadCC
				1454	}
				1455	int32_t i=1; // skip over the firstUnit
				1456	UChar32 c;
				1457	U16_NEXT_UNSAFE(mapping, i, c);
				1458	return isCompYesAndZeroCC(getNorm16(c));
				1459	}
				1460	}
				1461	}
				1462
				1463	UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
				1464	for(;;) {
				1465	uint16_t norm16=getNorm16(c);
				1466	if(isInert(norm16)) {
				1467	return TRUE;
				1468	} else if(norm16<=minYesNo) {
				1469	// Hangul: norm16==minYesNo
				1470	// Hangul LVT has a boundary after it.
				1471	// Hangul LV and non-inert yesYes characters combine forward.
				1472	return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
				1473	} else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
				1474	return FALSE;
				1475	} else if(isDecompNoAlgorithmic(norm16)) {
				1476	c=mapAlgorithmic(c, norm16);
				1477	} else {
				1478	// c decomposes, get everything from the variable-length extra data.
				1479	// If testInert, then c must be a yesNo character which has lccc=0,
				1480	// otherwise it could be a noNo.
				1481	const uint16_t *mapping=getMapping(norm16);
				1482	uint16_t firstUnit=*mapping;
				1483	// TRUE if
				1484	// not MAPPING_NO_COMP_BOUNDARY_AFTER
				1485	// (which is set if
				1486	// c is not deleted, and
				1487	// it and its decomposition do not combine forward, and it has a starter)
				1488	// and if FCC then trailCC<=1
				1489	return
				1490	(firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
				1491	(!onlyContiguous \|\| firstUnit<=0x1ff);
				1492	}
				1493	}
				1494	}
				1495
				1496	const UChar Normalizer2Impl::findPreviousCompBoundary(const UChar start, const UChar *p) const {
				1497	BackwardUTrie2StringIterator iter(normTrie, start, p);
				1498	uint16_t norm16;
				1499	do {
				1500	norm16=iter.previous16();
				1501	} while(!hasCompBoundaryBefore(iter.codePoint, norm16));
				1502	// We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
				1503	// but that's probably not worth the extra cost.
				1504	return iter.codePointStart;
				1505	}
				1506
				1507	const UChar Normalizer2Impl::findNextCompBoundary(const UChar p, const UChar *limit) const {
				1508	ForwardUTrie2StringIterator iter(normTrie, p, limit);
				1509	uint16_t norm16;
				1510	do {
				1511	norm16=iter.next16();
				1512	} while(!hasCompBoundaryBefore(iter.codePoint, norm16));
				1513	return iter.codePointStart;
				1514	}
				1515
				1516	// Note: normalizer2impl.cpp r30982 (2011-nov-27)
				1517	// still had getFCDTrie() which built and cached an FCD trie.
				1518	// That provided faster access to FCD data than getFCD16FromNormData()
				1519	// but required synchronization and consumed some 10kB of heap memory
				1520	// in any process that uses FCD (e.g., via collation).
				1521	// tccc180[] and smallFCD[] are intended to help with any loss of performance,
				1522	// at least for Latin & CJK.
				1523
				1524	// Gets the FCD value from the regular normalization data.
				1525	uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
				1526	// Only loops for 1:1 algorithmic mappings.
				1527	for(;;) {
				1528	uint16_t norm16=getNorm16(c);
				1529	if(norm16<=minYesNo) {
				1530	// no decomposition or Hangul syllable, all zeros
				1531	return 0;
				1532	} else if(norm16>=MIN_NORMAL_MAYBE_YES) {
				1533	// combining mark
				1534	norm16&=0xff;
				1535	return norm16\|(norm16<<8);
				1536	} else if(norm16>=minMaybeYes) {
				1537	return 0;
				1538	} else if(isDecompNoAlgorithmic(norm16)) {
				1539	c=mapAlgorithmic(c, norm16);
				1540	} else {
				1541	// c decomposes, get everything from the variable-length extra data
				1542	const uint16_t *mapping=getMapping(norm16);
				1543	uint16_t firstUnit=*mapping;
				1544	if((firstUnit&MAPPING_LENGTH_MASK)==0) {
				1545	// A character that is deleted (maps to an empty string) must
				1546	// get the worst-case lccc and tccc values because arbitrary
				1547	// characters on both sides will become adjacent.
				1548	return 0x1ff;
				1549	} else {
				1550	norm16=firstUnit>>8; // tccc
				1551	if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
				1552	norm16\|=*(mapping-1)&0xff00; // lccc
				1553	}
				1554	return norm16;
				1555	}
				1556	}
				1557	}
				1558	}
				1559
				1560	// Dual functionality:
				1561	// buffer!=NULL: normalize
				1562	// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
				1563	const UChar *
				1564	Normalizer2Impl::makeFCD(const UChar src, const UChar limit,
				1565	ReorderingBuffer *buffer,
				1566	UErrorCode &errorCode) const {
				1567	// Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
				1568	// Similar to the prevBoundary in the compose() implementation.
				1569	const UChar *prevBoundary=src;
				1570	int32_t prevFCD16=0;
				1571	if(limit==NULL) {
				1572	src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
				1573	if(U_FAILURE(errorCode)) {
				1574	return src;
				1575	}
				1576	if(prevBoundary<src) {
				1577	prevBoundary=src;
				1578	// We know that the previous character's lccc==0.
				1579	// Fetching the fcd16 value was deferred for this below-U+0300 code point.
				1580	prevFCD16=getFCD16(*(src-1));
				1581	if(prevFCD16>1) {
				1582	--prevBoundary;
				1583	}
				1584	}
				1585	limit=u_strchr(src, 0);
				1586	}
				1587
				1588	// Note: In this function we use buffer->appendZeroCC() because we track
				1589	// the lead and trail combining classes here, rather than leaving it to
				1590	// the ReorderingBuffer.
				1591	// The exception is the call to decomposeShort() which uses the buffer
				1592	// in the normal way.
				1593
				1594	const UChar *prevSrc;
				1595	UChar32 c=0;
				1596	uint16_t fcd16=0;
				1597
				1598	for(;;) {
				1599	// count code units with lccc==0
				1600	for(prevSrc=src; src!=limit;) {
				1601	if((c=*src)<MIN_CCC_LCCC_CP) {
				1602	prevFCD16=~c;
				1603	++src;
				1604	} else if(!singleLeadMightHaveNonZeroFCD16(c)) {
				1605	prevFCD16=0;
				1606	++src;
				1607	} else {
				1608	if(U16_IS_SURROGATE(c)) {
				1609	UChar c2;
				1610	if(U16_IS_SURROGATE_LEAD(c)) {
				1611	if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
				1612	c=U16_GET_SUPPLEMENTARY(c, c2);
				1613	}
				1614	} else /* trail surrogate */ {
				1615	if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
				1616	--src;
				1617	c=U16_GET_SUPPLEMENTARY(c2, c);
				1618	}
				1619	}
				1620	}
				1621	if((fcd16=getFCD16FromNormData(c))<=0xff) {
				1622	prevFCD16=fcd16;
				1623	src+=U16_LENGTH(c);
				1624	} else {
				1625	break;
				1626	}
				1627	}
				1628	}
				1629	// copy these code units all at once
				1630	if(src!=prevSrc) {
				1631	if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
				1632	break;
				1633	}
				1634	if(src==limit) {
				1635	break;
				1636	}
				1637	prevBoundary=src;
				1638	// We know that the previous character's lccc==0.
				1639	if(prevFCD16<0) {
				1640	// Fetching the fcd16 value was deferred for this below-U+0300 code point.
				1641	UChar32 prev=~prevFCD16;
				1642	prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
				1643	if(prevFCD16>1) {
				1644	--prevBoundary;
				1645	}
				1646	} else {
				1647	const UChar *p=src-1;
				1648	if(U16_IS_TRAIL(p) && prevSrc<p && U16_IS_LEAD((p-1))) {
				1649	--p;
				1650	// Need to fetch the previous character's FCD value because
				1651	// prevFCD16 was just for the trail surrogate code point.
				1652	prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
				1653	// Still known to have lccc==0 because its lead surrogate unit had lccc==0.
				1654	}
				1655	if(prevFCD16>1) {
				1656	prevBoundary=p;
				1657	}
				1658	}
				1659	// The start of the current character (c).
				1660	prevSrc=src;
				1661	} else if(src==limit) {
				1662	break;
				1663	}
				1664
				1665	src+=U16_LENGTH(c);
				1666	// The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
				1667	// Check for proper order, and decompose locally if necessary.
				1668	if((prevFCD16&0xff)<=(fcd16>>8)) {
				1669	// proper order: prev tccc <= current lccc
				1670	if((fcd16&0xff)<=1) {
				1671	prevBoundary=src;
				1672	}
				1673	if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
				1674	break;
				1675	}
				1676	prevFCD16=fcd16;
				1677	continue;
				1678	} else if(buffer==NULL) {
				1679	return prevBoundary; // quick check "no"
				1680	} else {
				1681	/*
				1682	* Back out the part of the source that we copied or appended
				1683	* already but is now going to be decomposed.
				1684	* prevSrc is set to after what was copied/appended.
				1685	*/
				1686	buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
				1687	/*
				1688	* Find the part of the source that needs to be decomposed,
				1689	* up to the next safe boundary.
				1690	*/
				1691	src=findNextFCDBoundary(src, limit);
				1692	/*
				1693	* The source text does not fulfill the conditions for FCD.
				1694	* Decompose and reorder a limited piece of the text.
				1695	*/
				1696	if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
				1697	break;
				1698	}
				1699	prevBoundary=src;
				1700	prevFCD16=0;
				1701	}
				1702	}
				1703	return src;
				1704	}
				1705
				1706	void Normalizer2Impl::makeFCDAndAppend(const UChar src, const UChar limit,
				1707	UBool doMakeFCD,
				1708	UnicodeString &safeMiddle,
				1709	ReorderingBuffer &buffer,
				1710	UErrorCode &errorCode) const {
				1711	if(!buffer.isEmpty()) {
				1712	const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
				1713	if(src!=firstBoundaryInSrc) {
				1714	const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
				1715	buffer.getLimit());
				1716	int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
				1717	UnicodeString middle(lastBoundaryInDest, destSuffixLength);
				1718	buffer.removeSuffix(destSuffixLength);
				1719	safeMiddle=middle;
				1720	middle.append(src, (int32_t)(firstBoundaryInSrc-src));
				1721	const UChar *middleStart=middle.getBuffer();
				1722	makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
				1723	if(U_FAILURE(errorCode)) {
				1724	return;
				1725	}
				1726	src=firstBoundaryInSrc;
				1727	}
				1728	}
				1729	if(doMakeFCD) {
				1730	makeFCD(src, limit, &buffer, errorCode);
				1731	} else {
				1732	if(limit==NULL) { // appendZeroCC() needs limit!=NULL
				1733	limit=u_strchr(src, 0);
				1734	}
				1735	buffer.appendZeroCC(src, limit, errorCode);
				1736	}
				1737	}
				1738
				1739	const UChar Normalizer2Impl::findPreviousFCDBoundary(const UChar start, const UChar *p) const {
				1740	while(start<p && previousFCD16(start, p)>0xff) {}
				1741	return p;
				1742	}
				1743
				1744	const UChar Normalizer2Impl::findNextFCDBoundary(const UChar p, const UChar *limit) const {
				1745	while(p<limit) {
				1746	const UChar *codePointStart=p;
				1747	if(nextFCD16(p, limit)<=0xff) {
				1748	return codePointStart;
				1749	}
				1750	}
				1751	return p;
				1752	}
				1753
				1754	// CanonicalIterator data -------------------------------------------------- ***
				1755
				1756	CanonIterData::CanonIterData(UErrorCode &errorCode) :
				1757	trie(utrie2_open(0, 0, &errorCode)),
				1758	canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
				1759
				1760	CanonIterData::~CanonIterData() {
				1761	utrie2_close(trie);
				1762	}
				1763
				1764	void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
				1765	uint32_t canonValue=utrie2_get32(trie, decompLead);
				1766	if((canonValue&(CANON_HAS_SET\|CANON_VALUE_MASK))==0 && origin!=0) {
				1767	// origin is the first character whose decomposition starts with
				1768	// the character for which we are setting the value.
				1769	utrie2_set32(trie, decompLead, canonValue\|origin, &errorCode);
				1770	} else {
				1771	// origin is not the first character, or it is U+0000.
				1772	UnicodeSet *set;
				1773	if((canonValue&CANON_HAS_SET)==0) {
				1774	set=new UnicodeSet;
				1775	if(set==NULL) {
				1776	errorCode=U_MEMORY_ALLOCATION_ERROR;
				1777	return;
				1778	}
				1779	UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
				1780	canonValue=(canonValue&~CANON_VALUE_MASK)\|CANON_HAS_SET\|(uint32_t)canonStartSets.size();
				1781	utrie2_set32(trie, decompLead, canonValue, &errorCode);
				1782	canonStartSets.addElement(set, errorCode);
				1783	if(firstOrigin!=0) {
				1784	set->add(firstOrigin);
				1785	}
				1786	} else {
				1787	set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
				1788	}
				1789	set->add(origin);
				1790	}
				1791	}
				1792
				1793	U_CDECL_BEGIN
				1794
				1795	// Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
				1796	// context: the Normalizer2Impl
				1797	static UBool U_CALLCONV
				1798	enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
				1799	UErrorCode errorCode = U_ZERO_ERROR;
				1800	if (value != 0) {
				1801	Normalizer2Impl impl = (Normalizer2Impl )context;
				1802	impl->makeCanonIterDataFromNorm16(
				1803	start, end, (uint16_t)value, *impl->fCanonIterData, errorCode);
				1804	}
				1805	return U_SUCCESS(errorCode);
				1806	}
				1807
				1808
				1809
				1810	// UInitOnce instantiation function for CanonIterData
				1811
				1812	static void U_CALLCONV
				1813	initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
				1814	U_ASSERT(impl->fCanonIterData == NULL);
				1815	impl->fCanonIterData = new CanonIterData(errorCode);
				1816	if (impl->fCanonIterData == NULL) {
				1817	errorCode=U_MEMORY_ALLOCATION_ERROR;
				1818	}
				1819	if (U_SUCCESS(errorCode)) {
				1820	utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl);
				1821	utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
				1822	}
				1823	if (U_FAILURE(errorCode)) {
				1824	delete impl->fCanonIterData;
				1825	impl->fCanonIterData = NULL;
				1826	}
				1827	}
				1828
				1829	U_CDECL_END
				1830
				1831	void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
				1832	CanonIterData &newData,
				1833	UErrorCode &errorCode) const {
				1834	if(norm16==0 \|\| (minYesNo<=norm16 && norm16<minNoNo)) {
				1835	// Inert, or 2-way mapping (including Hangul syllable).
				1836	// We do not write a canonStartSet for any yesNo character.
				1837	// Composites from 2-way mappings are added at runtime from the
				1838	// starter's compositions list, and the other characters in
				1839	// 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
				1840	// "maybe" characters.
				1841	return;
				1842	}
				1843	for(UChar32 c=start; c<=end; ++c) {
				1844	uint32_t oldValue=utrie2_get32(newData.trie, c);
				1845	uint32_t newValue=oldValue;
				1846	if(norm16>=minMaybeYes) {
				1847	// not a segment starter if it occurs in a decomposition or has cc!=0
				1848	newValue\|=CANON_NOT_SEGMENT_STARTER;
				1849	if(norm16<MIN_NORMAL_MAYBE_YES) {
				1850	newValue\|=CANON_HAS_COMPOSITIONS;
				1851	}
				1852	} else if(norm16<minYesNo) {
				1853	newValue\|=CANON_HAS_COMPOSITIONS;
				1854	} else {
				1855	// c has a one-way decomposition
				1856	UChar32 c2=c;
				1857	uint16_t norm16_2=norm16;
				1858	while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
				1859	c2=mapAlgorithmic(c2, norm16_2);
				1860	norm16_2=getNorm16(c2);
				1861	}
				1862	if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
				1863	// c decomposes, get everything from the variable-length extra data
				1864	const uint16_t *mapping=getMapping(norm16_2);
				1865	uint16_t firstUnit=*mapping;
				1866	int32_t length=firstUnit&MAPPING_LENGTH_MASK;
				1867	if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
				1868	if(c==c2 && (*(mapping-1)&0xff)!=0) {
				1869	newValue\|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
				1870	}
				1871	}
				1872	// Skip empty mappings (no characters in the decomposition).
				1873	if(length!=0) {
				1874	++mapping; // skip over the firstUnit
				1875	// add c to first code point's start set
				1876	int32_t i=0;
				1877	U16_NEXT_UNSAFE(mapping, i, c2);
				1878	newData.addToStartSet(c, c2, errorCode);
				1879	// Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
				1880	// one-way mapping. A 2-way mapping is possible here after
				1881	// intermediate algorithmic mapping.
				1882	if(norm16_2>=minNoNo) {
				1883	while(i<length) {
				1884	U16_NEXT_UNSAFE(mapping, i, c2);
				1885	uint32_t c2Value=utrie2_get32(newData.trie, c2);
				1886	if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
				1887	utrie2_set32(newData.trie, c2, c2Value\|CANON_NOT_SEGMENT_STARTER,
				1888	&errorCode);
				1889	}
				1890	}
				1891	}
				1892	}
				1893	} else {
				1894	// c decomposed to c2 algorithmically; c has cc==0
				1895	newData.addToStartSet(c, c2, errorCode);
				1896	}
				1897	}
				1898	if(newValue!=oldValue) {
				1899	utrie2_set32(newData.trie, c, newValue, &errorCode);
				1900	}
				1901	}
				1902	}
				1903
				1904	UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
				1905	// Logically const: Synchronized instantiation.
				1906	Normalizer2Impl me=const_cast<Normalizer2Impl >(this);
				1907	umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
				1908	return U_SUCCESS(errorCode);
				1909	}
				1910
				1911	int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
				1912	return (int32_t)utrie2_get32(fCanonIterData->trie, c);
				1913	}
				1914
				1915	const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
				1916	return (const UnicodeSet )fCanonIterData->canonStartSets[n];
				1917	}
				1918
				1919	UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
				1920	return getCanonValue(c)>=0;
				1921	}
				1922
				1923	UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
				1924	int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
				1925	if(canonValue==0) {
				1926	return FALSE;
				1927	}
				1928	set.clear();
				1929	int32_t value=canonValue&CANON_VALUE_MASK;
				1930	if((canonValue&CANON_HAS_SET)!=0) {
				1931	set.addAll(getCanonStartSet(value));
				1932	} else if(value!=0) {
				1933	set.add(value);
				1934	}
				1935	if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
				1936	uint16_t norm16=getNorm16(c);
				1937	if(norm16==JAMO_L) {
				1938	UChar32 syllable=
				1939	(UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
				1940	set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
				1941	} else {
				1942	addComposites(getCompositionsList(norm16), set);
				1943	}
				1944	}
				1945	return TRUE;
				1946	}
				1947
				1948	U_NAMESPACE_END
				1949
				1950	// Normalizer2 data swapping ----------------------------------------------- ***
				1951
				1952	U_NAMESPACE_USE
				1953
				1954	U_CAPI int32_t U_EXPORT2
				1955	unorm2_swap(const UDataSwapper *ds,
				1956	const void inData, int32_t length, void outData,
				1957	UErrorCode *pErrorCode) {
				1958	const UDataInfo *pInfo;
				1959	int32_t headerSize;
				1960
				1961	const uint8_t *inBytes;
				1962	uint8_t *outBytes;
				1963
				1964	const int32_t *inIndexes;
				1965	int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
				1966
				1967	int32_t i, offset, nextOffset, size;
				1968
				1969	/* udata_swapDataHeader checks the arguments */
				1970	headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
				1971	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
				1972	return 0;
				1973	}
				1974
				1975	/* check data format and format version */
				1976	pInfo=(const UDataInfo )((const char )inData+4);
				1977	if(!(
				1978	pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
				1979	pInfo->dataFormat[1]==0x72 &&
				1980	pInfo->dataFormat[2]==0x6d &&
				1981	pInfo->dataFormat[3]==0x32 &&
				1982	(pInfo->formatVersion[0]==1 \|\| pInfo->formatVersion[0]==2)
				1983	)) {
				1984	udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
				1985	pInfo->dataFormat[0], pInfo->dataFormat[1],
				1986	pInfo->dataFormat[2], pInfo->dataFormat[3],
				1987	pInfo->formatVersion[0]);
				1988	*pErrorCode=U_UNSUPPORTED_ERROR;
				1989	return 0;
				1990	}
				1991
				1992	inBytes=(const uint8_t *)inData+headerSize;
				1993	outBytes=(uint8_t *)outData+headerSize;
				1994
				1995	inIndexes=(const int32_t *)inBytes;
				1996
				1997	if(length>=0) {
				1998	length-=headerSize;
				1999	if(length<(int32_t)sizeof(indexes)) {
				2000	udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
				2001	length);
				2002	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
				2003	return 0;
				2004	}
				2005	}
				2006
				2007	/* read the first few indexes */
				2008	for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
				2009	indexes[i]=udata_readInt32(ds, inIndexes[i]);
				2010	}
				2011
				2012	/* get the total length of the data */
				2013	size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
				2014
				2015	if(length>=0) {
				2016	if(length<size) {
				2017	udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
				2018	length);
				2019	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
				2020	return 0;
				2021	}
				2022
				2023	/* copy the data for inaccessible bytes */
				2024	if(inBytes!=outBytes) {
				2025	uprv_memcpy(outBytes, inBytes, size);
				2026	}
				2027
				2028	offset=0;
				2029
				2030	/* swap the int32_t indexes[] */
				2031	nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
				2032	ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
				2033	offset=nextOffset;
				2034
				2035	/* swap the UTrie2 */
				2036	nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
				2037	utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
				2038	offset=nextOffset;
				2039
				2040	/* swap the uint16_t extraData[] */
				2041	nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
				2042	ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
				2043	offset=nextOffset;
				2044
				2045	/* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
				2046	nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
				2047	offset=nextOffset;
				2048
				2049	U_ASSERT(offset==size);
				2050	}
				2051
				2052	return headerSize+size;
				2053	}
				2054
				2055	#endif // !UCONFIG_NO_NORMALIZATION