blob: 9c00c1c818d12aff921fe62a82a14b6517ff527b [file] [log] [blame]
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001/*
2*******************************************************************************
3*
4* Copyright (C) 2009-2013, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: normalizer2impl.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2009nov22
14* created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_NORMALIZATION
20
21#include "unicode/normalizer2.h"
22#include "unicode/udata.h"
23#include "unicode/ustring.h"
24#include "unicode/utf16.h"
25#include "cmemory.h"
26#include "mutex.h"
27#include "normalizer2impl.h"
28#include "putilimp.h"
29#include "uassert.h"
30#include "uset_imp.h"
31#include "utrie2.h"
32#include "uvector.h"
33
34U_NAMESPACE_BEGIN
35
36// ReorderingBuffer -------------------------------------------------------- ***
37
38UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
39 int32_t length=str.length();
40 start=str.getBuffer(destCapacity);
41 if(start==NULL) {
42 // getBuffer() already did str.setToBogus()
43 errorCode=U_MEMORY_ALLOCATION_ERROR;
44 return FALSE;
45 }
46 limit=start+length;
47 remainingCapacity=str.getCapacity()-length;
48 reorderStart=start;
49 if(start==limit) {
50 lastCC=0;
51 } else {
52 setIterator();
53 lastCC=previousCC();
54 // Set reorderStart after the last code point with cc<=1 if there is one.
55 if(lastCC>1) {
56 while(previousCC()>1) {}
57 }
58 reorderStart=codePointLimit;
59 }
60 return TRUE;
61}
62
63UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
64 int32_t length=(int32_t)(limit-start);
65 return
66 length==(int32_t)(otherLimit-otherStart) &&
67 0==u_memcmp(start, otherStart, length);
68}
69
70UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
71 if(remainingCapacity<2 && !resize(2, errorCode)) {
72 return FALSE;
73 }
74 if(lastCC<=cc || cc==0) {
75 limit[0]=U16_LEAD(c);
76 limit[1]=U16_TRAIL(c);
77 limit+=2;
78 lastCC=cc;
79 if(cc<=1) {
80 reorderStart=limit;
81 }
82 } else {
83 insert(c, cc);
84 }
85 remainingCapacity-=2;
86 return TRUE;
87}
88
89UBool ReorderingBuffer::append(const UChar *s, int32_t length,
90 uint8_t leadCC, uint8_t trailCC,
91 UErrorCode &errorCode) {
92 if(length==0) {
93 return TRUE;
94 }
95 if(remainingCapacity<length && !resize(length, errorCode)) {
96 return FALSE;
97 }
98 remainingCapacity-=length;
99 if(lastCC<=leadCC || leadCC==0) {
100 if(trailCC<=1) {
101 reorderStart=limit+length;
102 } else if(leadCC<=1) {
103 reorderStart=limit+1; // Ok if not a code point boundary.
104 }
105 const UChar *sLimit=s+length;
106 do { *limit++=*s++; } while(s!=sLimit);
107 lastCC=trailCC;
108 } else {
109 int32_t i=0;
110 UChar32 c;
111 U16_NEXT(s, i, length, c);
112 insert(c, leadCC); // insert first code point
113 while(i<length) {
114 U16_NEXT(s, i, length, c);
115 if(i<length) {
116 // s must be in NFD, otherwise we need to use getCC().
117 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
118 } else {
119 leadCC=trailCC;
120 }
121 append(c, leadCC, errorCode);
122 }
123 }
124 return TRUE;
125}
126
127UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
128 int32_t cpLength=U16_LENGTH(c);
129 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
130 return FALSE;
131 }
132 remainingCapacity-=cpLength;
133 if(cpLength==1) {
134 *limit++=(UChar)c;
135 } else {
136 limit[0]=U16_LEAD(c);
137 limit[1]=U16_TRAIL(c);
138 limit+=2;
139 }
140 lastCC=0;
141 reorderStart=limit;
142 return TRUE;
143}
144
145UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
146 if(s==sLimit) {
147 return TRUE;
148 }
149 int32_t length=(int32_t)(sLimit-s);
150 if(remainingCapacity<length && !resize(length, errorCode)) {
151 return FALSE;
152 }
153 u_memcpy(limit, s, length);
154 limit+=length;
155 remainingCapacity-=length;
156 lastCC=0;
157 reorderStart=limit;
158 return TRUE;
159}
160
161void ReorderingBuffer::remove() {
162 reorderStart=limit=start;
163 remainingCapacity=str.getCapacity();
164 lastCC=0;
165}
166
167void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
168 if(suffixLength<(limit-start)) {
169 limit-=suffixLength;
170 remainingCapacity+=suffixLength;
171 } else {
172 limit=start;
173 remainingCapacity=str.getCapacity();
174 }
175 lastCC=0;
176 reorderStart=limit;
177}
178
179UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
180 int32_t reorderStartIndex=(int32_t)(reorderStart-start);
181 int32_t length=(int32_t)(limit-start);
182 str.releaseBuffer(length);
183 int32_t newCapacity=length+appendLength;
184 int32_t doubleCapacity=2*str.getCapacity();
185 if(newCapacity<doubleCapacity) {
186 newCapacity=doubleCapacity;
187 }
188 if(newCapacity<256) {
189 newCapacity=256;
190 }
191 start=str.getBuffer(newCapacity);
192 if(start==NULL) {
193 // getBuffer() already did str.setToBogus()
194 errorCode=U_MEMORY_ALLOCATION_ERROR;
195 return FALSE;
196 }
197 reorderStart=start+reorderStartIndex;
198 limit=start+length;
199 remainingCapacity=str.getCapacity()-length;
200 return TRUE;
201}
202
203void ReorderingBuffer::skipPrevious() {
204 codePointLimit=codePointStart;
205 UChar c=*--codePointStart;
206 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
207 --codePointStart;
208 }
209}
210
211uint8_t ReorderingBuffer::previousCC() {
212 codePointLimit=codePointStart;
213 if(reorderStart>=codePointStart) {
214 return 0;
215 }
216 UChar32 c=*--codePointStart;
217 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
218 return 0;
219 }
220
221 UChar c2;
222 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
223 --codePointStart;
224 c=U16_GET_SUPPLEMENTARY(c2, c);
225 }
226 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
227}
228
229// Inserts c somewhere before the last character.
230// Requires 0<cc<lastCC which implies reorderStart<limit.
231void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
232 for(setIterator(), skipPrevious(); previousCC()>cc;) {}
233 // insert c at codePointLimit, after the character with prevCC<=cc
234 UChar *q=limit;
235 UChar *r=limit+=U16_LENGTH(c);
236 do {
237 *--r=*--q;
238 } while(codePointLimit!=q);
239 writeCodePoint(q, c);
240 if(cc<=1) {
241 reorderStart=r;
242 }
243}
244
245// Normalizer2Impl --------------------------------------------------------- ***
246
// Data for the canonical iterator: a trie plus a vector of start sets.
// Normalizer2Impl owns an instance through fCanonIterData (deleted in its destructor)
// and enumerates `trie` in addCanonIterPropertyStarts().
// The member functions are only declared here; their definitions are elsewhere in the file.
247struct CanonIterData : public UMemory {
248 CanonIterData(UErrorCode &errorCode);
249 ~CanonIterData();
250 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
// Trie enumerated for the SEGMENT_STARTER property (values are masked with CANON_NOT_SEGMENT_STARTER).
251 UTrie2 *trie;
252 UVector canonStartSets; // contains UnicodeSet *
253};
254
// Releases what load() acquired (the data memory and the normalization trie)
// and deletes the canonical iterator data.
255Normalizer2Impl::~Normalizer2Impl() {
256 udata_close(memory);
257 utrie2_close(normTrie);
258 delete fCanonIterData;
259}
260
261UBool U_CALLCONV
262Normalizer2Impl::isAcceptable(void *context,
263 const char * /* type */, const char * /*name*/,
264 const UDataInfo *pInfo) {
265 if(
266 pInfo->size>=20 &&
267 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
268 pInfo->charsetFamily==U_CHARSET_FAMILY &&
269 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
270 pInfo->dataFormat[1]==0x72 &&
271 pInfo->dataFormat[2]==0x6d &&
272 pInfo->dataFormat[3]==0x32 &&
273 pInfo->formatVersion[0]==2
274 ) {
275 Normalizer2Impl *me=(Normalizer2Impl *)context;
276 uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
277 return TRUE;
278 } else {
279 return FALSE;
280 }
281}
282
283void
284Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
285 if(U_FAILURE(errorCode)) {
286 return;
287 }
288 memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
289 if(U_FAILURE(errorCode)) {
290 return;
291 }
292 const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
293 const int32_t *inIndexes=(const int32_t *)inBytes;
294 int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
295 if(indexesLength<=IX_MIN_MAYBE_YES) {
296 errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes.
297 return;
298 }
299
300 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
301 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
302
303 minYesNo=inIndexes[IX_MIN_YES_NO];
304 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
305 minNoNo=inIndexes[IX_MIN_NO_NO];
306 limitNoNo=inIndexes[IX_LIMIT_NO_NO];
307 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
308
309 int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
310 int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
311 normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
312 inBytes+offset, nextOffset-offset, NULL,
313 &errorCode);
314 if(U_FAILURE(errorCode)) {
315 return;
316 }
317
318 offset=nextOffset;
319 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
320 maybeYesCompositions=(const uint16_t *)(inBytes+offset);
321 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
322
323 // smallFCD: new in formatVersion 2
324 offset=nextOffset;
325 smallFCD=inBytes+offset;
326
327 // Build tccc180[].
328 // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
329 uint8_t bits=0;
330 for(UChar c=0; c<0x180; bits>>=1) {
331 if((c&0xff)==0) {
332 bits=smallFCD[c>>8]; // one byte per 0x100 code points
333 }
334 if(bits&1) {
335 for(int i=0; i<0x20; ++i, ++c) {
336 tccc180[c]=(uint8_t)getFCD16FromNormData(c);
337 }
338 } else {
339 uprv_memset(tccc180+c, 0, 0x20);
340 c+=0x20;
341 }
342 }
343}
344
345uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
346 UChar32 c;
347 if(cpStart==(cpLimit-1)) {
348 c=*cpStart;
349 } else {
350 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
351 }
352 uint16_t prevNorm16=getNorm16(c);
353 if(prevNorm16<=minYesNo) {
354 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
355 } else {
356 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo
357 }
358}
359
360U_CDECL_BEGIN
361
362static UBool U_CALLCONV
363enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
364 /* add the start code point to the USet */
365 const USetAdder *sa=(const USetAdder *)context;
366 sa->add(sa->set, start);
367 return TRUE;
368}
369
370static uint32_t U_CALLCONV
371segmentStarterMapper(const void * /*context*/, uint32_t value) {
372 return value&CANON_NOT_SEGMENT_STARTER;
373}
374
375U_CDECL_END
376
377void
378Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
379 /* add the start code point of each same-value range of each trie */
380 utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);
381
382 /* add Hangul LV syllables and LV+1 because of skippables */
383 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
384 sa->add(sa->set, c);
385 sa->add(sa->set, c+1);
386 }
387 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
388}
389
390void
391Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
392 /* add the start code point of each same-value range of the canonical iterator data trie */
393 if(ensureCanonIterData(errorCode)) {
394 // currently only used for the SEGMENT_STARTER property
395 utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
396 }
397}
398
399const UChar *
400Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
401 UChar32 minNeedDataCP,
402 ReorderingBuffer *buffer,
403 UErrorCode &errorCode) const {
404 // Make some effort to support NUL-terminated strings reasonably.
405 // Take the part of the fast quick check loop that does not look up
406 // data and check the first part of the string.
407 // After this prefix, determine the string length to simplify the rest
408 // of the code.
409 const UChar *prevSrc=src;
410 UChar c;
411 while((c=*src++)<minNeedDataCP && c!=0) {}
412 // Back out the last character for full processing.
413 // Copy this prefix.
414 if(--src!=prevSrc) {
415 if(buffer!=NULL) {
416 buffer->appendZeroCC(prevSrc, src, errorCode);
417 }
418 }
419 return src;
420}
421
// Decomposes [src, limit[ into buffer, or quick-checks it when buffer==NULL.
// limit==NULL means the text is NUL-terminated; the limit is then computed here.
// Return value:
//   normalize mode: src as far as it was processed (stops early if an append fails).
//   quick check mode: limit if everything passed, otherwise prevBoundary,
//   the position after the last code point with cc<=1 seen before the failure.
422// Dual functionality:
423// buffer!=NULL: normalize
424// buffer==NULL: isNormalized/spanQuickCheckYes
425const UChar *
426Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
427 ReorderingBuffer *buffer,
428 UErrorCode &errorCode) const {
429 UChar32 minNoCP=minDecompNoCP;
430 if(limit==NULL) {
// NUL-terminated input: handle the low prefix, then find the end of the string.
431 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
432 if(U_FAILURE(errorCode)) {
433 return src;
434 }
435 limit=u_strchr(src, 0);
436 }
437
438 const UChar *prevSrc;
439 UChar32 c=0;
440 uint16_t norm16=0;
441
442 // only for quick check
443 const UChar *prevBoundary=src;
444 uint8_t prevCC=0;
445
446 for(;;) {
447 // count code units below the minimum or with irrelevant data for the quick check
448 for(prevSrc=src; src!=limit;) {
449 if( (c=*src)<minNoCP ||
450 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
451 ) {
452 ++src;
453 } else if(!U16_IS_SURROGATE(c)) {
454 break;
455 } else {
// Surrogate code unit: try to pair it with its neighbor and look up the
// supplementary code point; an unpaired surrogate is looked up as is.
456 UChar c2;
457 if(U16_IS_SURROGATE_LEAD(c)) {
458 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
459 c=U16_GET_SUPPLEMENTARY(c, c2);
460 }
461 } else /* trail surrogate */ {
// The matching lead surrogate was already stepped over in this run; back up to it.
462 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
463 --src;
464 c=U16_GET_SUPPLEMENTARY(c2, c);
465 }
466 }
467 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
468 src+=U16_LENGTH(c);
469 } else {
470 break;
471 }
472 }
473 }
474 // copy these code units all at once
475 if(src!=prevSrc) {
476 if(buffer!=NULL) {
477 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
478 break;
479 }
480 } else {
481 prevCC=0;
482 prevBoundary=src;
483 }
484 }
485 if(src==limit) {
486 break;
487 }
488
489 // Check one above-minimum, relevant code point.
// c and norm16 were set by the fast loop above for the code point at src.
490 src+=U16_LENGTH(c);
491 if(buffer!=NULL) {
492 if(!decompose(c, norm16, *buffer, errorCode)) {
493 break;
494 }
495 } else {
496 if(isDecompYes(norm16)) {
497 uint8_t cc=getCCFromYesOrMaybe(norm16);
498 if(prevCC<=cc || cc==0) {
499 prevCC=cc;
500 if(cc<=1) {
501 prevBoundary=src;
502 }
503 continue;
504 }
505 }
506 return prevBoundary; // "no" or cc out of order
507 }
508 }
509 return src;
510}
511
512// Decompose a short piece of text which is likely to contain characters that
513// fail the quick check loop and/or where the quick check loop's overhead
514// is unlikely to be amortized.
515// Called by the compose() and makeFCD() implementations.
516UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
517 ReorderingBuffer &buffer,
518 UErrorCode &errorCode) const {
519 while(src<limit) {
520 UChar32 c;
521 uint16_t norm16;
522 UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
523 if(!decompose(c, norm16, buffer, errorCode)) {
524 return FALSE;
525 }
526 }
527 return TRUE;
528}
529
530UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
531 ReorderingBuffer &buffer,
532 UErrorCode &errorCode) const {
533 // Only loops for 1:1 algorithmic mappings.
534 for(;;) {
535 // get the decomposition and the lead and trail cc's
536 if(isDecompYes(norm16)) {
537 // c does not decompose
538 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
539 } else if(isHangul(norm16)) {
540 // Hangul syllable: decompose algorithmically
541 UChar jamos[3];
542 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
543 } else if(isDecompNoAlgorithmic(norm16)) {
544 c=mapAlgorithmic(c, norm16);
545 norm16=getNorm16(c);
546 } else {
547 // c decomposes, get everything from the variable-length extra data
548 const uint16_t *mapping=getMapping(norm16);
549 uint16_t firstUnit=*mapping;
550 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
551 uint8_t leadCC, trailCC;
552 trailCC=(uint8_t)(firstUnit>>8);
553 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
554 leadCC=(uint8_t)(*(mapping-1)>>8);
555 } else {
556 leadCC=0;
557 }
558 return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
559 }
560 }
561}
562
563const UChar *
564Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
565 const UChar *decomp=NULL;
566 uint16_t norm16;
567 for(;;) {
568 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
569 // c does not decompose
570 return decomp;
571 } else if(isHangul(norm16)) {
572 // Hangul syllable: decompose algorithmically
573 length=Hangul::decompose(c, buffer);
574 return buffer;
575 } else if(isDecompNoAlgorithmic(norm16)) {
576 c=mapAlgorithmic(c, norm16);
577 decomp=buffer;
578 length=0;
579 U16_APPEND_UNSAFE(buffer, length, c);
580 } else {
581 // c decomposes, get everything from the variable-length extra data
582 const uint16_t *mapping=getMapping(norm16);
583 length=*mapping&MAPPING_LENGTH_MASK;
584 return (const UChar *)mapping+1;
585 }
586 }
587}
588
589// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
590// so that a raw mapping fits that consists of one unit ("rm0")
591// plus all but the first two code units of the normal mapping.
592// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
593const UChar *
594Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
595 // We do not loop in this method because an algorithmic mapping itself
596 // becomes a final result rather than having to be decomposed recursively.
597 uint16_t norm16;
598 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
599 // c does not decompose
600 return NULL;
601 } else if(isHangul(norm16)) {
602 // Hangul syllable: decompose algorithmically
603 Hangul::getRawDecomposition(c, buffer);
604 length=2;
605 return buffer;
606 } else if(isDecompNoAlgorithmic(norm16)) {
607 c=mapAlgorithmic(c, norm16);
608 length=0;
609 U16_APPEND_UNSAFE(buffer, length, c);
610 return buffer;
611 } else {
612 // c decomposes, get everything from the variable-length extra data
613 const uint16_t *mapping=getMapping(norm16);
614 uint16_t firstUnit=*mapping;
615 int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping
616 if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
617 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
618 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
619 const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
620 uint16_t rm0=*rawMapping;
621 if(rm0<=MAPPING_LENGTH_MASK) {
622 length=rm0;
623 return (const UChar *)rawMapping-rm0;
624 } else {
625 // Copy the normal mapping and replace its first two code units with rm0.
626 buffer[0]=(UChar)rm0;
627 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
628 length=mLength-1;
629 return buffer;
630 }
631 } else {
632 length=mLength;
633 return (const UChar *)mapping+1;
634 }
635 }
636}
637
638void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
639 UBool doDecompose,
640 UnicodeString &safeMiddle,
641 ReorderingBuffer &buffer,
642 UErrorCode &errorCode) const {
643 buffer.copyReorderableSuffixTo(safeMiddle);
644 if(doDecompose) {
645 decompose(src, limit, &buffer, errorCode);
646 return;
647 }
648 // Just merge the strings at the boundary.
649 ForwardUTrie2StringIterator iter(normTrie, src, limit);
650 uint8_t firstCC, prevCC, cc;
651 firstCC=prevCC=cc=getCC(iter.next16());
652 while(cc!=0) {
653 prevCC=cc;
654 cc=getCC(iter.next16());
655 };
656 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
657 limit=u_strchr(iter.codePointStart, 0);
658 }
659
660 if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
661 buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
662 }
663}
664
665// Note: hasDecompBoundary() could be implemented as aliases to
666// hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
667// at the cost of building the FCD trie for a decomposition normalizer.
668UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
669 for(;;) {
670 if(c<minDecompNoCP) {
671 return TRUE;
672 }
673 uint16_t norm16=getNorm16(c);
674 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
675 return TRUE;
676 } else if(norm16>MIN_NORMAL_MAYBE_YES) {
677 return FALSE; // ccc!=0
678 } else if(isDecompNoAlgorithmic(norm16)) {
679 c=mapAlgorithmic(c, norm16);
680 } else {
681 // c decomposes, get everything from the variable-length extra data
682 const uint16_t *mapping=getMapping(norm16);
683 uint16_t firstUnit=*mapping;
684 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
685 return FALSE;
686 }
687 if(!before) {
688 // decomp after-boundary: same as hasFCDBoundaryAfter(),
689 // fcd16<=1 || trailCC==0
690 if(firstUnit>0x1ff) {
691 return FALSE; // trailCC>1
692 }
693 if(firstUnit<=0xff) {
694 return TRUE; // trailCC==0
695 }
696 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
697 }
698 // TRUE if leadCC==0 (hasFCDBoundaryBefore())
699 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
700 }
701 }
702}
703
704/*
 705 * Finds the recomposition result for
 706 * a forward-combining "lead" character,
 707 * specified with a pointer to its compositions list,
 708 * and a backward-combining "trail" character.
 709 *
 710 * If the lead and trail characters combine, then this function returns
 711 * the following "compositeAndFwd" value:
 712 * Bits 21..1 composite character
 713 * Bit 0 set if the composite is a forward-combining starter
 714 * otherwise it returns -1.
 715 *
 716 * The compositions list has (trail, compositeAndFwd) pair entries,
 717 * encoded as either pairs or triples of 16-bit units.
 718 * The last entry has the high bit of its first unit set.
 719 *
 720 * The list is sorted by ascending trail characters (there are no duplicates).
 721 * A linear search is used.
 722 *
 723 * See normalizer2impl.h for a more detailed description
 724 * of the compositions list format.
 725 */
// Callers in this file (recompose(), composePair()) pass a non-NULL list;
// composePair() additionally range-checks trail because a valid code point is required.
726int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
727 uint16_t key1, firstUnit;
728 if(trail<COMP_1_TRAIL_LIMIT) {
729 // trail character is 0..33FF
730 // result entry may have 2 or 3 units
731 key1=(uint16_t)(trail<<1);
// Skip entries with smaller keys; each entry is 2 units, or 3 if COMP_1_TRIPLE is set.
732 while(key1>(firstUnit=*list)) {
733 list+=2+(firstUnit&COMP_1_TRIPLE);
734 }
735 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
736 if(firstUnit&COMP_1_TRIPLE) {
737 return ((int32_t)list[1]<<16)|list[2];
738 } else {
739 return list[1];
740 }
741 }
742 } else {
743 // trail character is 3400..10FFFF
744 // result entry has 3 units
// The trail character is split across two keys: key1 is matched against the
// first unit and key2 against the high bits of the second unit.
745 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
746 (((trail>>COMP_1_TRAIL_SHIFT))&
747 ~COMP_1_TRIPLE));
748 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
749 uint16_t secondUnit;
750 for(;;) {
751 if(key1>(firstUnit=*list)) {
752 list+=2+(firstUnit&COMP_1_TRIPLE);
753 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
754 if(key2>(secondUnit=list[1])) {
755 if(firstUnit&COMP_1_LAST_TUPLE) {
756 break;
757 } else {
758 list+=3;
759 }
760 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
761 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
762 } else {
763 break;
764 }
765 } else {
766 break;
767 }
768 }
769 }
// No entry for this trail character.
770 return -1;
771}
772
773/**
774 * @param list some character's compositions list
775 * @param set recursively receives the composites from these compositions
776 */
777void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
778 uint16_t firstUnit;
779 int32_t compositeAndFwd;
780 do {
781 firstUnit=*list;
782 if((firstUnit&COMP_1_TRIPLE)==0) {
783 compositeAndFwd=list[1];
784 list+=2;
785 } else {
786 compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
787 list+=3;
788 }
789 UChar32 composite=compositeAndFwd>>1;
790 if((compositeAndFwd&1)!=0) {
791 addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
792 }
793 set.add(composite);
794 } while((firstUnit&COMP_1_LAST_TUPLE)==0);
795}
796
797/*
 798 * Recomposes the buffer text starting at recomposeStartIndex
 799 * (which is in NFD - decomposed and canonically ordered),
 800 * and truncates the buffer contents.
 801 *
 802 * Note that recomposition never lengthens the text:
 803 * Any character consists of either one or two code units;
 804 * a composition may contain at most one more code unit than the original starter,
 805 * while the combining mark that is removed has at least one code unit.
 806 */
// Parameters:
//   buffer              text is modified in place; its limit is updated at the end
//   recomposeStartIndex code unit offset from buffer.getStart() where recomposition begins
//   onlyContiguous      if set, any non-combining mark in between blocks later compositions (FCC)
807void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
808 UBool onlyContiguous) const {
809 UChar *p=buffer.getStart()+recomposeStartIndex;
810 UChar *limit=buffer.getLimit();
811 if(p==limit) {
812 return;
813 }
814
815 UChar *starter, *pRemove, *q, *r;
816 const uint16_t *compositionsList;
817 UChar32 c, compositeAndFwd;
818 uint16_t norm16;
819 uint8_t cc, prevCC;
820 UBool starterIsSupplementary;
821
822 // Some of the following variables are not used until we have a forward-combining starter
823 // and are only initialized now to avoid compiler warnings.
824 compositionsList=NULL; // used as indicator for whether we have a forward-combining starter
825 starter=NULL;
826 starterIsSupplementary=FALSE;
827 prevCC=0;
828
829 for(;;) {
// Read the next code point c; p then points after it.
830 UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
831 cc=getCCFromYesOrMaybe(norm16);
832 if( // this character combines backward and
833 isMaybe(norm16) &&
834 // we have seen a starter that combines forward and
835 compositionsList!=NULL &&
836 // the backward-combining character is not blocked
837 (prevCC<cc || prevCC==0)
838 ) {
839 if(isJamoVT(norm16)) {
840 // c is a Jamo V/T, see if we can compose it with the previous character.
841 if(c<Hangul::JAMO_T_BASE) {
842 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
// The unsigned subtraction makes any starter below JAMO_L_BASE wrap to a large value and fail the range test.
843 UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
844 if(prev<Hangul::JAMO_L_COUNT) {
845 pRemove=p-1;
846 UChar syllable=(UChar)
847 (Hangul::HANGUL_BASE+
848 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
849 Hangul::JAMO_T_COUNT);
850 UChar t;
851 if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
852 ++p;
853 syllable+=t; // The next character was a Jamo T.
854 }
855 *starter=syllable;
856 // remove the Jamo V/T
857 q=pRemove;
858 r=p;
859 while(r<limit) {
860 *q++=*r++;
861 }
862 limit=q;
863 p=pRemove;
864 }
865 }
866 /*
 867 * No "else" for Jamo T:
 868 * Since the input is in NFD, there are no Hangul LV syllables that
 869 * a Jamo T could combine with.
 870 * All Jamo Ts are combined above when handling Jamo Vs.
 871 */
872 if(p==limit) {
873 break;
874 }
875 compositionsList=NULL;
876 continue;
877 } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
878 // The starter and the combining mark (c) do combine.
879 UChar32 composite=compositeAndFwd>>1;
880
881 // Replace the starter with the composite, remove the combining mark.
882 pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark
883 if(starterIsSupplementary) {
884 if(U_IS_SUPPLEMENTARY(composite)) {
885 // both are supplementary
886 starter[0]=U16_LEAD(composite);
887 starter[1]=U16_TRAIL(composite);
888 } else {
889 *starter=(UChar)composite;
890 // The composite is shorter than the starter,
891 // move the intermediate characters forward one.
892 starterIsSupplementary=FALSE;
893 q=starter+1;
894 r=q+1;
895 while(r<pRemove) {
896 *q++=*r++;
897 }
898 --pRemove;
899 }
900 } else if(U_IS_SUPPLEMENTARY(composite)) {
901 // The composite is longer than the starter,
902 // move the intermediate characters back one.
903 starterIsSupplementary=TRUE;
904 ++starter; // temporarily increment for the loop boundary
905 q=pRemove;
906 r=++pRemove;
907 while(starter<q) {
908 *--r=*--q;
909 }
910 *starter=U16_TRAIL(composite);
911 *--starter=U16_LEAD(composite); // undo the temporary increment
912 } else {
913 // both are on the BMP
914 *starter=(UChar)composite;
915 }
916
917 /* remove the combining mark by moving the following text over it */
// pRemove==p here means the gap was already consumed by a longer composite above.
918 if(pRemove<p) {
919 q=pRemove;
920 r=p;
921 while(r<limit) {
922 *q++=*r++;
923 }
924 limit=q;
925 p=pRemove;
926 }
927 // Keep prevCC because we removed the combining mark.
928
929 if(p==limit) {
930 break;
931 }
932 // Is the composite a starter that combines forward?
933 if(compositeAndFwd&1) {
934 compositionsList=
935 getCompositionsListForComposite(getNorm16(composite));
936 } else {
937 compositionsList=NULL;
938 }
939
940 // We combined; continue with looking for compositions.
941 continue;
942 }
943 }
944
945 // no combination this time
946 prevCC=cc;
947 if(p==limit) {
948 break;
949 }
950
951 // If c did not combine, then check if it is a starter.
952 if(cc==0) {
953 // Found a new starter.
954 if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
955 // It may combine with something, prepare for it.
// p is already past c, so the starter begins one or two code units before p.
956 if(U_IS_BMP(c)) {
957 starterIsSupplementary=FALSE;
958 starter=p-1;
959 } else {
960 starterIsSupplementary=TRUE;
961 starter=p-2;
962 }
963 }
964 } else if(onlyContiguous) {
965 // FCC: no discontiguous compositions; any intervening character blocks.
966 compositionsList=NULL;
967 }
968 }
// Publish the (possibly shortened) end of the text back to the buffer.
969 buffer.setReorderingLimit(limit);
970}
971
972UChar32
973Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
974 uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0
975 const uint16_t *list;
976 if(isInert(norm16)) {
977 return U_SENTINEL;
978 } else if(norm16<minYesNoMappingsOnly) {
979 if(isJamoL(norm16)) {
980 b-=Hangul::JAMO_V_BASE;
981 if(0<=b && b<Hangul::JAMO_V_COUNT) {
982 return
983 (Hangul::HANGUL_BASE+
984 ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
985 Hangul::JAMO_T_COUNT);
986 } else {
987 return U_SENTINEL;
988 }
989 } else if(isHangul(norm16)) {
990 b-=Hangul::JAMO_T_BASE;
991 if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) { // not b==0!
992 return a+b;
993 } else {
994 return U_SENTINEL;
995 }
996 } else {
997 // 'a' has a compositions list in extraData
998 list=extraData+norm16;
999 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list
1000 list+= // mapping pointer
1001 1+ // +1 to skip the first unit with the mapping lenth
1002 (*list&MAPPING_LENGTH_MASK); // + mapping length
1003 }
1004 }
1005 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
1006 return U_SENTINEL;
1007 } else {
1008 list=maybeYesCompositions+norm16-minMaybeYes;
1009 }
1010 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b
1011 return U_SENTINEL;
1012 }
1013#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1014 return combine(list, b)>>1;
1015#else
1016 int32_t compositeAndFwd=combine(list, b);
1017 return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
1018#endif
1019}
1020
1021// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1022// doCompose: normalize
1023// !doCompose: isNormalized (buffer must be empty and initialized)
1024UBool
1025Normalizer2Impl::compose(const UChar *src, const UChar *limit,
1026 UBool onlyContiguous,
1027 UBool doCompose,
1028 ReorderingBuffer &buffer,
1029 UErrorCode &errorCode) const {
1030 /*
1031 * prevBoundary points to the last character before the current one
1032 * that has a composition boundary before it with ccc==0 and quick check "yes".
1033 * Keeping track of prevBoundary saves us looking for a composition boundary
1034 * when we find a "no" or "maybe".
1035 *
1036 * When we back out from prevSrc back to prevBoundary,
1037 * then we also remove those same characters (which had been simply copied
1038 * or canonically-order-inserted) from the ReorderingBuffer.
1039 * Therefore, at all times, the [prevBoundary..prevSrc[ source units
1040 * must correspond 1:1 to destination units at the end of the destination buffer.
1041 */
1042 const UChar *prevBoundary=src;
1043 UChar32 minNoMaybeCP=minCompNoMaybeCP;
1044 if(limit==NULL) {
1045 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
1046 doCompose ? &buffer : NULL,
1047 errorCode);
1048 if(U_FAILURE(errorCode)) {
1049 return FALSE;
1050 }
1051 if(prevBoundary<src) {
1052 // Set prevBoundary to the last character in the prefix.
1053 prevBoundary=src-1;
1054 }
1055 limit=u_strchr(src, 0);
1056 }
1057
1058 const UChar *prevSrc;
1059 UChar32 c=0;
1060 uint16_t norm16=0;
1061
1062 // only for isNormalized
1063 uint8_t prevCC=0;
1064
1065 for(;;) {
1066 // count code units below the minimum or with irrelevant data for the quick check
1067 for(prevSrc=src; src!=limit;) {
1068 if( (c=*src)<minNoMaybeCP ||
1069 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1070 ) {
1071 ++src;
1072 } else if(!U16_IS_SURROGATE(c)) {
1073 break;
1074 } else {
1075 UChar c2;
1076 if(U16_IS_SURROGATE_LEAD(c)) {
1077 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1078 c=U16_GET_SUPPLEMENTARY(c, c2);
1079 }
1080 } else /* trail surrogate */ {
1081 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1082 --src;
1083 c=U16_GET_SUPPLEMENTARY(c2, c);
1084 }
1085 }
1086 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1087 src+=U16_LENGTH(c);
1088 } else {
1089 break;
1090 }
1091 }
1092 }
1093 // copy these code units all at once
1094 if(src!=prevSrc) {
1095 if(doCompose) {
1096 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
1097 break;
1098 }
1099 } else {
1100 prevCC=0;
1101 }
1102 if(src==limit) {
1103 break;
1104 }
1105 // Set prevBoundary to the last character in the quick check loop.
1106 prevBoundary=src-1;
1107 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
1108 U16_IS_LEAD(*(prevBoundary-1))
1109 ) {
1110 --prevBoundary;
1111 }
1112 // The start of the current character (c).
1113 prevSrc=src;
1114 } else if(src==limit) {
1115 break;
1116 }
1117
1118 src+=U16_LENGTH(c);
1119 /*
1120 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1121 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1122 * or has ccc!=0.
1123 * Check for Jamo V/T, then for regular characters.
1124 * c is not a Hangul syllable or Jamo L because those have "yes" properties.
1125 */
1126 if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
1127 UChar prev=*(prevSrc-1);
1128 UBool needToDecompose=FALSE;
1129 if(c<Hangul::JAMO_T_BASE) {
1130 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1131 prev=(UChar)(prev-Hangul::JAMO_L_BASE);
1132 if(prev<Hangul::JAMO_L_COUNT) {
1133 if(!doCompose) {
1134 return FALSE;
1135 }
1136 UChar syllable=(UChar)
1137 (Hangul::HANGUL_BASE+
1138 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
1139 Hangul::JAMO_T_COUNT);
1140 UChar t;
1141 if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
1142 ++src;
1143 syllable+=t; // The next character was a Jamo T.
1144 prevBoundary=src;
1145 buffer.setLastChar(syllable);
1146 continue;
1147 }
1148 // If we see L+V+x where x!=T then we drop to the slow path,
1149 // decompose and recompose.
1150 // This is to deal with NFKC finding normal L and V but a
1151 // compatibility variant of a T. We need to either fully compose that
1152 // combination here (which would complicate the code and may not work
1153 // with strange custom data) or use the slow path -- or else our replacing
1154 // two input characters (L+V) with one output character (LV syllable)
1155 // would violate the invariant that [prevBoundary..prevSrc[ has the same
1156 // length as what we appended to the buffer since prevBoundary.
1157 needToDecompose=TRUE;
1158 }
1159 } else if(Hangul::isHangulWithoutJamoT(prev)) {
1160 // c is a Jamo Trailing consonant,
1161 // compose with previous Hangul LV that does not contain a Jamo T.
1162 if(!doCompose) {
1163 return FALSE;
1164 }
1165 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
1166 prevBoundary=src;
1167 continue;
1168 }
1169 if(!needToDecompose) {
1170 // The Jamo V/T did not compose into a Hangul syllable.
1171 if(doCompose) {
1172 if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
1173 break;
1174 }
1175 } else {
1176 prevCC=0;
1177 }
1178 continue;
1179 }
1180 }
1181 /*
1182 * Source buffer pointers:
1183 *
1184 * all done quick check current char not yet
1185 * "yes" but (c) processed
1186 * may combine
1187 * forward
1188 * [-------------[-------------[-------------[-------------[
1189 * | | | | |
1190 * orig. src prevBoundary prevSrc src limit
1191 *
1192 *
1193 * Destination buffer pointers inside the ReorderingBuffer:
1194 *
1195 * all done might take not filled yet
1196 * characters for
1197 * reordering
1198 * [-------------[-------------[-------------[
1199 * | | | |
1200 * start reorderStart limit |
1201 * +remainingCap.+
1202 */
1203 if(norm16>=MIN_YES_YES_WITH_CC) {
1204 uint8_t cc=(uint8_t)norm16; // cc!=0
1205 if( onlyContiguous && // FCC
1206 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
1207 prevBoundary<prevSrc &&
1208 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
1209 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1210 // passed the quick check "yes && ccc==0" test.
1211 // Check whether the last character was a "yesYes" or a "yesNo".
1212 // If a "yesNo", then we get its trailing ccc from its
1213 // mapping and check for canonical order.
1214 // All other cases are ok.
1215 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1216 ) {
1217 // Fails FCD test, need to decompose and contiguously recompose.
1218 if(!doCompose) {
1219 return FALSE;
1220 }
1221 } else if(doCompose) {
1222 if(!buffer.append(c, cc, errorCode)) {
1223 break;
1224 }
1225 continue;
1226 } else if(prevCC<=cc) {
1227 prevCC=cc;
1228 continue;
1229 } else {
1230 return FALSE;
1231 }
1232 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
1233 return FALSE;
1234 }
1235
1236 /*
1237 * Find appropriate boundaries around this character,
1238 * decompose the source text from between the boundaries,
1239 * and recompose it.
1240 *
1241 * We may need to remove the last few characters from the ReorderingBuffer
1242 * to account for source text that was copied or appended
1243 * but needs to take part in the recomposition.
1244 */
1245
1246 /*
1247 * Find the last composition boundary in [prevBoundary..src[.
1248 * It is either the decomposition of the current character (at prevSrc),
1249 * or prevBoundary.
1250 */
1251 if(hasCompBoundaryBefore(c, norm16)) {
1252 prevBoundary=prevSrc;
1253 } else if(doCompose) {
1254 buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
1255 }
1256
1257 // Find the next composition boundary in [src..limit[ -
1258 // modifies src to point to the next starter.
1259 src=(UChar *)findNextCompBoundary(src, limit);
1260
1261 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
1262 int32_t recomposeStartIndex=buffer.length();
1263 if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
1264 break;
1265 }
1266 recompose(buffer, recomposeStartIndex, onlyContiguous);
1267 if(!doCompose) {
1268 if(!buffer.equals(prevBoundary, src)) {
1269 return FALSE;
1270 }
1271 buffer.remove();
1272 prevCC=0;
1273 }
1274
1275 // Move to the next starter. We never need to look back before this point again.
1276 prevBoundary=src;
1277 }
1278 return TRUE;
1279}
1280
1281// Very similar to compose(): Make the same changes in both places if relevant.
1282// pQCResult==NULL: spanQuickCheckYes
1283// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
1284const UChar *
1285Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
1286 UBool onlyContiguous,
1287 UNormalizationCheckResult *pQCResult) const {
1288 /*
1289 * prevBoundary points to the last character before the current one
1290 * that has a composition boundary before it with ccc==0 and quick check "yes".
1291 */
1292 const UChar *prevBoundary=src;
1293 UChar32 minNoMaybeCP=minCompNoMaybeCP;
1294 if(limit==NULL) {
1295 UErrorCode errorCode=U_ZERO_ERROR;
1296 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
1297 if(prevBoundary<src) {
1298 // Set prevBoundary to the last character in the prefix.
1299 prevBoundary=src-1;
1300 }
1301 limit=u_strchr(src, 0);
1302 }
1303
1304 const UChar *prevSrc;
1305 UChar32 c=0;
1306 uint16_t norm16=0;
1307 uint8_t prevCC=0;
1308
1309 for(;;) {
1310 // count code units below the minimum or with irrelevant data for the quick check
1311 for(prevSrc=src;;) {
1312 if(src==limit) {
1313 return src;
1314 }
1315 if( (c=*src)<minNoMaybeCP ||
1316 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1317 ) {
1318 ++src;
1319 } else if(!U16_IS_SURROGATE(c)) {
1320 break;
1321 } else {
1322 UChar c2;
1323 if(U16_IS_SURROGATE_LEAD(c)) {
1324 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1325 c=U16_GET_SUPPLEMENTARY(c, c2);
1326 }
1327 } else /* trail surrogate */ {
1328 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1329 --src;
1330 c=U16_GET_SUPPLEMENTARY(c2, c);
1331 }
1332 }
1333 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1334 src+=U16_LENGTH(c);
1335 } else {
1336 break;
1337 }
1338 }
1339 }
1340 if(src!=prevSrc) {
1341 // Set prevBoundary to the last character in the quick check loop.
1342 prevBoundary=src-1;
1343 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
1344 U16_IS_LEAD(*(prevBoundary-1))
1345 ) {
1346 --prevBoundary;
1347 }
1348 prevCC=0;
1349 // The start of the current character (c).
1350 prevSrc=src;
1351 }
1352
1353 src+=U16_LENGTH(c);
1354 /*
1355 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1356 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1357 * or has ccc!=0.
1358 */
1359 if(isMaybeOrNonZeroCC(norm16)) {
1360 uint8_t cc=getCCFromYesOrMaybe(norm16);
1361 if( onlyContiguous && // FCC
1362 cc!=0 &&
1363 prevCC==0 &&
1364 prevBoundary<prevSrc &&
1365 // prevCC==0 && prevBoundary<prevSrc tell us that
1366 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1367 // passed the quick check "yes && ccc==0" test.
1368 // Check whether the last character was a "yesYes" or a "yesNo".
1369 // If a "yesNo", then we get its trailing ccc from its
1370 // mapping and check for canonical order.
1371 // All other cases are ok.
1372 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1373 ) {
1374 // Fails FCD test.
1375 } else if(prevCC<=cc || cc==0) {
1376 prevCC=cc;
1377 if(norm16<MIN_YES_YES_WITH_CC) {
1378 if(pQCResult!=NULL) {
1379 *pQCResult=UNORM_MAYBE;
1380 } else {
1381 return prevBoundary;
1382 }
1383 }
1384 continue;
1385 }
1386 }
1387 if(pQCResult!=NULL) {
1388 *pQCResult=UNORM_NO;
1389 }
1390 return prevBoundary;
1391 }
1392}
1393
1394void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1395 UBool doCompose,
1396 UBool onlyContiguous,
1397 UnicodeString &safeMiddle,
1398 ReorderingBuffer &buffer,
1399 UErrorCode &errorCode) const {
1400 if(!buffer.isEmpty()) {
1401 const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
1402 if(src!=firstStarterInSrc) {
1403 const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1404 buffer.getLimit());
1405 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1406 UnicodeString middle(lastStarterInDest, destSuffixLength);
1407 buffer.removeSuffix(destSuffixLength);
1408 safeMiddle=middle;
1409 middle.append(src, (int32_t)(firstStarterInSrc-src));
1410 const UChar *middleStart=middle.getBuffer();
1411 compose(middleStart, middleStart+middle.length(), onlyContiguous,
1412 TRUE, buffer, errorCode);
1413 if(U_FAILURE(errorCode)) {
1414 return;
1415 }
1416 src=firstStarterInSrc;
1417 }
1418 }
1419 if(doCompose) {
1420 compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1421 } else {
1422 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
1423 limit=u_strchr(src, 0);
1424 }
1425 buffer.appendZeroCC(src, limit, errorCode);
1426 }
1427}
1428
1429/**
1430 * Does c have a composition boundary before it?
1431 * True if its decomposition begins with a character that has
1432 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1433 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1434 * (isCompYesAndZeroCC()) so we need not decompose.
1435 */
1436UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
1437 for(;;) {
1438 if(isCompYesAndZeroCC(norm16)) {
1439 return TRUE;
1440 } else if(isMaybeOrNonZeroCC(norm16)) {
1441 return FALSE;
1442 } else if(isDecompNoAlgorithmic(norm16)) {
1443 c=mapAlgorithmic(c, norm16);
1444 norm16=getNorm16(c);
1445 } else {
1446 // c decomposes, get everything from the variable-length extra data
1447 const uint16_t *mapping=getMapping(norm16);
1448 uint16_t firstUnit=*mapping;
1449 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1450 return FALSE;
1451 }
1452 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
1453 return FALSE; // non-zero leadCC
1454 }
1455 int32_t i=1; // skip over the firstUnit
1456 UChar32 c;
1457 U16_NEXT_UNSAFE(mapping, i, c);
1458 return isCompYesAndZeroCC(getNorm16(c));
1459 }
1460 }
1461}
1462
1463UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
1464 for(;;) {
1465 uint16_t norm16=getNorm16(c);
1466 if(isInert(norm16)) {
1467 return TRUE;
1468 } else if(norm16<=minYesNo) {
1469 // Hangul: norm16==minYesNo
1470 // Hangul LVT has a boundary after it.
1471 // Hangul LV and non-inert yesYes characters combine forward.
1472 return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
1473 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
1474 return FALSE;
1475 } else if(isDecompNoAlgorithmic(norm16)) {
1476 c=mapAlgorithmic(c, norm16);
1477 } else {
1478 // c decomposes, get everything from the variable-length extra data.
1479 // If testInert, then c must be a yesNo character which has lccc=0,
1480 // otherwise it could be a noNo.
1481 const uint16_t *mapping=getMapping(norm16);
1482 uint16_t firstUnit=*mapping;
1483 // TRUE if
1484 // not MAPPING_NO_COMP_BOUNDARY_AFTER
1485 // (which is set if
1486 // c is not deleted, and
1487 // it and its decomposition do not combine forward, and it has a starter)
1488 // and if FCC then trailCC<=1
1489 return
1490 (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
1491 (!onlyContiguous || firstUnit<=0x1ff);
1492 }
1493 }
1494}
1495
1496const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
1497 BackwardUTrie2StringIterator iter(normTrie, start, p);
1498 uint16_t norm16;
1499 do {
1500 norm16=iter.previous16();
1501 } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1502 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
1503 // but that's probably not worth the extra cost.
1504 return iter.codePointStart;
1505}
1506
1507const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
1508 ForwardUTrie2StringIterator iter(normTrie, p, limit);
1509 uint16_t norm16;
1510 do {
1511 norm16=iter.next16();
1512 } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1513 return iter.codePointStart;
1514}
1515
1516// Note: normalizer2impl.cpp r30982 (2011-nov-27)
1517// still had getFCDTrie() which built and cached an FCD trie.
1518// That provided faster access to FCD data than getFCD16FromNormData()
1519// but required synchronization and consumed some 10kB of heap memory
1520// in any process that uses FCD (e.g., via collation).
1521// tccc180[] and smallFCD[] are intended to help with any loss of performance,
1522// at least for Latin & CJK.
1523
1524// Gets the FCD value from the regular normalization data.
1525uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
1526 // Only loops for 1:1 algorithmic mappings.
1527 for(;;) {
1528 uint16_t norm16=getNorm16(c);
1529 if(norm16<=minYesNo) {
1530 // no decomposition or Hangul syllable, all zeros
1531 return 0;
1532 } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
1533 // combining mark
1534 norm16&=0xff;
1535 return norm16|(norm16<<8);
1536 } else if(norm16>=minMaybeYes) {
1537 return 0;
1538 } else if(isDecompNoAlgorithmic(norm16)) {
1539 c=mapAlgorithmic(c, norm16);
1540 } else {
1541 // c decomposes, get everything from the variable-length extra data
1542 const uint16_t *mapping=getMapping(norm16);
1543 uint16_t firstUnit=*mapping;
1544 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1545 // A character that is deleted (maps to an empty string) must
1546 // get the worst-case lccc and tccc values because arbitrary
1547 // characters on both sides will become adjacent.
1548 return 0x1ff;
1549 } else {
1550 norm16=firstUnit>>8; // tccc
1551 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
1552 norm16|=*(mapping-1)&0xff00; // lccc
1553 }
1554 return norm16;
1555 }
1556 }
1557 }
1558}
1559
1560// Dual functionality:
1561// buffer!=NULL: normalize
1562// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1563const UChar *
1564Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
1565 ReorderingBuffer *buffer,
1566 UErrorCode &errorCode) const {
1567 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1568 // Similar to the prevBoundary in the compose() implementation.
1569 const UChar *prevBoundary=src;
1570 int32_t prevFCD16=0;
1571 if(limit==NULL) {
1572 src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
1573 if(U_FAILURE(errorCode)) {
1574 return src;
1575 }
1576 if(prevBoundary<src) {
1577 prevBoundary=src;
1578 // We know that the previous character's lccc==0.
1579 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1580 prevFCD16=getFCD16(*(src-1));
1581 if(prevFCD16>1) {
1582 --prevBoundary;
1583 }
1584 }
1585 limit=u_strchr(src, 0);
1586 }
1587
1588 // Note: In this function we use buffer->appendZeroCC() because we track
1589 // the lead and trail combining classes here, rather than leaving it to
1590 // the ReorderingBuffer.
1591 // The exception is the call to decomposeShort() which uses the buffer
1592 // in the normal way.
1593
1594 const UChar *prevSrc;
1595 UChar32 c=0;
1596 uint16_t fcd16=0;
1597
1598 for(;;) {
1599 // count code units with lccc==0
1600 for(prevSrc=src; src!=limit;) {
1601 if((c=*src)<MIN_CCC_LCCC_CP) {
1602 prevFCD16=~c;
1603 ++src;
1604 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1605 prevFCD16=0;
1606 ++src;
1607 } else {
1608 if(U16_IS_SURROGATE(c)) {
1609 UChar c2;
1610 if(U16_IS_SURROGATE_LEAD(c)) {
1611 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1612 c=U16_GET_SUPPLEMENTARY(c, c2);
1613 }
1614 } else /* trail surrogate */ {
1615 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1616 --src;
1617 c=U16_GET_SUPPLEMENTARY(c2, c);
1618 }
1619 }
1620 }
1621 if((fcd16=getFCD16FromNormData(c))<=0xff) {
1622 prevFCD16=fcd16;
1623 src+=U16_LENGTH(c);
1624 } else {
1625 break;
1626 }
1627 }
1628 }
1629 // copy these code units all at once
1630 if(src!=prevSrc) {
1631 if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
1632 break;
1633 }
1634 if(src==limit) {
1635 break;
1636 }
1637 prevBoundary=src;
1638 // We know that the previous character's lccc==0.
1639 if(prevFCD16<0) {
1640 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1641 UChar32 prev=~prevFCD16;
1642 prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
1643 if(prevFCD16>1) {
1644 --prevBoundary;
1645 }
1646 } else {
1647 const UChar *p=src-1;
1648 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
1649 --p;
1650 // Need to fetch the previous character's FCD value because
1651 // prevFCD16 was just for the trail surrogate code point.
1652 prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
1653 // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1654 }
1655 if(prevFCD16>1) {
1656 prevBoundary=p;
1657 }
1658 }
1659 // The start of the current character (c).
1660 prevSrc=src;
1661 } else if(src==limit) {
1662 break;
1663 }
1664
1665 src+=U16_LENGTH(c);
1666 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1667 // Check for proper order, and decompose locally if necessary.
1668 if((prevFCD16&0xff)<=(fcd16>>8)) {
1669 // proper order: prev tccc <= current lccc
1670 if((fcd16&0xff)<=1) {
1671 prevBoundary=src;
1672 }
1673 if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
1674 break;
1675 }
1676 prevFCD16=fcd16;
1677 continue;
1678 } else if(buffer==NULL) {
1679 return prevBoundary; // quick check "no"
1680 } else {
1681 /*
1682 * Back out the part of the source that we copied or appended
1683 * already but is now going to be decomposed.
1684 * prevSrc is set to after what was copied/appended.
1685 */
1686 buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
1687 /*
1688 * Find the part of the source that needs to be decomposed,
1689 * up to the next safe boundary.
1690 */
1691 src=findNextFCDBoundary(src, limit);
1692 /*
1693 * The source text does not fulfill the conditions for FCD.
1694 * Decompose and reorder a limited piece of the text.
1695 */
1696 if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
1697 break;
1698 }
1699 prevBoundary=src;
1700 prevFCD16=0;
1701 }
1702 }
1703 return src;
1704}
1705
1706void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
1707 UBool doMakeFCD,
1708 UnicodeString &safeMiddle,
1709 ReorderingBuffer &buffer,
1710 UErrorCode &errorCode) const {
1711 if(!buffer.isEmpty()) {
1712 const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
1713 if(src!=firstBoundaryInSrc) {
1714 const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
1715 buffer.getLimit());
1716 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
1717 UnicodeString middle(lastBoundaryInDest, destSuffixLength);
1718 buffer.removeSuffix(destSuffixLength);
1719 safeMiddle=middle;
1720 middle.append(src, (int32_t)(firstBoundaryInSrc-src));
1721 const UChar *middleStart=middle.getBuffer();
1722 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
1723 if(U_FAILURE(errorCode)) {
1724 return;
1725 }
1726 src=firstBoundaryInSrc;
1727 }
1728 }
1729 if(doMakeFCD) {
1730 makeFCD(src, limit, &buffer, errorCode);
1731 } else {
1732 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
1733 limit=u_strchr(src, 0);
1734 }
1735 buffer.appendZeroCC(src, limit, errorCode);
1736 }
1737}
1738
1739const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
1740 while(start<p && previousFCD16(start, p)>0xff) {}
1741 return p;
1742}
1743
1744const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
1745 while(p<limit) {
1746 const UChar *codePointStart=p;
1747 if(nextFCD16(p, limit)<=0xff) {
1748 return codePointStart;
1749 }
1750 }
1751 return p;
1752}
1753
1754// CanonicalIterator data -------------------------------------------------- ***
1755
1756CanonIterData::CanonIterData(UErrorCode &errorCode) :
1757 trie(utrie2_open(0, 0, &errorCode)),
1758 canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
1759
1760CanonIterData::~CanonIterData() {
1761 utrie2_close(trie);
1762}
1763
1764void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
1765 uint32_t canonValue=utrie2_get32(trie, decompLead);
1766 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
1767 // origin is the first character whose decomposition starts with
1768 // the character for which we are setting the value.
1769 utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
1770 } else {
1771 // origin is not the first character, or it is U+0000.
1772 UnicodeSet *set;
1773 if((canonValue&CANON_HAS_SET)==0) {
1774 set=new UnicodeSet;
1775 if(set==NULL) {
1776 errorCode=U_MEMORY_ALLOCATION_ERROR;
1777 return;
1778 }
1779 UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
1780 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
1781 utrie2_set32(trie, decompLead, canonValue, &errorCode);
1782 canonStartSets.addElement(set, errorCode);
1783 if(firstOrigin!=0) {
1784 set->add(firstOrigin);
1785 }
1786 } else {
1787 set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
1788 }
1789 set->add(origin);
1790 }
1791}
1792
1793U_CDECL_BEGIN
1794
1795// Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
1796// context: the Normalizer2Impl
1797static UBool U_CALLCONV
1798enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1799 UErrorCode errorCode = U_ZERO_ERROR;
1800 if (value != 0) {
1801 Normalizer2Impl *impl = (Normalizer2Impl *)context;
1802 impl->makeCanonIterDataFromNorm16(
1803 start, end, (uint16_t)value, *impl->fCanonIterData, errorCode);
1804 }
1805 return U_SUCCESS(errorCode);
1806}
1807
1808
1809
1810// UInitOnce instantiation function for CanonIterData
1811
1812static void U_CALLCONV
1813initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
1814 U_ASSERT(impl->fCanonIterData == NULL);
1815 impl->fCanonIterData = new CanonIterData(errorCode);
1816 if (impl->fCanonIterData == NULL) {
1817 errorCode=U_MEMORY_ALLOCATION_ERROR;
1818 }
1819 if (U_SUCCESS(errorCode)) {
1820 utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl);
1821 utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
1822 }
1823 if (U_FAILURE(errorCode)) {
1824 delete impl->fCanonIterData;
1825 impl->fCanonIterData = NULL;
1826 }
1827}
1828
1829U_CDECL_END
1830
1831void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
1832 CanonIterData &newData,
1833 UErrorCode &errorCode) const {
1834 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
1835 // Inert, or 2-way mapping (including Hangul syllable).
1836 // We do not write a canonStartSet for any yesNo character.
1837 // Composites from 2-way mappings are added at runtime from the
1838 // starter's compositions list, and the other characters in
1839 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
1840 // "maybe" characters.
1841 return;
1842 }
1843 for(UChar32 c=start; c<=end; ++c) {
1844 uint32_t oldValue=utrie2_get32(newData.trie, c);
1845 uint32_t newValue=oldValue;
1846 if(norm16>=minMaybeYes) {
1847 // not a segment starter if it occurs in a decomposition or has cc!=0
1848 newValue|=CANON_NOT_SEGMENT_STARTER;
1849 if(norm16<MIN_NORMAL_MAYBE_YES) {
1850 newValue|=CANON_HAS_COMPOSITIONS;
1851 }
1852 } else if(norm16<minYesNo) {
1853 newValue|=CANON_HAS_COMPOSITIONS;
1854 } else {
1855 // c has a one-way decomposition
1856 UChar32 c2=c;
1857 uint16_t norm16_2=norm16;
1858 while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
1859 c2=mapAlgorithmic(c2, norm16_2);
1860 norm16_2=getNorm16(c2);
1861 }
1862 if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
1863 // c decomposes, get everything from the variable-length extra data
1864 const uint16_t *mapping=getMapping(norm16_2);
1865 uint16_t firstUnit=*mapping;
1866 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
1867 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1868 if(c==c2 && (*(mapping-1)&0xff)!=0) {
1869 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
1870 }
1871 }
1872 // Skip empty mappings (no characters in the decomposition).
1873 if(length!=0) {
1874 ++mapping; // skip over the firstUnit
1875 // add c to first code point's start set
1876 int32_t i=0;
1877 U16_NEXT_UNSAFE(mapping, i, c2);
1878 newData.addToStartSet(c, c2, errorCode);
1879 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
1880 // one-way mapping. A 2-way mapping is possible here after
1881 // intermediate algorithmic mapping.
1882 if(norm16_2>=minNoNo) {
1883 while(i<length) {
1884 U16_NEXT_UNSAFE(mapping, i, c2);
1885 uint32_t c2Value=utrie2_get32(newData.trie, c2);
1886 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
1887 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
1888 &errorCode);
1889 }
1890 }
1891 }
1892 }
1893 } else {
1894 // c decomposed to c2 algorithmically; c has cc==0
1895 newData.addToStartSet(c, c2, errorCode);
1896 }
1897 }
1898 if(newValue!=oldValue) {
1899 utrie2_set32(newData.trie, c, newValue, &errorCode);
1900 }
1901 }
1902}
1903
1904UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
1905 // Logically const: Synchronized instantiation.
1906 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
1907 umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
1908 return U_SUCCESS(errorCode);
1909}
1910
1911int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
1912 return (int32_t)utrie2_get32(fCanonIterData->trie, c);
1913}
1914
1915const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
1916 return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
1917}
1918
1919UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
1920 return getCanonValue(c)>=0;
1921}
1922
1923UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
1924 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
1925 if(canonValue==0) {
1926 return FALSE;
1927 }
1928 set.clear();
1929 int32_t value=canonValue&CANON_VALUE_MASK;
1930 if((canonValue&CANON_HAS_SET)!=0) {
1931 set.addAll(getCanonStartSet(value));
1932 } else if(value!=0) {
1933 set.add(value);
1934 }
1935 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
1936 uint16_t norm16=getNorm16(c);
1937 if(norm16==JAMO_L) {
1938 UChar32 syllable=
1939 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
1940 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
1941 } else {
1942 addComposites(getCompositionsList(norm16), set);
1943 }
1944 }
1945 return TRUE;
1946}
1947
1948U_NAMESPACE_END
1949
1950// Normalizer2 data swapping ----------------------------------------------- ***
1951
1952U_NAMESPACE_USE
1953
1954U_CAPI int32_t U_EXPORT2
1955unorm2_swap(const UDataSwapper *ds,
1956 const void *inData, int32_t length, void *outData,
1957 UErrorCode *pErrorCode) {
1958 const UDataInfo *pInfo;
1959 int32_t headerSize;
1960
1961 const uint8_t *inBytes;
1962 uint8_t *outBytes;
1963
1964 const int32_t *inIndexes;
1965 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
1966
1967 int32_t i, offset, nextOffset, size;
1968
1969 /* udata_swapDataHeader checks the arguments */
1970 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1971 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1972 return 0;
1973 }
1974
1975 /* check data format and format version */
1976 pInfo=(const UDataInfo *)((const char *)inData+4);
1977 if(!(
1978 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
1979 pInfo->dataFormat[1]==0x72 &&
1980 pInfo->dataFormat[2]==0x6d &&
1981 pInfo->dataFormat[3]==0x32 &&
1982 (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)
1983 )) {
1984 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
1985 pInfo->dataFormat[0], pInfo->dataFormat[1],
1986 pInfo->dataFormat[2], pInfo->dataFormat[3],
1987 pInfo->formatVersion[0]);
1988 *pErrorCode=U_UNSUPPORTED_ERROR;
1989 return 0;
1990 }
1991
1992 inBytes=(const uint8_t *)inData+headerSize;
1993 outBytes=(uint8_t *)outData+headerSize;
1994
1995 inIndexes=(const int32_t *)inBytes;
1996
1997 if(length>=0) {
1998 length-=headerSize;
1999 if(length<(int32_t)sizeof(indexes)) {
2000 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2001 length);
2002 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2003 return 0;
2004 }
2005 }
2006
2007 /* read the first few indexes */
2008 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
2009 indexes[i]=udata_readInt32(ds, inIndexes[i]);
2010 }
2011
2012 /* get the total length of the data */
2013 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
2014
2015 if(length>=0) {
2016 if(length<size) {
2017 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2018 length);
2019 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2020 return 0;
2021 }
2022
2023 /* copy the data for inaccessible bytes */
2024 if(inBytes!=outBytes) {
2025 uprv_memcpy(outBytes, inBytes, size);
2026 }
2027
2028 offset=0;
2029
2030 /* swap the int32_t indexes[] */
2031 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2032 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2033 offset=nextOffset;
2034
2035 /* swap the UTrie2 */
2036 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2037 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2038 offset=nextOffset;
2039
2040 /* swap the uint16_t extraData[] */
2041 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
2042 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2043 offset=nextOffset;
2044
2045 /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2046 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
2047 offset=nextOffset;
2048
2049 U_ASSERT(offset==size);
2050 }
2051
2052 return headerSize+size;
2053}
2054
2055#endif // !UCONFIG_NO_NORMALIZATION