/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* rulebasedcollator.cpp
*
* (replaced the former tblcoll.cpp)
*
* created on: 2012feb14 with new and old collation code
* created by: Markus W. Scherer
*/
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_COLLATION
17
18#include "unicode/coll.h"
19#include "unicode/coleitr.h"
20#include "unicode/localpointer.h"
21#include "unicode/locid.h"
22#include "unicode/sortkey.h"
23#include "unicode/tblcoll.h"
24#include "unicode/ucol.h"
25#include "unicode/uiter.h"
26#include "unicode/uloc.h"
27#include "unicode/uniset.h"
28#include "unicode/unistr.h"
29#include "unicode/usetiter.h"
30#include "unicode/utf8.h"
31#include "unicode/uversion.h"
32#include "bocsu.h"
33#include "charstr.h"
34#include "cmemory.h"
35#include "collation.h"
36#include "collationcompare.h"
37#include "collationdata.h"
38#include "collationdatareader.h"
39#include "collationfastlatin.h"
40#include "collationiterator.h"
41#include "collationkeys.h"
42#include "collationroot.h"
43#include "collationsets.h"
44#include "collationsettings.h"
45#include "collationtailoring.h"
46#include "cstring.h"
47#include "uassert.h"
48#include "ucol_imp.h"
49#include "uhash.h"
50#include "uitercollationiterator.h"
51#include "ustr_imp.h"
52#include "utf16collationiterator.h"
53#include "utf8collationiterator.h"
54#include "uvectr64.h"
55
56U_NAMESPACE_BEGIN
57
58namespace {
59
/**
 * SortKeyByteSink that writes into a caller-supplied buffer of fixed capacity.
 * Its Resize() always returns FALSE, and AppendBeyondCapacity() only fills
 * whatever space is left in the buffer.
 */
class FixedSortKeyByteSink : public SortKeyByteSink {
public:
    /**
     * @param dest         destination buffer, passed on to SortKeyByteSink
     * @param destCapacity capacity of dest, passed on to SortKeyByteSink
     */
    FixedSortKeyByteSink(char *dest, int32_t destCapacity)
            : SortKeyByteSink(dest, destCapacity) {}
    virtual ~FixedSortKeyByteSink();

private:
    // Overrides of the SortKeyByteSink overflow hooks; defined below.
    virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
    virtual UBool Resize(int32_t appendCapacity, int32_t length);
};

// Out-of-line definition of the virtual destructor; nothing to release.
FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
72
73void
74FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
75 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
76 // Fill the buffer completely.
77 int32_t available = capacity_ - length;
78 if (available > 0) {
79 uprv_memcpy(buffer_ + length, bytes, available);
80 }
81}
82
/**
 * The fixed buffer is never resized.
 * @return FALSE always
 */
UBool
FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
    return FALSE;
}
87
88} // namespace
89
90// Not in an anonymous namespace, so that it can be a friend of CollationKey.
/**
 * SortKeyByteSink that writes directly into the byte storage of a CollationKey
 * and asks the key to reallocate when more room is needed.
 */
class CollationKeyByteSink : public SortKeyByteSink {
public:
    /**
     * Starts with the key's current byte buffer and capacity.
     * @param key the CollationKey to write into; held by reference
     */
    CollationKeyByteSink(CollationKey &key)
            : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
              key_(key) {}
    virtual ~CollationKeyByteSink();

private:
    // Overrides of the SortKeyByteSink overflow hooks; defined below.
    virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
    virtual UBool Resize(int32_t appendCapacity, int32_t length);

    CollationKey &key_;  // the key whose storage backs buffer_
};

// Out-of-line definition of the virtual destructor; nothing to release.
CollationKeyByteSink::~CollationKeyByteSink() {}
106
107void
108CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
109 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
110 if (Resize(n, length)) {
111 uprv_memcpy(buffer_ + length, bytes, n);
112 }
113}
114
115UBool
116CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
117 if (buffer_ == NULL) {
118 return FALSE; // allocation failed before already
119 }
120 int32_t newCapacity = 2 * capacity_;
121 int32_t altCapacity = length + 2 * appendCapacity;
122 if (newCapacity < altCapacity) {
123 newCapacity = altCapacity;
124 }
125 if (newCapacity < 200) {
126 newCapacity = 200;
127 }
128 uint8_t *newBuffer = key_.reallocate(newCapacity, length);
129 if (newBuffer == NULL) {
130 SetNotOk();
131 return FALSE;
132 }
133 buffer_ = reinterpret_cast<char *>(newBuffer);
134 capacity_ = newCapacity;
135 return TRUE;
136}
137
/**
 * Copy constructor: copies every field from other, then adds a reference
 * to the settings and to the cache entry, which are now shared with other.
 */
RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator &other)
        : Collator(other),
          data(other.data),
          settings(other.settings),
          tailoring(other.tailoring),
          cacheEntry(other.cacheEntry),
          validLocale(other.validLocale),
          explicitlySetAttributes(other.explicitlySetAttributes),
          actualLocaleIsSameAsValid(other.actualLocaleIsSameAsValid) {
    settings->addRef();
    cacheEntry->addRef();
}
150
151RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
152 const RuleBasedCollator *base, UErrorCode &errorCode)
153 : data(NULL),
154 settings(NULL),
155 tailoring(NULL),
156 cacheEntry(NULL),
157 validLocale(""),
158 explicitlySetAttributes(0),
159 actualLocaleIsSameAsValid(FALSE) {
160 if(U_FAILURE(errorCode)) { return; }
161 if(bin == NULL || length == 0 || base == NULL) {
162 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
163 return;
164 }
165 const CollationTailoring *root = CollationRoot::getRoot(errorCode);
166 if(U_FAILURE(errorCode)) { return; }
167 if(base->tailoring != root) {
168 errorCode = U_UNSUPPORTED_ERROR;
169 return;
170 }
171 LocalPointer<CollationTailoring> t(new CollationTailoring(base->tailoring->settings));
172 if(t.isNull() || t->isBogus()) {
173 errorCode = U_MEMORY_ALLOCATION_ERROR;
174 return;
175 }
176 CollationDataReader::read(base->tailoring, bin, length, *t, errorCode);
177 if(U_FAILURE(errorCode)) { return; }
178 t->actualLocale.setToBogus();
179 adoptTailoring(t.orphan(), errorCode);
180}
181
/**
 * Builds a collator on top of a cache entry: data, settings and tailoring come from
 * entry->tailoring, and the valid locale from the entry.
 * Adds a reference to the settings and to the cache entry.
 * The entry is dereferenced unconditionally, so it must not be NULL.
 */
RuleBasedCollator::RuleBasedCollator(const CollationCacheEntry *entry)
        : data(entry->tailoring->data),
          settings(entry->tailoring->settings),
          tailoring(entry->tailoring),
          cacheEntry(entry),
          validLocale(entry->validLocale),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(FALSE) {
    settings->addRef();
    cacheEntry->addRef();
}
193
/**
 * Destructor: gives up this collator's references to the settings and the cache entry
 * via SharedObject::clearPtr().
 */
RuleBasedCollator::~RuleBasedCollator() {
    SharedObject::clearPtr(settings);
    SharedObject::clearPtr(cacheEntry);
}
198
199void
200RuleBasedCollator::adoptTailoring(CollationTailoring *t, UErrorCode &errorCode) {
201 if(U_FAILURE(errorCode)) {
202 t->deleteIfZeroRefCount();
203 return;
204 }
205 U_ASSERT(settings == NULL && data == NULL && tailoring == NULL && cacheEntry == NULL);
206 cacheEntry = new CollationCacheEntry(t->actualLocale, t);
207 if(cacheEntry == NULL) {
208 errorCode = U_MEMORY_ALLOCATION_ERROR;
209 t->deleteIfZeroRefCount();
210 return;
211 }
212 data = t->data;
213 settings = t->settings;
214 settings->addRef();
215 tailoring = t;
216 cacheEntry->addRef();
217 validLocale = t->actualLocale;
218 actualLocaleIsSameAsValid = FALSE;
219}
220
/**
 * @return a new heap-allocated copy of this collator, made with the copy constructor
 */
Collator *
RuleBasedCollator::clone() const {
    return new RuleBasedCollator(*this);
}
225
226RuleBasedCollator &RuleBasedCollator::operator=(const RuleBasedCollator &other) {
227 if(this == &other) { return *this; }
228 SharedObject::copyPtr(other.settings, settings);
229 tailoring = other.tailoring;
230 SharedObject::copyPtr(other.cacheEntry, cacheEntry);
231 data = tailoring->data;
232 validLocale = other.validLocale;
233 explicitlySetAttributes = other.explicitlySetAttributes;
234 actualLocaleIsSameAsValid = other.actualLocaleIsSameAsValid;
235 return *this;
236}
237
238UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)
239
240UBool
241RuleBasedCollator::operator==(const Collator& other) const {
242 if(this == &other) { return TRUE; }
243 if(!Collator::operator==(other)) { return FALSE; }
244 const RuleBasedCollator &o = static_cast<const RuleBasedCollator &>(other);
245 if(*settings != *o.settings) { return FALSE; }
246 if(data == o.data) { return TRUE; }
247 UBool thisIsRoot = data->base == NULL;
248 UBool otherIsRoot = o.data->base == NULL;
249 U_ASSERT(!thisIsRoot || !otherIsRoot); // otherwise their data pointers should be ==
250 if(thisIsRoot != otherIsRoot) { return FALSE; }
251 if((thisIsRoot || !tailoring->rules.isEmpty()) &&
252 (otherIsRoot || !o.tailoring->rules.isEmpty())) {
253 // Shortcut: If both collators have valid rule strings, then compare those.
254 if(tailoring->rules == o.tailoring->rules) { return TRUE; }
255 }
256 // Different rule strings can result in the same or equivalent tailoring.
257 // The rule strings are optional in ICU resource bundles, although included by default.
258 // cloneBinary() drops the rule string.
259 UErrorCode errorCode = U_ZERO_ERROR;
260 LocalPointer<UnicodeSet> thisTailored(getTailoredSet(errorCode));
261 LocalPointer<UnicodeSet> otherTailored(o.getTailoredSet(errorCode));
262 if(U_FAILURE(errorCode)) { return FALSE; }
263 if(*thisTailored != *otherTailored) { return FALSE; }
264 // For completeness, we should compare all of the mappings;
265 // or we should create a list of strings, sort it with one collator,
266 // and check if both collators compare adjacent strings the same
267 // (order & strength, down to quaternary); or similar.
268 // Testing equality of collators seems unusual.
269 return TRUE;
270}
271
272int32_t
273RuleBasedCollator::hashCode() const {
274 int32_t h = settings->hashCode();
275 if(data->base == NULL) { return h; } // root collator
276 // Do not rely on the rule string, see comments in operator==().
277 UErrorCode errorCode = U_ZERO_ERROR;
278 LocalPointer<UnicodeSet> set(getTailoredSet(errorCode));
279 if(U_FAILURE(errorCode)) { return 0; }
280 UnicodeSetIterator iter(*set);
281 while(iter.next() && !iter.isString()) {
282 h ^= data->getCE32(iter.getCodepoint());
283 }
284 return h;
285}
286
287void
288RuleBasedCollator::setLocales(const Locale &requested, const Locale &valid,
289 const Locale &actual) {
290 if(actual == tailoring->actualLocale) {
291 actualLocaleIsSameAsValid = FALSE;
292 } else {
293 U_ASSERT(actual == valid);
294 actualLocaleIsSameAsValid = TRUE;
295 }
296 // Do not modify tailoring.actualLocale:
297 // We cannot be sure that that would be thread-safe.
298 validLocale = valid;
299 (void)requested; // Ignore, see also ticket #10477.
300}
301
302Locale
303RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode& errorCode) const {
304 if(U_FAILURE(errorCode)) {
305 return Locale::getRoot();
306 }
307 switch(type) {
308 case ULOC_ACTUAL_LOCALE:
309 return actualLocaleIsSameAsValid ? validLocale : tailoring->actualLocale;
310 case ULOC_VALID_LOCALE:
311 return validLocale;
312 case ULOC_REQUESTED_LOCALE:
313 default:
314 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
315 return Locale::getRoot();
316 }
317}
318
319const char *
320RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const {
321 if(U_FAILURE(errorCode)) {
322 return NULL;
323 }
324 const Locale *result;
325 switch(type) {
326 case ULOC_ACTUAL_LOCALE:
327 result = actualLocaleIsSameAsValid ? &validLocale : &tailoring->actualLocale;
328 break;
329 case ULOC_VALID_LOCALE:
330 result = &validLocale;
331 break;
332 case ULOC_REQUESTED_LOCALE:
333 default:
334 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
335 return NULL;
336 }
337 if(result->isBogus()) { return NULL; }
338 const char *id = result->getName();
339 return id[0] == 0 ? "root" : id;
340}
341
/**
 * @return a reference to the tailoring's rule string
 */
const UnicodeString&
RuleBasedCollator::getRules() const {
    return tailoring->rules;
}
346
347void
348RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) const {
349 if(delta == UCOL_TAILORING_ONLY) {
350 buffer = tailoring->rules;
351 return;
352 }
353 // UCOL_FULL_RULES
354 buffer.remove();
355 CollationLoader::appendRootRules(buffer);
356 buffer.append(tailoring->rules).getTerminatedBuffer();
357}
358
/**
 * Copies the tailoring's version into version and then adds a value derived from
 * UCOL_RUNTIME_VERSION to the first version byte.
 *
 * @param version receives U_MAX_VERSION_LENGTH bytes
 */
void
RuleBasedCollator::getVersion(UVersionInfo version) const {
    uprv_memcpy(version, tailoring->version, U_MAX_VERSION_LENGTH);
    version[0] += (UCOL_RUNTIME_VERSION << 4) + (UCOL_RUNTIME_VERSION >> 4);
}
364
365UnicodeSet *
366RuleBasedCollator::getTailoredSet(UErrorCode &errorCode) const {
367 if(U_FAILURE(errorCode)) { return NULL; }
368 UnicodeSet *tailored = new UnicodeSet();
369 if(tailored == NULL) {
370 errorCode = U_MEMORY_ALLOCATION_ERROR;
371 return NULL;
372 }
373 if(data->base != NULL) {
374 TailoredSet(tailored).forData(data, errorCode);
375 if(U_FAILURE(errorCode)) {
376 delete tailored;
377 return NULL;
378 }
379 }
380 return tailored;
381}
382
383void
384RuleBasedCollator::internalGetContractionsAndExpansions(
385 UnicodeSet *contractions, UnicodeSet *expansions,
386 UBool addPrefixes, UErrorCode &errorCode) const {
387 if(U_FAILURE(errorCode)) { return; }
388 if(contractions != NULL) {
389 contractions->clear();
390 }
391 if(expansions != NULL) {
392 expansions->clear();
393 }
394 ContractionsAndExpansions(contractions, expansions, NULL, addPrefixes).forData(data, errorCode);
395}
396
397void
398RuleBasedCollator::internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const {
399 if(U_FAILURE(errorCode)) { return; }
400 ContractionsAndExpansions(&set, NULL, NULL, FALSE).forCodePoint(data, c, errorCode);
401}
402
/**
 * @return the settings object of the tailoring, which the setters in this file
 *         treat as the default settings
 */
const CollationSettings &
RuleBasedCollator::getDefaultSettings() const {
    return *tailoring->settings;
}
407
408UColAttributeValue
409RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &errorCode) const {
410 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
411 int32_t option;
412 switch(attr) {
413 case UCOL_FRENCH_COLLATION:
414 option = CollationSettings::BACKWARD_SECONDARY;
415 break;
416 case UCOL_ALTERNATE_HANDLING:
417 return settings->getAlternateHandling();
418 case UCOL_CASE_FIRST:
419 return settings->getCaseFirst();
420 case UCOL_CASE_LEVEL:
421 option = CollationSettings::CASE_LEVEL;
422 break;
423 case UCOL_NORMALIZATION_MODE:
424 option = CollationSettings::CHECK_FCD;
425 break;
426 case UCOL_STRENGTH:
427 return (UColAttributeValue)settings->getStrength();
428 case UCOL_HIRAGANA_QUATERNARY_MODE:
429 // Deprecated attribute, unsettable.
430 return UCOL_OFF;
431 case UCOL_NUMERIC_COLLATION:
432 option = CollationSettings::NUMERIC;
433 break;
434 default:
435 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
436 return UCOL_DEFAULT;
437 }
438 return ((settings->options & option) == 0) ? UCOL_OFF : UCOL_ON;
439}
440
441void
442RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value,
443 UErrorCode &errorCode) {
444 UColAttributeValue oldValue = getAttribute(attr, errorCode);
445 if(U_FAILURE(errorCode)) { return; }
446 if(value == oldValue) {
447 setAttributeExplicitly(attr);
448 return;
449 }
450 const CollationSettings &defaultSettings = getDefaultSettings();
451 if(settings == &defaultSettings) {
452 if(value == UCOL_DEFAULT) {
453 setAttributeDefault(attr);
454 return;
455 }
456 }
457 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
458 if(ownedSettings == NULL) {
459 errorCode = U_MEMORY_ALLOCATION_ERROR;
460 return;
461 }
462
463 switch(attr) {
464 case UCOL_FRENCH_COLLATION:
465 ownedSettings->setFlag(CollationSettings::BACKWARD_SECONDARY, value,
466 defaultSettings.options, errorCode);
467 break;
468 case UCOL_ALTERNATE_HANDLING:
469 ownedSettings->setAlternateHandling(value, defaultSettings.options, errorCode);
470 break;
471 case UCOL_CASE_FIRST:
472 ownedSettings->setCaseFirst(value, defaultSettings.options, errorCode);
473 break;
474 case UCOL_CASE_LEVEL:
475 ownedSettings->setFlag(CollationSettings::CASE_LEVEL, value,
476 defaultSettings.options, errorCode);
477 break;
478 case UCOL_NORMALIZATION_MODE:
479 ownedSettings->setFlag(CollationSettings::CHECK_FCD, value,
480 defaultSettings.options, errorCode);
481 break;
482 case UCOL_STRENGTH:
483 ownedSettings->setStrength(value, defaultSettings.options, errorCode);
484 break;
485 case UCOL_HIRAGANA_QUATERNARY_MODE:
486 // Deprecated attribute. Check for valid values but do not change anything.
487 if(value != UCOL_OFF && value != UCOL_ON && value != UCOL_DEFAULT) {
488 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
489 }
490 break;
491 case UCOL_NUMERIC_COLLATION:
492 ownedSettings->setFlag(CollationSettings::NUMERIC, value, defaultSettings.options, errorCode);
493 break;
494 default:
495 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
496 break;
497 }
498 if(U_FAILURE(errorCode)) { return; }
499 setFastLatinOptions(*ownedSettings);
500 if(value == UCOL_DEFAULT) {
501 setAttributeDefault(attr);
502 } else {
503 setAttributeExplicitly(attr);
504 }
505}
506
507Collator &
508RuleBasedCollator::setMaxVariable(UColReorderCode group, UErrorCode &errorCode) {
509 if(U_FAILURE(errorCode)) { return *this; }
510 // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
511 int32_t value;
512 if(group == UCOL_REORDER_CODE_DEFAULT) {
513 value = UCOL_DEFAULT;
514 } else if(UCOL_REORDER_CODE_FIRST <= group && group <= UCOL_REORDER_CODE_CURRENCY) {
515 value = group - UCOL_REORDER_CODE_FIRST;
516 } else {
517 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
518 return *this;
519 }
520 CollationSettings::MaxVariable oldValue = settings->getMaxVariable();
521 if(value == oldValue) {
522 setAttributeExplicitly(ATTR_VARIABLE_TOP);
523 return *this;
524 }
525 const CollationSettings &defaultSettings = getDefaultSettings();
526 if(settings == &defaultSettings) {
527 if(value == UCOL_DEFAULT) {
528 setAttributeDefault(ATTR_VARIABLE_TOP);
529 return *this;
530 }
531 }
532 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
533 if(ownedSettings == NULL) {
534 errorCode = U_MEMORY_ALLOCATION_ERROR;
535 return *this;
536 }
537
538 if(group == UCOL_REORDER_CODE_DEFAULT) {
539 group = (UColReorderCode)(UCOL_REORDER_CODE_FIRST + defaultSettings.getMaxVariable());
540 }
541 uint32_t varTop = data->getLastPrimaryForGroup(group);
542 U_ASSERT(varTop != 0);
543 ownedSettings->setMaxVariable(value, defaultSettings.options, errorCode);
544 if(U_FAILURE(errorCode)) { return *this; }
545 ownedSettings->variableTop = varTop;
546 setFastLatinOptions(*ownedSettings);
547 if(value == UCOL_DEFAULT) {
548 setAttributeDefault(ATTR_VARIABLE_TOP);
549 } else {
550 setAttributeExplicitly(ATTR_VARIABLE_TOP);
551 }
552 return *this;
553}
554
/**
 * @return the settings' MaxVariable number converted back to a reorder code
 *         by adding UCOL_REORDER_CODE_FIRST
 */
UColReorderCode
RuleBasedCollator::getMaxVariable() const {
    return (UColReorderCode)(UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
}
559
/**
 * @return the variable top stored in the current settings; the error code is not used
 */
uint32_t
RuleBasedCollator::getVariableTop(UErrorCode & /*errorCode*/) const {
    return settings->variableTop;
}
564
565uint32_t
566RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &errorCode) {
567 if(U_FAILURE(errorCode)) { return 0; }
568 if(varTop == NULL && len !=0) {
569 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
570 return 0;
571 }
572 if(len < 0) { len = u_strlen(varTop); }
573 if(len == 0) {
574 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
575 return 0;
576 }
577 UBool numeric = settings->isNumeric();
578 int64_t ce1, ce2;
579 if(settings->dontCheckFCD()) {
580 UTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
581 ce1 = ci.nextCE(errorCode);
582 ce2 = ci.nextCE(errorCode);
583 } else {
584 FCDUTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
585 ce1 = ci.nextCE(errorCode);
586 ce2 = ci.nextCE(errorCode);
587 }
588 if(ce1 == Collation::NO_CE || ce2 != Collation::NO_CE) {
589 errorCode = U_CE_NOT_FOUND_ERROR;
590 return 0;
591 }
592 setVariableTop((uint32_t)(ce1 >> 32), errorCode);
593 return settings->variableTop;
594}
595
/**
 * Convenience overload: forwards the string's buffer and length
 * to setVariableTop(const UChar *, int32_t, UErrorCode &).
 */
uint32_t
RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &errorCode) {
    return setVariableTop(varTop.getBuffer(), varTop.length(), errorCode);
}
600
601void
602RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &errorCode) {
603 if(U_FAILURE(errorCode)) { return; }
604 if(varTop != settings->variableTop) {
605 // Pin the variable top to the end of the reordering group which contains it.
606 // Only a few special groups are supported.
607 int32_t group = data->getGroupForPrimary(varTop);
608 if(group < UCOL_REORDER_CODE_FIRST || UCOL_REORDER_CODE_CURRENCY < group) {
609 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
610 return;
611 }
612 uint32_t v = data->getLastPrimaryForGroup(group);
613 U_ASSERT(v != 0 && v >= varTop);
614 varTop = v;
615 if(varTop != settings->variableTop) {
616 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
617 if(ownedSettings == NULL) {
618 errorCode = U_MEMORY_ALLOCATION_ERROR;
619 return;
620 }
621 ownedSettings->setMaxVariable(group - UCOL_REORDER_CODE_FIRST,
622 getDefaultSettings().options, errorCode);
623 if(U_FAILURE(errorCode)) { return; }
624 ownedSettings->variableTop = varTop;
625 setFastLatinOptions(*ownedSettings);
626 }
627 }
628 if(varTop == getDefaultSettings().variableTop) {
629 setAttributeDefault(ATTR_VARIABLE_TOP);
630 } else {
631 setAttributeExplicitly(ATTR_VARIABLE_TOP);
632 }
633}
634
635int32_t
636RuleBasedCollator::getReorderCodes(int32_t *dest, int32_t capacity,
637 UErrorCode &errorCode) const {
638 if(U_FAILURE(errorCode)) { return 0; }
639 if(capacity < 0 || (dest == NULL && capacity > 0)) {
640 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
641 return 0;
642 }
643 int32_t length = settings->reorderCodesLength;
644 if(length == 0) { return 0; }
645 if(length > capacity) {
646 errorCode = U_BUFFER_OVERFLOW_ERROR;
647 return length;
648 }
649 uprv_memcpy(dest, settings->reorderCodes, length * 4);
650 return length;
651}
652
653void
654RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
655 UErrorCode &errorCode) {
656 if(U_FAILURE(errorCode)) { return; }
657 if(length < 0 || (reorderCodes == NULL && length > 0)) {
658 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
659 return;
660 }
661 if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_NONE) {
662 length = 0;
663 }
664 if(length == settings->reorderCodesLength &&
665 uprv_memcmp(reorderCodes, settings->reorderCodes, length * 4) == 0) {
666 return;
667 }
668 const CollationSettings &defaultSettings = getDefaultSettings();
669 if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_DEFAULT) {
670 if(settings != &defaultSettings) {
671 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
672 if(ownedSettings == NULL) {
673 errorCode = U_MEMORY_ALLOCATION_ERROR;
674 return;
675 }
676 ownedSettings->aliasReordering(defaultSettings.reorderCodes,
677 defaultSettings.reorderCodesLength,
678 defaultSettings.reorderTable);
679 setFastLatinOptions(*ownedSettings);
680 }
681 return;
682 }
683 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
684 if(ownedSettings == NULL) {
685 errorCode = U_MEMORY_ALLOCATION_ERROR;
686 return;
687 }
688 if(length == 0) {
689 ownedSettings->resetReordering();
690 } else {
691 uint8_t reorderTable[256];
692 data->makeReorderTable(reorderCodes, length, reorderTable, errorCode);
693 if(U_FAILURE(errorCode)) { return; }
694 if(!ownedSettings->setReordering(reorderCodes, length, reorderTable)) {
695 errorCode = U_MEMORY_ALLOCATION_ERROR;
696 return;
697 }
698 }
699 setFastLatinOptions(*ownedSettings);
700}
701
/**
 * Recomputes the fastLatinOptions field of the given settings by calling
 * CollationFastLatin::getOptions() with this collator's data, the settings themselves,
 * and the settings' fastLatinPrimaries array.
 *
 * @param ownedSettings the settings object to update
 */
void
RuleBasedCollator::setFastLatinOptions(CollationSettings &ownedSettings) const {
    ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
            data, ownedSettings,
            ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));
}
708
709UCollationResult
710RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
711 UErrorCode &errorCode) const {
712 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
713 return doCompare(left.getBuffer(), left.length(),
714 right.getBuffer(), right.length(), errorCode);
715}
716
717UCollationResult
718RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
719 int32_t length, UErrorCode &errorCode) const {
720 if(U_FAILURE(errorCode) || length == 0) { return UCOL_EQUAL; }
721 if(length < 0) {
722 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
723 return UCOL_EQUAL;
724 }
725 int32_t leftLength = left.length();
726 int32_t rightLength = right.length();
727 if(leftLength > length) { leftLength = length; }
728 if(rightLength > length) { rightLength = length; }
729 return doCompare(left.getBuffer(), leftLength,
730 right.getBuffer(), rightLength, errorCode);
731}
732
733UCollationResult
734RuleBasedCollator::compare(const UChar *left, int32_t leftLength,
735 const UChar *right, int32_t rightLength,
736 UErrorCode &errorCode) const {
737 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
738 if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) {
739 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
740 return UCOL_EQUAL;
741 }
742 // Make sure both or neither strings have a known length.
743 // We do not optimize for mixed length/termination.
744 if(leftLength >= 0) {
745 if(rightLength < 0) { rightLength = u_strlen(right); }
746 } else {
747 if(rightLength >= 0) { leftLength = u_strlen(left); }
748 }
749 return doCompare(left, leftLength, right, rightLength, errorCode);
750}
751
752UCollationResult
753RuleBasedCollator::compareUTF8(const StringPiece &left, const StringPiece &right,
754 UErrorCode &errorCode) const {
755 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
756 const uint8_t *leftBytes = reinterpret_cast<const uint8_t *>(left.data());
757 const uint8_t *rightBytes = reinterpret_cast<const uint8_t *>(right.data());
758 if((leftBytes == NULL && !left.empty()) || (rightBytes == NULL && !right.empty())) {
759 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
760 return UCOL_EQUAL;
761 }
762 return doCompare(leftBytes, left.length(), rightBytes, right.length(), errorCode);
763}
764
765UCollationResult
766RuleBasedCollator::internalCompareUTF8(const char *left, int32_t leftLength,
767 const char *right, int32_t rightLength,
768 UErrorCode &errorCode) const {
769 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
770 if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) {
771 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
772 return UCOL_EQUAL;
773 }
774 // Make sure both or neither strings have a known length.
775 // We do not optimize for mixed length/termination.
776 if(leftLength >= 0) {
777 if(rightLength < 0) { rightLength = uprv_strlen(right); }
778 } else {
779 if(rightLength >= 0) { leftLength = uprv_strlen(left); }
780 }
781 return doCompare(reinterpret_cast<const uint8_t *>(left), leftLength,
782 reinterpret_cast<const uint8_t *>(right), rightLength, errorCode);
783}
784
785namespace {
786
787/**
788 * Abstract iterator for identical-level string comparisons.
789 * Returns FCD code points and handles temporary switching to NFD.
790 */
791class NFDIterator : public UObject {
792public:
793 NFDIterator() : index(-1), length(0) {}
794 virtual ~NFDIterator() {}
795 /**
796 * Returns the next code point from the internal normalization buffer,
797 * or else the next text code point.
798 * Returns -1 at the end of the text.
799 */
800 UChar32 nextCodePoint() {
801 if(index >= 0) {
802 if(index == length) {
803 index = -1;
804 } else {
805 UChar32 c;
806 U16_NEXT_UNSAFE(decomp, index, c);
807 return c;
808 }
809 }
810 return nextRawCodePoint();
811 }
812 /**
813 * @param nfcImpl
814 * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint()
815 * @return the first code point in c's decomposition,
816 * or c itself if it was decomposed already or if it does not decompose
817 */
818 UChar32 nextDecomposedCodePoint(const Normalizer2Impl &nfcImpl, UChar32 c) {
819 if(index >= 0) { return c; }
820 decomp = nfcImpl.getDecomposition(c, buffer, length);
821 if(decomp == NULL) { return c; }
822 index = 0;
823 U16_NEXT_UNSAFE(decomp, index, c);
824 return c;
825 }
826protected:
827 /**
828 * Returns the next text code point in FCD order.
829 * Returns -1 at the end of the text.
830 */
831 virtual UChar32 nextRawCodePoint() = 0;
832private:
833 const UChar *decomp;
834 UChar buffer[4];
835 int32_t index;
836 int32_t length;
837};
838
839class UTF16NFDIterator : public NFDIterator {
840public:
841 UTF16NFDIterator(const UChar *text, const UChar *textLimit) : s(text), limit(textLimit) {}
842protected:
843 virtual UChar32 nextRawCodePoint() {
844 if(s == limit) { return U_SENTINEL; }
845 UChar32 c = *s++;
846 if(limit == NULL && c == 0) {
847 s = NULL;
848 return U_SENTINEL;
849 }
850 UChar trail;
851 if(U16_IS_LEAD(c) && s != limit && U16_IS_TRAIL(trail = *s)) {
852 ++s;
853 c = U16_GET_SUPPLEMENTARY(c, trail);
854 }
855 return c;
856 }
857
858 const UChar *s;
859 const UChar *limit;
860};
861
/**
 * UTF16NFDIterator that iterates over an FCD form of the input text.
 * If makeFCD() reports that the whole text is already FCD, the text is iterated in place;
 * otherwise the text is converted into the member string str and that copy is iterated.
 * If an error occurs, s and limit stay NULL (as set by the base class constructor call),
 * so the iterator yields no code points.
 */
class FCDUTF16NFDIterator : public UTF16NFDIterator {
public:
    FCDUTF16NFDIterator(const Normalizer2Impl &nfcImpl, const UChar *text, const UChar *textLimit)
            : UTF16NFDIterator(NULL, NULL) {
        UErrorCode errorCode = U_ZERO_ERROR;
        // First pass without a buffer: find how far the text is already in FCD form.
        const UChar *spanLimit = nfcImpl.makeFCD(text, textLimit, NULL, errorCode);
        if(U_FAILURE(errorCode)) { return; }
        if(spanLimit == textLimit || (textLimit == NULL && *spanLimit == 0)) {
            // The whole text passed the check: iterate over it directly.
            s = text;
            limit = spanLimit;
        } else {
            // Copy the passing prefix, then append the FCD form of the rest.
            str.setTo(text, (int32_t)(spanLimit - text));
            {
                // The ReorderingBuffer works on str; its scope ends before str is read below.
                ReorderingBuffer buffer(nfcImpl, str);
                if(buffer.init(str.length(), errorCode)) {
                    nfcImpl.makeFCD(spanLimit, textLimit, &buffer, errorCode);
                }
            }
            if(U_SUCCESS(errorCode)) {
                s = str.getBuffer();
                limit = s + str.length();
            }
        }
    }
private:
    UnicodeString str;  // holds the FCD copy of the text when one is needed
};
889
890class UTF8NFDIterator : public NFDIterator {
891public:
892 UTF8NFDIterator(const uint8_t *text, int32_t textLength)
893 : s(text), pos(0), length(textLength) {}
894protected:
895 virtual UChar32 nextRawCodePoint() {
896 if(pos == length || (s[pos] == 0 && length < 0)) { return U_SENTINEL; }
897 UChar32 c;
898 U8_NEXT_OR_FFFD(s, pos, length, c);
899 return c;
900 }
901
902 const uint8_t *s;
903 int32_t pos;
904 int32_t length;
905};
906
/**
 * NFDIterator over UTF-8 text that takes its code points
 * from an FCDUTF8CollationIterator.
 */
class FCDUTF8NFDIterator : public NFDIterator {
public:
    FCDUTF8NFDIterator(const CollationData *data, const uint8_t *text, int32_t textLength)
            : u8ci(data, FALSE, text, 0, textLength) {}
protected:
    virtual UChar32 nextRawCodePoint() {
        // Errors from the wrapped iterator are not reported to the caller.
        UErrorCode errorCode = U_ZERO_ERROR;
        return u8ci.nextCodePoint(errorCode);
    }
private:
    FCDUTF8CollationIterator u8ci;  // wrapped iterator that supplies the code points
};
919
/**
 * NFDIterator that reads code points from a UCharIterator via uiter_next32().
 * The UCharIterator is held by reference.
 */
class UIterNFDIterator : public NFDIterator {
public:
    UIterNFDIterator(UCharIterator &it) : iter(it) {}
protected:
    virtual UChar32 nextRawCodePoint() {
        return uiter_next32(&iter);
    }
private:
    UCharIterator &iter;  // the caller's iterator
};
930
/**
 * NFDIterator over a UCharIterator that takes its code points
 * from an FCDUIterCollationIterator.
 */
class FCDUIterNFDIterator : public NFDIterator {
public:
    FCDUIterNFDIterator(const CollationData *data, UCharIterator &it, int32_t startIndex)
            : uici(data, FALSE, it, startIndex) {}
protected:
    virtual UChar32 nextRawCodePoint() {
        // Errors from the wrapped iterator are not reported to the caller.
        UErrorCode errorCode = U_ZERO_ERROR;
        return uici.nextCodePoint(errorCode);
    }
private:
    FCDUIterCollationIterator uici;  // wrapped iterator that supplies the code points
};
943
944UCollationResult compareNFDIter(const Normalizer2Impl &nfcImpl,
945 NFDIterator &left, NFDIterator &right) {
946 for(;;) {
947 // Fetch the next FCD code point from each string.
948 UChar32 leftCp = left.nextCodePoint();
949 UChar32 rightCp = right.nextCodePoint();
950 if(leftCp == rightCp) {
951 if(leftCp < 0) { break; }
952 continue;
953 }
954 // If they are different, then decompose each and compare again.
955 if(leftCp < 0) {
956 leftCp = -2; // end of string
957 } else if(leftCp == 0xfffe) {
958 leftCp = -1; // U+FFFE: merge separator
959 } else {
960 leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp);
961 }
962 if(rightCp < 0) {
963 rightCp = -2; // end of string
964 } else if(rightCp == 0xfffe) {
965 rightCp = -1; // U+FFFE: merge separator
966 } else {
967 rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp);
968 }
969 if(leftCp < rightCp) { return UCOL_LESS; }
970 if(leftCp > rightCp) { return UCOL_GREATER; }
971 }
972 return UCOL_EQUAL;
973}
974
975} // namespace
976
/**
 * Compares two UTF-16 strings with this collator's data and settings.
 *
 * The work is done in this order:
 * 1. Identical-prefix test: count the leading code units that are the same in
 *    both strings, then back up while the unit at the boundary is reported as
 *    "unsafe backward" by the collation data.
 * 2. Fast Latin: if fastLatinOptions is not negative and the first code unit
 *    after the prefix in each string (if any) is at most LATIN_MAX,
 *    try CollationFastLatin::compareUTF16().
 * 3. If there is no fast-Latin result (BAIL_OUT_RESULT), run
 *    CollationCompare::compareUpToQuaternary() with plain or FCD-checking
 *    iterators, depending on settings->dontCheckFCD().
 * 4. If the result is UCOL_EQUAL, the strength is at least UCOL_IDENTICAL and
 *    there is no error, decide with compareNFDIter() on the text after the prefix.
 *
 * @param left        left string
 * @param leftLength  length of left; a negative value selects the NUL-terminated
 *                    code path for BOTH strings (only leftLength is tested for this)
 * @param right       right string
 * @param rightLength length of right; used only on the length-delimited path
 * @param errorCode   not checked on entry (the caller does that);
 *                    passed on to compareUpToQuaternary()
 * @return the comparison result
 */
UCollationResult
RuleBasedCollator::doCompare(const UChar *left, int32_t leftLength,
                             const UChar *right, int32_t rightLength,
                             UErrorCode &errorCode) const {
    // U_FAILURE(errorCode) checked by caller.
    // Same pointer and same length argument: nothing to compare.
    if(left == right && leftLength == rightLength) {
        return UCOL_EQUAL;
    }

    // Identical-prefix test.
    const UChar *leftLimit;
    const UChar *rightLimit;
    int32_t equalPrefixLength = 0;
    if(leftLength < 0) {
        // NUL-terminated: no limits; stop at the first difference,
        // or return early when the shared terminator is reached.
        leftLimit = NULL;
        rightLimit = NULL;
        UChar c;
        while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
            if(c == 0) { return UCOL_EQUAL; }
            ++equalPrefixLength;
        }
    } else {
        // Length-delimited: stop at the end of either string or at the first difference.
        leftLimit = left + leftLength;
        rightLimit = right + rightLength;
        for(;;) {
            if(equalPrefixLength == leftLength) {
                if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
                break;
            } else if(equalPrefixLength == rightLength ||
                      left[equalPrefixLength] != right[equalPrefixLength]) {
                break;
            }
            ++equalPrefixLength;
        }
    }

    UBool numeric = settings->isNumeric();
    if(equalPrefixLength > 0) {
        // Only look at a boundary unit that exists (index != length). On the
        // NUL-terminated path the lengths are negative, so both units are inspected.
        if((equalPrefixLength != leftLength &&
                    data->isUnsafeBackward(left[equalPrefixLength], numeric)) ||
                (equalPrefixLength != rightLength &&
                    data->isUnsafeBackward(right[equalPrefixLength], numeric))) {
            // Identical prefix: Back up to the start of a contraction or reordering sequence.
            // Only left is inspected here; inside the prefix both strings are the same.
            while(--equalPrefixLength > 0 &&
                    data->isUnsafeBackward(left[equalPrefixLength], numeric)) {}
        }
        // Notes:
        // - A longer string can compare equal to a prefix of it if only ignorables follow.
        // - With a backward level, a longer string can compare less-than a prefix of it.

        // Pass the actual start of each string into the CollationIterators,
        // plus the equalPrefixLength position,
        // so that prefix matches back into the equal prefix work.
    }

    int32_t result;
    int32_t fastLatinOptions = settings->fastLatinOptions;
    // Fast path: requires non-negative options, and each string must either be
    // exhausted at the prefix boundary or continue with a unit <= LATIN_MAX.
    if(fastLatinOptions >= 0 &&
            (equalPrefixLength == leftLength ||
                left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX) &&
            (equalPrefixLength == rightLength ||
                right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX)) {
        if(leftLength >= 0) {
            result = CollationFastLatin::compareUTF16(data->fastLatinTable,
                                                      settings->fastLatinPrimaries,
                                                      fastLatinOptions,
                                                      left + equalPrefixLength,
                                                      leftLength - equalPrefixLength,
                                                      right + equalPrefixLength,
                                                      rightLength - equalPrefixLength);
        } else {
            // NUL-terminated: pass -1 as the length of both remainders.
            result = CollationFastLatin::compareUTF16(data->fastLatinTable,
                                                      settings->fastLatinPrimaries,
                                                      fastLatinOptions,
                                                      left + equalPrefixLength, -1,
                                                      right + equalPrefixLength, -1);
        }
    } else {
        result = CollationFastLatin::BAIL_OUT_RESULT;
    }

    if(result == CollationFastLatin::BAIL_OUT_RESULT) {
        // Full comparison. The iterators get the real string start as well as
        // the position after the equal prefix (see the note above).
        if(settings->dontCheckFCD()) {
            UTF16CollationIterator leftIter(data, numeric,
                                            left, left + equalPrefixLength, leftLimit);
            UTF16CollationIterator rightIter(data, numeric,
                                            right, right + equalPrefixLength, rightLimit);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        } else {
            FCDUTF16CollationIterator leftIter(data, numeric,
                                              left, left + equalPrefixLength, leftLimit);
            FCDUTF16CollationIterator rightIter(data, numeric,
                                                right, right + equalPrefixLength, rightLimit);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        }
    }
    // Done unless the strings are equal so far, the identical level is requested,
    // and no error occurred.
    if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
        return (UCollationResult)result;
    }

    // Note: If NUL-terminated, we could get the actual limits from the iterators now.
    // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
    // and the benefit seems unlikely to be measurable.

    // Compare identical level.
    // Only the text after the equal prefix is compared here.
    const Normalizer2Impl &nfcImpl = data->nfcImpl;
    left += equalPrefixLength;
    right += equalPrefixLength;
    if(settings->dontCheckFCD()) {
        UTF16NFDIterator leftIter(left, leftLimit);
        UTF16NFDIterator rightIter(right, rightLimit);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    } else {
        FCDUTF16NFDIterator leftIter(nfcImpl, left, leftLimit);
        FCDUTF16NFDIterator rightIter(nfcImpl, right, rightLimit);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    }
}
1095
/**
 * Compares two UTF-8 strings with this collator's data and settings.
 *
 * Same overall procedure as the UTF-16 version: identical-prefix test,
 * optional CollationFastLatin::compareUTF8(), then
 * CollationCompare::compareUpToQuaternary(), and finally compareNFDIter()
 * for the identical level. The UTF-8 specifics are that the byte-wise equal
 * prefix is first moved back to a code point boundary, and that the
 * "unsafe backward" tests decode whole code points with
 * U8_NEXT_OR_FFFD / U8_PREV_OR_FFFD.
 *
 * @param left        left string (UTF-8 bytes)
 * @param leftLength  length of left in bytes; a negative value selects the
 *                    NUL-terminated code path for BOTH strings
 * @param right       right string (UTF-8 bytes)
 * @param rightLength length of right in bytes; used only on the length-delimited path
 * @param errorCode   not checked on entry (the caller does that);
 *                    passed on to compareUpToQuaternary()
 * @return the comparison result
 */
UCollationResult
RuleBasedCollator::doCompare(const uint8_t *left, int32_t leftLength,
                             const uint8_t *right, int32_t rightLength,
                             UErrorCode &errorCode) const {
    // U_FAILURE(errorCode) checked by caller.
    // Same pointer and same length argument: nothing to compare.
    if(left == right && leftLength == rightLength) {
        return UCOL_EQUAL;
    }

    // Identical-prefix test.
    // This counts equal bytes, not code points.
    int32_t equalPrefixLength = 0;
    if(leftLength < 0) {
        // NUL-terminated: stop at the first differing byte,
        // or return early when the shared terminator is reached.
        uint8_t c;
        while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
            if(c == 0) { return UCOL_EQUAL; }
            ++equalPrefixLength;
        }
    } else {
        // Length-delimited: stop at the end of either string or at the first differing byte.
        for(;;) {
            if(equalPrefixLength == leftLength) {
                if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
                break;
            } else if(equalPrefixLength == rightLength ||
                      left[equalPrefixLength] != right[equalPrefixLength]) {
                break;
            }
            ++equalPrefixLength;
        }
    }
    // Back up to the start of a partially-equal code point.
    // That is the case when the byte at the boundary of either string is a trail byte.
    if(equalPrefixLength > 0 &&
            ((equalPrefixLength != leftLength && U8_IS_TRAIL(left[equalPrefixLength])) ||
            (equalPrefixLength != rightLength && U8_IS_TRAIL(right[equalPrefixLength])))) {
        while(--equalPrefixLength > 0 && U8_IS_TRAIL(left[equalPrefixLength])) {}
    }

    UBool numeric = settings->isNumeric();
    if(equalPrefixLength > 0) {
        // Decode the code point that follows the prefix in each string (if there
        // is one) and ask the data whether it is unsafe to start there.
        // A scratch index i is used so that equalPrefixLength is not advanced.
        UBool unsafe = FALSE;
        if(equalPrefixLength != leftLength) {
            int32_t i = equalPrefixLength;
            UChar32 c;
            U8_NEXT_OR_FFFD(left, i, leftLength, c);
            unsafe = data->isUnsafeBackward(c, numeric);
        }
        if(!unsafe && equalPrefixLength != rightLength) {
            int32_t i = equalPrefixLength;
            UChar32 c;
            U8_NEXT_OR_FFFD(right, i, rightLength, c);
            unsafe = data->isUnsafeBackward(c, numeric);
        }
        if(unsafe) {
            // Identical prefix: Back up to the start of a contraction or reordering sequence.
            // U8_PREV_OR_FFFD moves equalPrefixLength back by one code point per round.
            UChar32 c;
            do {
                U8_PREV_OR_FFFD(left, 0, equalPrefixLength, c);
            } while(equalPrefixLength > 0 && data->isUnsafeBackward(c, numeric));
        }
        // See the notes in the UTF-16 version.

        // Pass the actual start of each string into the CollationIterators,
        // plus the equalPrefixLength position,
        // so that prefix matches back into the equal prefix work.
    }

    int32_t result;
    int32_t fastLatinOptions = settings->fastLatinOptions;
    // Fast path: requires non-negative options, and each string must either be
    // exhausted at the prefix boundary or continue with a byte <= LATIN_MAX_UTF8_LEAD.
    if(fastLatinOptions >= 0 &&
            (equalPrefixLength == leftLength ||
                left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD) &&
            (equalPrefixLength == rightLength ||
                right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD)) {
        if(leftLength >= 0) {
            result = CollationFastLatin::compareUTF8(data->fastLatinTable,
                                                     settings->fastLatinPrimaries,
                                                     fastLatinOptions,
                                                     left + equalPrefixLength,
                                                     leftLength - equalPrefixLength,
                                                     right + equalPrefixLength,
                                                     rightLength - equalPrefixLength);
        } else {
            // NUL-terminated: pass -1 as the length of both remainders.
            result = CollationFastLatin::compareUTF8(data->fastLatinTable,
                                                     settings->fastLatinPrimaries,
                                                     fastLatinOptions,
                                                     left + equalPrefixLength, -1,
                                                     right + equalPrefixLength, -1);
        }
    } else {
        result = CollationFastLatin::BAIL_OUT_RESULT;
    }

    if(result == CollationFastLatin::BAIL_OUT_RESULT) {
        // Full comparison. The iterators get the real string start, the index
        // after the equal prefix, and the original length argument.
        if(settings->dontCheckFCD()) {
            UTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
            UTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        } else {
            FCDUTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
            FCDUTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        }
    }
    // Done unless the strings are equal so far, the identical level is requested,
    // and no error occurred.
    if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
        return (UCollationResult)result;
    }

    // Note: If NUL-terminated, we could get the actual limits from the iterators now.
    // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
    // and the benefit seems unlikely to be measurable.

    // Compare identical level.
    // Only the text after the equal prefix is compared here.
    const Normalizer2Impl &nfcImpl = data->nfcImpl;
    left += equalPrefixLength;
    right += equalPrefixLength;
    // Shorten the lengths to match the advanced pointers. Negative (NUL-terminated)
    // lengths stay as they are. With leftLength == 0 the prefix loop above
    // cannot have advanced, so equalPrefixLength is 0 and nothing needs adjusting.
    if(leftLength > 0) {
        leftLength -= equalPrefixLength;
        rightLength -= equalPrefixLength;
    }
    if(settings->dontCheckFCD()) {
        UTF8NFDIterator leftIter(left, leftLength);
        UTF8NFDIterator rightIter(right, rightLength);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    } else {
        FCDUTF8NFDIterator leftIter(data, left, leftLength);
        FCDUTF8NFDIterator rightIter(data, right, rightLength);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    }
}
1224
/**
 * Compares the texts delivered by two UCharIterators.
 *
 * Both iterators are read and repositioned by this function. The procedure is:
 * identical-prefix test on code units, backing up over "unsafe backward" units,
 * CollationCompare::compareUpToQuaternary(), and, if that yields UCOL_EQUAL with
 * a strength of at least UCOL_IDENTICAL and no error, compareNFDIter().
 * There is no fast-Latin attempt in this version.
 *
 * @param left      iterator over the left text
 * @param right     iterator over the right text
 * @param errorCode in/out error code; if it already indicates a failure,
 *                  UCOL_EQUAL is returned without reading the iterators
 * @return the comparison result; UCOL_EQUAL also when left and right are the same object
 */
UCollationResult
RuleBasedCollator::compare(UCharIterator &left, UCharIterator &right,
                           UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode) || &left == &right) { return UCOL_EQUAL; }
    UBool numeric = settings->isNumeric();

    // Identical-prefix test.
    int32_t equalPrefixLength = 0;
    {
        UChar32 leftUnit;
        UChar32 rightUnit;
        // Read one unit from each side per round. A negative value on both
        // sides means that both texts ended together.
        while((leftUnit = left.next(&left)) == (rightUnit = right.next(&right))) {
            if(leftUnit < 0) { return UCOL_EQUAL; }
            ++equalPrefixLength;
        }

        // Back out the code units that differed, for the real collation comparison.
        // A negative unit was not a real read, so there is nothing to back out for it.
        if(leftUnit >= 0) { left.previous(&left); }
        if(rightUnit >= 0) { right.previous(&right); }

        if(equalPrefixLength > 0) {
            if((leftUnit >= 0 && data->isUnsafeBackward(leftUnit, numeric)) ||
                    (rightUnit >= 0 && data->isUnsafeBackward(rightUnit, numeric))) {
                // Identical prefix: Back up to the start of a contraction or reordering sequence.
                // Both iterators are moved back in step; only the left unit is
                // tested because the prefix is the same on both sides.
                do {
                    --equalPrefixLength;
                    leftUnit = left.previous(&left);
                    right.previous(&right);
                } while(equalPrefixLength > 0 && data->isUnsafeBackward(leftUnit, numeric));
            }
            // See the notes in the UTF-16 version.
        }
    }

    UCollationResult result;
    if(settings->dontCheckFCD()) {
        UIterCollationIterator leftIter(data, numeric, left);
        UIterCollationIterator rightIter(data, numeric, right);
        result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
    } else {
        // The FCD-checking iterators are additionally given the index after the equal prefix.
        FCDUIterCollationIterator leftIter(data, numeric, left, equalPrefixLength);
        FCDUIterCollationIterator rightIter(data, numeric, right, equalPrefixLength);
        result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
    }
    // Done unless the texts are equal so far, the identical level is requested,
    // and no error occurred.
    if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
        return result;
    }

    // Compare identical level.
    // Put both iterators back to index equalPrefixLength (origin UITER_ZERO)
    // before reading the text again.
    left.move(&left, equalPrefixLength, UITER_ZERO);
    right.move(&right, equalPrefixLength, UITER_ZERO);
    const Normalizer2Impl &nfcImpl = data->nfcImpl;
    if(settings->dontCheckFCD()) {
        UIterNFDIterator leftIter(left);
        UIterNFDIterator rightIter(right);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    } else {
        FCDUIterNFDIterator leftIter(data, left, equalPrefixLength);
        FCDUIterNFDIterator rightIter(data, right, equalPrefixLength);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    }
}
1287
1288CollationKey &
1289RuleBasedCollator::getCollationKey(const UnicodeString &s, CollationKey &key,
1290 UErrorCode &errorCode) const {
1291 return getCollationKey(s.getBuffer(), s.length(), key, errorCode);
1292}
1293
1294CollationKey &
1295RuleBasedCollator::getCollationKey(const UChar *s, int32_t length, CollationKey& key,
1296 UErrorCode &errorCode) const {
1297 if(U_FAILURE(errorCode)) {
1298 return key.setToBogus();
1299 }
1300 if(s == NULL && length != 0) {
1301 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1302 return key.setToBogus();
1303 }
1304 key.reset(); // resets the "bogus" state
1305 CollationKeyByteSink sink(key);
1306 writeSortKey(s, length, sink, errorCode);
1307 if(U_FAILURE(errorCode)) {
1308 key.setToBogus();
1309 } else if(key.isBogus()) {
1310 errorCode = U_MEMORY_ALLOCATION_ERROR;
1311 } else {
1312 key.setLength(sink.NumberOfBytesAppended());
1313 }
1314 return key;
1315}
1316
1317int32_t
1318RuleBasedCollator::getSortKey(const UnicodeString &s,
1319 uint8_t *dest, int32_t capacity) const {
1320 return getSortKey(s.getBuffer(), s.length(), dest, capacity);
1321}
1322
1323int32_t
1324RuleBasedCollator::getSortKey(const UChar *s, int32_t length,
1325 uint8_t *dest, int32_t capacity) const {
1326 if((s == NULL && length != 0) || capacity < 0 || (dest == NULL && capacity > 0)) {
1327 return 0;
1328 }
1329 uint8_t noDest[1] = { 0 };
1330 if(dest == NULL) {
1331 // Distinguish pure preflighting from an allocation error.
1332 dest = noDest;
1333 capacity = 0;
1334 }
1335 FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), capacity);
1336 UErrorCode errorCode = U_ZERO_ERROR;
1337 writeSortKey(s, length, sink, errorCode);
1338 return U_SUCCESS(errorCode) ? sink.NumberOfBytesAppended() : 0;
1339}
1340
1341void
1342RuleBasedCollator::writeSortKey(const UChar *s, int32_t length,
1343 SortKeyByteSink &sink, UErrorCode &errorCode) const {
1344 if(U_FAILURE(errorCode)) { return; }
1345 const UChar *limit = (length >= 0) ? s + length : NULL;
1346 UBool numeric = settings->isNumeric();
1347 CollationKeys::LevelCallback callback;
1348 if(settings->dontCheckFCD()) {
1349 UTF16CollationIterator iter(data, numeric, s, s, limit);
1350 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
1351 sink, Collation::PRIMARY_LEVEL,
1352 callback, TRUE, errorCode);
1353 } else {
1354 FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
1355 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
1356 sink, Collation::PRIMARY_LEVEL,
1357 callback, TRUE, errorCode);
1358 }
1359 if(settings->getStrength() == UCOL_IDENTICAL) {
1360 writeIdenticalLevel(s, limit, sink, errorCode);
1361 }
1362 static const char terminator = 0; // TERMINATOR_BYTE
1363 sink.Append(&terminator, 1);
1364}
1365
1366void
1367RuleBasedCollator::writeIdenticalLevel(const UChar *s, const UChar *limit,
1368 SortKeyByteSink &sink, UErrorCode &errorCode) const {
1369 // NFD quick check
1370 const UChar *nfdQCYesLimit = data->nfcImpl.decompose(s, limit, NULL, errorCode);
1371 if(U_FAILURE(errorCode)) { return; }
1372 sink.Append(Collation::LEVEL_SEPARATOR_BYTE);
1373 UChar32 prev = 0;
1374 if(nfdQCYesLimit != s) {
1375 prev = u_writeIdenticalLevelRun(prev, s, (int32_t)(nfdQCYesLimit - s), sink);
1376 }
1377 // Is there non-NFD text?
1378 int32_t destLengthEstimate;
1379 if(limit != NULL) {
1380 if(nfdQCYesLimit == limit) { return; }
1381 destLengthEstimate = (int32_t)(limit - nfdQCYesLimit);
1382 } else {
1383 // s is NUL-terminated
1384 if(*nfdQCYesLimit == 0) { return; }
1385 destLengthEstimate = -1;
1386 }
1387 UnicodeString nfd;
1388 data->nfcImpl.decompose(nfdQCYesLimit, limit, nfd, destLengthEstimate, errorCode);
1389 u_writeIdenticalLevelRun(prev, nfd.getBuffer(), nfd.length(), sink);
1390}
1391
1392namespace {
1393
1394/**
1395 * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary()
1396 * with an instance of this callback class.
1397 * When another level is about to be written, the callback
1398 * records the level and the number of bytes that will be written until
1399 * the sink (which is actually a FixedSortKeyByteSink) fills up.
1400 *
1401 * When internalNextSortKeyPart() is called again, it restarts with the last level
1402 * and ignores as many bytes as were written previously for that level.
1403 */
1404class PartLevelCallback : public CollationKeys::LevelCallback {
1405public:
1406 PartLevelCallback(const SortKeyByteSink &s)
1407 : sink(s), level(Collation::PRIMARY_LEVEL) {
1408 levelCapacity = sink.GetRemainingCapacity();
1409 }
1410 virtual ~PartLevelCallback() {}
1411 virtual UBool needToWrite(Collation::Level l) {
1412 if(!sink.Overflowed()) {
1413 // Remember a level that will be at least partially written.
1414 level = l;
1415 levelCapacity = sink.GetRemainingCapacity();
1416 return TRUE;
1417 } else {
1418 return FALSE;
1419 }
1420 }
1421 Collation::Level getLevel() const { return level; }
1422 int32_t getLevelCapacity() const { return levelCapacity; }
1423
1424private:
1425 const SortKeyByteSink &sink;
1426 Collation::Level level;
1427 int32_t levelCapacity;
1428};
1429
1430} // namespace
1431
/**
 * Writes the next piece of a sort key for the text of a UCharIterator.
 *
 * The function is resumable through the two state words:
 * state[0] holds the level to continue with and state[1] the number of bytes
 * of that level that the sink has to ignore because they were produced earlier.
 * Every call rewinds the iterator to its start and regenerates the key from
 * the stored level on.
 *
 * @param iter      iterator over the text; must not be NULL
 * @param state     two in/out state words as described above; must not be NULL
 * @param dest      output buffer; must not be NULL if count > 0
 * @param count     number of bytes requested; must not be negative
 * @param errorCode in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for invalid arguments
 * @return count if more bytes were produced than fit (the state then points at
 *         the level to resume); otherwise the number of bytes appended, with
 *         the rest of dest filled with 00 bytes; 0 on error or when count is 0
 */
int32_t
RuleBasedCollator::internalNextSortKeyPart(UCharIterator *iter, uint32_t state[2],
                                           uint8_t *dest, int32_t count, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return 0; }
    if(iter == NULL || state == NULL || count < 0 || (count > 0 && dest == NULL)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if(count == 0) { return 0; }

    // The sink writes at most count bytes into dest and first drops the
    // state[1] bytes that earlier calls already delivered for the current level.
    FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), count);
    sink.IgnoreBytes((int32_t)state[1]);
    iter->move(iter, 0, UITER_START);

    Collation::Level level = (Collation::Level)state[0];
    if(level <= Collation::QUATERNARY_LEVEL) {
        UBool numeric = settings->isNumeric();
        // The callback tracks which level was being written when the sink filled up.
        PartLevelCallback callback(sink);
        if(settings->dontCheckFCD()) {
            UIterCollationIterator ci(data, numeric, *iter);
            CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
                                                      sink, level, callback, FALSE, errorCode);
        } else {
            FCDUIterCollationIterator ci(data, numeric, *iter, 0);
            CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
                                                      sink, level, callback, FALSE, errorCode);
        }
        if(U_FAILURE(errorCode)) { return 0; }
        if(sink.NumberOfBytesAppended() > count) {
            // dest is full: store where to resume and report a full buffer.
            state[0] = (uint32_t)callback.getLevel();
            state[1] = (uint32_t)callback.getLevelCapacity();
            return count;
        }
        // All of the normal levels are done.
        if(settings->getStrength() == UCOL_IDENTICAL) {
            level = Collation::IDENTICAL_LEVEL;
            iter->move(iter, 0, UITER_START);
        }
        // else fall through to setting ZERO_LEVEL
    }

    if(level == Collation::IDENTICAL_LEVEL) {
        // Remember the capacity before this level starts, for the resume state.
        int32_t levelCapacity = sink.GetRemainingCapacity();
        // Collect the whole text of the iterator in a string,
        // because writeIdenticalLevel() works on a UTF-16 array.
        UnicodeString s;
        for(;;) {
            UChar32 c = iter->next(iter);
            if(c < 0) { break; }
            s.append((UChar)c);
        }
        const UChar *sArray = s.getBuffer();
        writeIdenticalLevel(sArray, sArray + s.length(), sink, errorCode);
        if(U_FAILURE(errorCode)) { return 0; }
        if(sink.NumberOfBytesAppended() > count) {
            state[0] = (uint32_t)level;
            state[1] = (uint32_t)levelCapacity;
            return count;
        }
    }

    // ZERO_LEVEL: Fill the remainder of dest with 00 bytes.
    state[0] = (uint32_t)Collation::ZERO_LEVEL;
    state[1] = 0;
    int32_t length = sink.NumberOfBytesAppended();
    int32_t i = length;
    while(i < count) { dest[i++] = 0; }
    return length;
}
1499
1500void
1501RuleBasedCollator::internalGetCEs(const UnicodeString &str, UVector64 &ces,
1502 UErrorCode &errorCode) const {
1503 if(U_FAILURE(errorCode)) { return; }
1504 const UChar *s = str.getBuffer();
1505 const UChar *limit = s + str.length();
1506 UBool numeric = settings->isNumeric();
1507 if(settings->dontCheckFCD()) {
1508 UTF16CollationIterator iter(data, numeric, s, s, limit);
1509 int64_t ce;
1510 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
1511 ces.addElement(ce, errorCode);
1512 }
1513 } else {
1514 FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
1515 int64_t ce;
1516 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
1517 ces.addElement(ce, errorCode);
1518 }
1519 }
1520}
1521
1522namespace {
1523
1524void appendSubtag(CharString &s, char letter, const char *subtag, int32_t length,
1525 UErrorCode &errorCode) {
1526 if(U_FAILURE(errorCode) || length == 0) { return; }
1527 if(!s.isEmpty()) {
1528 s.append('_', errorCode);
1529 }
1530 s.append(letter, errorCode);
1531 for(int32_t i = 0; i < length; ++i) {
1532 s.append(uprv_toupper(subtag[i]), errorCode);
1533 }
1534}
1535
1536void appendAttribute(CharString &s, char letter, UColAttributeValue value,
1537 UErrorCode &errorCode) {
1538 if(U_FAILURE(errorCode)) { return; }
1539 if(!s.isEmpty()) {
1540 s.append('_', errorCode);
1541 }
1542 static const char *valueChars = "1234...........IXO..SN..LU......";
1543 s.append(letter, errorCode);
1544 s.append(valueChars[value], errorCode);
1545}
1546
1547} // namespace
1548
1549int32_t
1550RuleBasedCollator::internalGetShortDefinitionString(const char *locale,
1551 char *buffer, int32_t capacity,
1552 UErrorCode &errorCode) const {
1553 if(U_FAILURE(errorCode)) { return 0; }
1554 if(buffer == NULL ? capacity != 0 : capacity < 0) {
1555 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1556 return 0;
1557 }
1558 if(locale == NULL) {
1559 locale = internalGetLocaleID(ULOC_VALID_LOCALE, errorCode);
1560 }
1561
1562 char resultLocale[ULOC_FULLNAME_CAPACITY + 1];
1563 int32_t length = ucol_getFunctionalEquivalent(resultLocale, ULOC_FULLNAME_CAPACITY,
1564 "collation", locale,
1565 NULL, &errorCode);
1566 if(U_FAILURE(errorCode)) { return 0; }
1567 if(length == 0) {
1568 uprv_strcpy(resultLocale, "root");
1569 } else {
1570 resultLocale[length] = 0;
1571 }
1572
1573 // Append items in alphabetic order of their short definition letters.
1574 CharString result;
1575 char subtag[ULOC_KEYWORD_AND_VALUES_CAPACITY];
1576
1577 if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING)) {
1578 appendAttribute(result, 'A', getAttribute(UCOL_ALTERNATE_HANDLING, errorCode), errorCode);
1579 }
1580 // ATTR_VARIABLE_TOP not supported because 'B' was broken.
1581 // See ICU tickets #10372 and #10386.
1582 if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST)) {
1583 appendAttribute(result, 'C', getAttribute(UCOL_CASE_FIRST, errorCode), errorCode);
1584 }
1585 if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION)) {
1586 appendAttribute(result, 'D', getAttribute(UCOL_NUMERIC_COLLATION, errorCode), errorCode);
1587 }
1588 if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL)) {
1589 appendAttribute(result, 'E', getAttribute(UCOL_CASE_LEVEL, errorCode), errorCode);
1590 }
1591 if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION)) {
1592 appendAttribute(result, 'F', getAttribute(UCOL_FRENCH_COLLATION, errorCode), errorCode);
1593 }
1594 // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default.
1595 length = uloc_getKeywordValue(resultLocale, "collation", subtag, UPRV_LENGTHOF(subtag), &errorCode);
1596 appendSubtag(result, 'K', subtag, length, errorCode);
1597 length = uloc_getLanguage(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1598 appendSubtag(result, 'L', subtag, length, errorCode);
1599 if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE)) {
1600 appendAttribute(result, 'N', getAttribute(UCOL_NORMALIZATION_MODE, errorCode), errorCode);
1601 }
1602 length = uloc_getCountry(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1603 appendSubtag(result, 'R', subtag, length, errorCode);
1604 if(attributeHasBeenSetExplicitly(UCOL_STRENGTH)) {
1605 appendAttribute(result, 'S', getAttribute(UCOL_STRENGTH, errorCode), errorCode);
1606 }
1607 length = uloc_getVariant(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1608 appendSubtag(result, 'V', subtag, length, errorCode);
1609 length = uloc_getScript(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1610 appendSubtag(result, 'Z', subtag, length, errorCode);
1611
1612 if(U_FAILURE(errorCode)) { return 0; }
1613 if(result.length() <= capacity) {
1614 uprv_memcpy(buffer, result.data(), result.length());
1615 }
1616 return u_terminateChars(buffer, capacity, result.length(), &errorCode);
1617}
1618
1619UBool
1620RuleBasedCollator::isUnsafe(UChar32 c) const {
1621 return data->isUnsafeBackward(c, settings->isNumeric());
1622}
1623
/**
 * Computes the max-expansions data for a tailoring and stores it in
 * t->maxExpansions. Passed to umtx_initOnce() by initMaxExpansions().
 *
 * @param t         the tailoring whose data is analyzed and whose
 *                  maxExpansions field receives the result
 * @param errorCode in/out error code, handed to
 *                  CollationElementIterator::computeMaxExpansions()
 */
void
RuleBasedCollator::computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode) {
    // The field is assigned through a pointer-to-const.
    t->maxExpansions = CollationElementIterator::computeMaxExpansions(t->data, errorCode);
}
1628
1629UBool
1630RuleBasedCollator::initMaxExpansions(UErrorCode &errorCode) const {
1631 umtx_initOnce(tailoring->maxExpansionsInitOnce, computeMaxExpansions, tailoring, errorCode);
1632 return U_SUCCESS(errorCode);
1633}
1634
1635CollationElementIterator *
1636RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) const {
1637 UErrorCode errorCode = U_ZERO_ERROR;
1638 if(!initMaxExpansions(errorCode)) { return NULL; }
1639 CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
1640 if(U_FAILURE(errorCode)) {
1641 delete cei;
1642 return NULL;
1643 }
1644 return cei;
1645}
1646
1647CollationElementIterator *
1648RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const {
1649 UErrorCode errorCode = U_ZERO_ERROR;
1650 if(!initMaxExpansions(errorCode)) { return NULL; }
1651 CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
1652 if(U_FAILURE(errorCode)) {
1653 delete cei;
1654 return NULL;
1655 }
1656 return cei;
1657}
1658
1659int32_t
1660RuleBasedCollator::getMaxExpansion(int32_t order) const {
1661 UErrorCode errorCode = U_ZERO_ERROR;
1662 (void)initMaxExpansions(errorCode);
1663 return CollationElementIterator::getMaxExpansion(tailoring->maxExpansions, order);
1664}
1665
1666U_NAMESPACE_END
1667
1668#endif // !UCONFIG_NO_COLLATION