/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationsettings.cpp
*
* created on: 2013feb07
* created by: Markus W. Scherer
*/
| 11 | |
| 12 | #include "unicode/utypes.h" |
| 13 | |
| 14 | #if !UCONFIG_NO_COLLATION |
| 15 | |
| 16 | #include "unicode/ucol.h" |
| 17 | #include "cmemory.h" |
| 18 | #include "collation.h" |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 19 | #include "collationdata.h" |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 20 | #include "collationsettings.h" |
| 21 | #include "sharedobject.h" |
| 22 | #include "uassert.h" |
| 23 | #include "umutex.h" |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 24 | #include "uvectr32.h" |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 25 | |
| 26 | U_NAMESPACE_BEGIN |
| 27 | |
| 28 | CollationSettings::CollationSettings(const CollationSettings &other) |
| 29 | : SharedObject(other), |
| 30 | options(other.options), variableTop(other.variableTop), |
| 31 | reorderTable(NULL), |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 32 | minHighNoReorder(other.minHighNoReorder), |
| 33 | reorderRanges(NULL), reorderRangesLength(0), |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 34 | reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0), |
| 35 | fastLatinOptions(other.fastLatinOptions) { |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 36 | UErrorCode errorCode = U_ZERO_ERROR; |
| 37 | copyReorderingFrom(other, errorCode); |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 38 | if(fastLatinOptions >= 0) { |
| 39 | uprv_memcpy(fastLatinPrimaries, other.fastLatinPrimaries, sizeof(fastLatinPrimaries)); |
| 40 | } |
| 41 | } |
| 42 | |
| 43 | CollationSettings::~CollationSettings() { |
| 44 | if(reorderCodesCapacity != 0) { |
| 45 | uprv_free(const_cast<int32_t *>(reorderCodes)); |
| 46 | } |
| 47 | } |
| 48 | |
| 49 | UBool |
| 50 | CollationSettings::operator==(const CollationSettings &other) const { |
| 51 | if(options != other.options) { return FALSE; } |
| 52 | if((options & ALTERNATE_MASK) != 0 && variableTop != other.variableTop) { return FALSE; } |
| 53 | if(reorderCodesLength != other.reorderCodesLength) { return FALSE; } |
| 54 | for(int32_t i = 0; i < reorderCodesLength; ++i) { |
| 55 | if(reorderCodes[i] != other.reorderCodes[i]) { return FALSE; } |
| 56 | } |
| 57 | return TRUE; |
| 58 | } |
| 59 | |
| 60 | int32_t |
| 61 | CollationSettings::hashCode() const { |
| 62 | int32_t h = options << 8; |
| 63 | if((options & ALTERNATE_MASK) != 0) { h ^= variableTop; } |
| 64 | h ^= reorderCodesLength; |
| 65 | for(int32_t i = 0; i < reorderCodesLength; ++i) { |
| 66 | h ^= (reorderCodes[i] << i); |
| 67 | } |
| 68 | return h; |
| 69 | } |
| 70 | |
| 71 | void |
| 72 | CollationSettings::resetReordering() { |
| 73 | // When we turn off reordering, we want to set a NULL permutation |
| 74 | // rather than a no-op permutation. |
| 75 | // Keep the memory via reorderCodes and its capacity. |
| 76 | reorderTable = NULL; |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 77 | minHighNoReorder = 0; |
| 78 | reorderRangesLength = 0; |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 79 | reorderCodesLength = 0; |
| 80 | } |
| 81 | |
| 82 | void |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 83 | CollationSettings::aliasReordering(const CollationData &data, const int32_t *codes, int32_t length, |
| 84 | const uint32_t *ranges, int32_t rangesLength, |
| 85 | const uint8_t *table, UErrorCode &errorCode) { |
| 86 | if(U_FAILURE(errorCode)) { return; } |
| 87 | if(table != NULL && |
| 88 | (rangesLength == 0 ? |
| 89 | !reorderTableHasSplitBytes(table) : |
| 90 | rangesLength >= 2 && |
| 91 | // The first offset must be 0. The last offset must not be 0. |
| 92 | (ranges[0] & 0xffff) == 0 && (ranges[rangesLength - 1] & 0xffff) != 0)) { |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 93 | // We need to release the memory before setting the alias pointer. |
| 94 | if(reorderCodesCapacity != 0) { |
| 95 | uprv_free(const_cast<int32_t *>(reorderCodes)); |
| 96 | reorderCodesCapacity = 0; |
| 97 | } |
| 98 | reorderTable = table; |
| 99 | reorderCodes = codes; |
| 100 | reorderCodesLength = length; |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 101 | // Drop ranges before the first split byte. They are reordered by the table. |
| 102 | // This then speeds up reordering of the remaining ranges. |
| 103 | int32_t firstSplitByteRangeIndex = 0; |
| 104 | while(firstSplitByteRangeIndex < rangesLength && |
| 105 | (ranges[firstSplitByteRangeIndex] & 0xff0000) == 0) { |
| 106 | // The second byte of the primary limit is 0. |
| 107 | ++firstSplitByteRangeIndex; |
| 108 | } |
| 109 | if(firstSplitByteRangeIndex == rangesLength) { |
| 110 | U_ASSERT(!reorderTableHasSplitBytes(table)); |
| 111 | minHighNoReorder = 0; |
| 112 | reorderRanges = NULL; |
| 113 | reorderRangesLength = 0; |
| 114 | } else { |
| 115 | U_ASSERT(table[ranges[firstSplitByteRangeIndex] >> 24] == 0); |
| 116 | minHighNoReorder = ranges[rangesLength - 1] & 0xffff0000; |
| 117 | reorderRanges = ranges + firstSplitByteRangeIndex; |
| 118 | reorderRangesLength = rangesLength - firstSplitByteRangeIndex; |
| 119 | } |
| 120 | return; |
| 121 | } |
| 122 | // Regenerate missing data. |
| 123 | setReordering(data, codes, length, errorCode); |
| 124 | } |
| 125 | |
| 126 | void |
| 127 | CollationSettings::setReordering(const CollationData &data, |
| 128 | const int32_t *codes, int32_t codesLength, |
| 129 | UErrorCode &errorCode) { |
| 130 | if(U_FAILURE(errorCode)) { return; } |
| 131 | if(codesLength == 0 || (codesLength == 1 && codes[0] == UCOL_REORDER_CODE_NONE)) { |
| 132 | resetReordering(); |
| 133 | return; |
| 134 | } |
| 135 | UVector32 rangesList(errorCode); |
| 136 | data.makeReorderRanges(codes, codesLength, rangesList, errorCode); |
| 137 | if(U_FAILURE(errorCode)) { return; } |
| 138 | int32_t rangesLength = rangesList.size(); |
| 139 | if(rangesLength == 0) { |
| 140 | resetReordering(); |
| 141 | return; |
| 142 | } |
| 143 | const uint32_t *ranges = reinterpret_cast<uint32_t *>(rangesList.getBuffer()); |
| 144 | // ranges[] contains at least two (limit, offset) pairs. |
| 145 | // The first offset must be 0. The last offset must not be 0. |
| 146 | // Separators (at the low end) and trailing weights (at the high end) |
| 147 | // are never reordered. |
| 148 | U_ASSERT(rangesLength >= 2); |
| 149 | U_ASSERT((ranges[0] & 0xffff) == 0 && (ranges[rangesLength - 1] & 0xffff) != 0); |
| 150 | minHighNoReorder = ranges[rangesLength - 1] & 0xffff0000; |
| 151 | |
| 152 | // Write the lead byte permutation table. |
| 153 | // Set a 0 for each lead byte that has a range boundary in the middle. |
| 154 | uint8_t table[256]; |
| 155 | int32_t b = 0; |
| 156 | int32_t firstSplitByteRangeIndex = -1; |
| 157 | for(int32_t i = 0; i < rangesLength; ++i) { |
| 158 | uint32_t pair = ranges[i]; |
| 159 | int32_t limit1 = (int32_t)(pair >> 24); |
| 160 | while(b < limit1) { |
| 161 | table[b] = (uint8_t)(b + pair); |
| 162 | ++b; |
| 163 | } |
| 164 | // Check the second byte of the limit. |
| 165 | if((pair & 0xff0000) != 0) { |
| 166 | table[limit1] = 0; |
| 167 | b = limit1 + 1; |
| 168 | if(firstSplitByteRangeIndex < 0) { |
| 169 | firstSplitByteRangeIndex = i; |
| 170 | } |
| 171 | } |
| 172 | } |
| 173 | while(b <= 0xff) { |
| 174 | table[b] = (uint8_t)b; |
| 175 | ++b; |
| 176 | } |
| 177 | if(firstSplitByteRangeIndex < 0) { |
| 178 | // The lead byte permutation table alone suffices for reordering. |
| 179 | rangesLength = 0; |
| 180 | } else { |
| 181 | // Remove the ranges below the first split byte. |
| 182 | ranges += firstSplitByteRangeIndex; |
| 183 | rangesLength -= firstSplitByteRangeIndex; |
| 184 | } |
| 185 | setReorderArrays(codes, codesLength, ranges, rangesLength, table, errorCode); |
| 186 | } |
| 187 | |
| 188 | void |
| 189 | CollationSettings::setReorderArrays(const int32_t *codes, int32_t codesLength, |
| 190 | const uint32_t *ranges, int32_t rangesLength, |
| 191 | const uint8_t *table, UErrorCode &errorCode) { |
| 192 | if(U_FAILURE(errorCode)) { return; } |
| 193 | int32_t *ownedCodes; |
| 194 | int32_t totalLength = codesLength + rangesLength; |
| 195 | U_ASSERT(totalLength > 0); |
| 196 | if(totalLength <= reorderCodesCapacity) { |
| 197 | ownedCodes = const_cast<int32_t *>(reorderCodes); |
| 198 | } else { |
| 199 | // Allocate one memory block for the codes, the ranges, and the 16-aligned table. |
| 200 | int32_t capacity = (totalLength + 3) & ~3; // round up to a multiple of 4 ints |
| 201 | ownedCodes = (int32_t *)uprv_malloc(capacity * 4 + 256); |
| 202 | if(ownedCodes == NULL) { |
| 203 | resetReordering(); |
| 204 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 205 | return; |
| 206 | } |
| 207 | if(reorderCodesCapacity != 0) { |
| 208 | uprv_free(const_cast<int32_t *>(reorderCodes)); |
| 209 | } |
| 210 | reorderCodes = ownedCodes; |
| 211 | reorderCodesCapacity = capacity; |
| 212 | } |
| 213 | uprv_memcpy(ownedCodes + reorderCodesCapacity, table, 256); |
| 214 | uprv_memcpy(ownedCodes, codes, codesLength * 4); |
| 215 | uprv_memcpy(ownedCodes + codesLength, ranges, rangesLength * 4); |
| 216 | reorderTable = reinterpret_cast<const uint8_t *>(reorderCodes + reorderCodesCapacity); |
| 217 | reorderCodesLength = codesLength; |
| 218 | reorderRanges = reinterpret_cast<uint32_t *>(ownedCodes) + codesLength; |
| 219 | reorderRangesLength = rangesLength; |
| 220 | } |
| 221 | |
| 222 | void |
| 223 | CollationSettings::copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode) { |
| 224 | if(U_FAILURE(errorCode)) { return; } |
| 225 | if(!other.hasReordering()) { |
| 226 | resetReordering(); |
| 227 | return; |
| 228 | } |
| 229 | minHighNoReorder = other.minHighNoReorder; |
| 230 | if(other.reorderCodesCapacity == 0) { |
| 231 | // The reorder arrays are aliased to memory-mapped data. |
| 232 | reorderTable = other.reorderTable; |
| 233 | reorderRanges = other.reorderRanges; |
| 234 | reorderRangesLength = other.reorderRangesLength; |
| 235 | reorderCodes = other.reorderCodes; |
| 236 | reorderCodesLength = other.reorderCodesLength; |
| 237 | } else { |
| 238 | setReorderArrays(other.reorderCodes, other.reorderCodesLength, |
| 239 | other.reorderRanges, other.reorderRangesLength, |
| 240 | other.reorderTable, errorCode); |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 241 | } |
| 242 | } |
| 243 | |
| 244 | UBool |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 245 | CollationSettings::reorderTableHasSplitBytes(const uint8_t table[256]) { |
| 246 | U_ASSERT(table[0] == 0); |
| 247 | for(int32_t i = 1; i < 256; ++i) { |
| 248 | if(table[i] == 0) { |
| 249 | return TRUE; |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 250 | } |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 251 | } |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 252 | return FALSE; |
| 253 | } |
| 254 | |
| 255 | uint32_t |
| 256 | CollationSettings::reorderEx(uint32_t p) const { |
| 257 | if(p >= minHighNoReorder) { return p; } |
| 258 | // Round up p so that its lower 16 bits are >= any offset bits. |
| 259 | // Then compare q directly with (limit, offset) pairs. |
| 260 | uint32_t q = p | 0xffff; |
| 261 | uint32_t r; |
| 262 | const uint32_t *ranges = reorderRanges; |
| 263 | while(q >= (r = *ranges)) { ++ranges; } |
| 264 | return p + (r << 24); |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 265 | } |
| 266 | |
| 267 | void |
| 268 | CollationSettings::setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode) { |
| 269 | if(U_FAILURE(errorCode)) { return; } |
| 270 | int32_t noStrength = options & ~STRENGTH_MASK; |
| 271 | switch(value) { |
| 272 | case UCOL_PRIMARY: |
| 273 | case UCOL_SECONDARY: |
| 274 | case UCOL_TERTIARY: |
| 275 | case UCOL_QUATERNARY: |
| 276 | case UCOL_IDENTICAL: |
| 277 | options = noStrength | (value << STRENGTH_SHIFT); |
| 278 | break; |
| 279 | case UCOL_DEFAULT: |
| 280 | options = noStrength | (defaultOptions & STRENGTH_MASK); |
| 281 | break; |
| 282 | default: |
| 283 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 284 | break; |
| 285 | } |
| 286 | } |
| 287 | |
| 288 | void |
| 289 | CollationSettings::setFlag(int32_t bit, UColAttributeValue value, |
| 290 | int32_t defaultOptions, UErrorCode &errorCode) { |
| 291 | if(U_FAILURE(errorCode)) { return; } |
| 292 | switch(value) { |
| 293 | case UCOL_ON: |
| 294 | options |= bit; |
| 295 | break; |
| 296 | case UCOL_OFF: |
| 297 | options &= ~bit; |
| 298 | break; |
| 299 | case UCOL_DEFAULT: |
| 300 | options = (options & ~bit) | (defaultOptions & bit); |
| 301 | break; |
| 302 | default: |
| 303 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 304 | break; |
| 305 | } |
| 306 | } |
| 307 | |
| 308 | void |
| 309 | CollationSettings::setCaseFirst(UColAttributeValue value, |
| 310 | int32_t defaultOptions, UErrorCode &errorCode) { |
| 311 | if(U_FAILURE(errorCode)) { return; } |
| 312 | int32_t noCaseFirst = options & ~CASE_FIRST_AND_UPPER_MASK; |
| 313 | switch(value) { |
| 314 | case UCOL_OFF: |
| 315 | options = noCaseFirst; |
| 316 | break; |
| 317 | case UCOL_LOWER_FIRST: |
| 318 | options = noCaseFirst | CASE_FIRST; |
| 319 | break; |
| 320 | case UCOL_UPPER_FIRST: |
| 321 | options = noCaseFirst | CASE_FIRST_AND_UPPER_MASK; |
| 322 | break; |
| 323 | case UCOL_DEFAULT: |
| 324 | options = noCaseFirst | (defaultOptions & CASE_FIRST_AND_UPPER_MASK); |
| 325 | break; |
| 326 | default: |
| 327 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 328 | break; |
| 329 | } |
| 330 | } |
| 331 | |
| 332 | void |
| 333 | CollationSettings::setAlternateHandling(UColAttributeValue value, |
| 334 | int32_t defaultOptions, UErrorCode &errorCode) { |
| 335 | if(U_FAILURE(errorCode)) { return; } |
| 336 | int32_t noAlternate = options & ~ALTERNATE_MASK; |
| 337 | switch(value) { |
| 338 | case UCOL_NON_IGNORABLE: |
| 339 | options = noAlternate; |
| 340 | break; |
| 341 | case UCOL_SHIFTED: |
| 342 | options = noAlternate | SHIFTED; |
| 343 | break; |
| 344 | case UCOL_DEFAULT: |
| 345 | options = noAlternate | (defaultOptions & ALTERNATE_MASK); |
| 346 | break; |
| 347 | default: |
| 348 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 349 | break; |
| 350 | } |
| 351 | } |
| 352 | |
| 353 | void |
| 354 | CollationSettings::setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode) { |
| 355 | if(U_FAILURE(errorCode)) { return; } |
| 356 | int32_t noMax = options & ~MAX_VARIABLE_MASK; |
| 357 | switch(value) { |
| 358 | case MAX_VAR_SPACE: |
| 359 | case MAX_VAR_PUNCT: |
| 360 | case MAX_VAR_SYMBOL: |
| 361 | case MAX_VAR_CURRENCY: |
| 362 | options = noMax | (value << MAX_VARIABLE_SHIFT); |
| 363 | break; |
| 364 | case UCOL_DEFAULT: |
| 365 | options = noMax | (defaultOptions & MAX_VARIABLE_MASK); |
| 366 | break; |
| 367 | default: |
| 368 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 369 | break; |
| 370 | } |
| 371 | } |
| 372 | |
| 373 | U_NAMESPACE_END |
| 374 | |
| 375 | #endif // !UCONFIG_NO_COLLATION |