/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationdatareader.cpp
*
* created on: 2013feb07
* created by: Markus W. Scherer
*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/ucol.h"
17#include "unicode/udata.h"
18#include "unicode/uscript.h"
19#include "cmemory.h"
20#include "collation.h"
21#include "collationdata.h"
22#include "collationdatareader.h"
23#include "collationfastlatin.h"
24#include "collationkeys.h"
25#include "collationrootelements.h"
26#include "collationsettings.h"
27#include "collationtailoring.h"
28#include "normalizer2impl.h"
29#include "uassert.h"
30#include "ucmndata.h"
31#include "utrie2.h"
32
33U_NAMESPACE_BEGIN
34
namespace {

/**
 * Bounds-checked read of one slot of an indexes array.
 *
 * @param indexes the array to read from
 * @param length  number of valid slots in indexes
 * @param i       slot to read
 * @return indexes[i] if 0 <= i < length, otherwise -1.
 *         Callers treat -1 as "this slot is not present".
 */
int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
    // Reject negative slots as well as slots past the end,
    // so that a bad slot number can never read outside the array.
    return (0 <= i && i < length) ? indexes[i] : -1;
}

}  // namespace
42
43void
44CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
45 CollationTailoring &tailoring, UErrorCode &errorCode) {
46 if(U_FAILURE(errorCode)) { return; }
47 if(base != NULL) {
48 if(inBytes == NULL || (0 <= inLength && inLength < 24)) {
49 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
50 return;
51 }
52 const DataHeader *header = reinterpret_cast<const DataHeader *>(inBytes);
53 if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0x27 &&
54 isAcceptable(tailoring.version, NULL, NULL, &header->info))) {
55 errorCode = U_INVALID_FORMAT_ERROR;
56 return;
57 }
58 if(base->getUCAVersion() != tailoring.getUCAVersion()) {
59 errorCode = U_COLLATOR_VERSION_MISMATCH;
60 return;
61 }
62 int32_t headerLength = header->dataHeader.headerSize;
63 inBytes += headerLength;
64 if(inLength >= 0) {
65 inLength -= headerLength;
66 }
67 }
68
69 if(inBytes == NULL || (0 <= inLength && inLength < 8)) {
70 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
71 return;
72 }
73 const int32_t *inIndexes = reinterpret_cast<const int32_t *>(inBytes);
74 int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH];
75 if(indexesLength < 2 || (0 <= inLength && inLength < indexesLength * 4)) {
76 errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes.
77 return;
78 }
79
80 // Assume that the tailoring data is in initial state,
81 // with NULL pointers and 0 lengths.
82
83 // Set pointers to non-empty data parts.
84 // Do this in order of their byte offsets. (Should help porting to Java.)
85
86 int32_t index; // one of the indexes[] slots
87 int32_t offset; // byte offset for the index part
88 int32_t length; // number of bytes in the index part
89
90 if(indexesLength > IX_TOTAL_SIZE) {
91 length = inIndexes[IX_TOTAL_SIZE];
92 } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
93 length = inIndexes[indexesLength - 1];
94 } else {
95 length = 0; // only indexes, and inLength was already checked for them
96 }
97 if(0 <= inLength && inLength < length) {
98 errorCode = U_INVALID_FORMAT_ERROR;
99 return;
100 }
101
102 const CollationData *baseData = base == NULL ? NULL : base->data;
103 const int32_t *reorderCodes = NULL;
104 int32_t reorderCodesLength = 0;
105 index = IX_REORDER_CODES_OFFSET;
106 offset = getIndex(inIndexes, indexesLength, index);
107 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
108 if(length >= 4) {
109 if(baseData == NULL) {
110 // We assume for collation settings that
111 // the base data does not have a reordering.
112 errorCode = U_INVALID_FORMAT_ERROR;
113 return;
114 }
115 reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
116 reorderCodesLength = length / 4;
117 }
118
119 // There should be a reorder table only if there are reorder codes.
120 // However, when there are reorder codes the reorder table may be omitted to reduce
121 // the data size.
122 const uint8_t *reorderTable = NULL;
123 index = IX_REORDER_TABLE_OFFSET;
124 offset = getIndex(inIndexes, indexesLength, index);
125 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
126 if(length >= 256) {
127 if(reorderCodesLength == 0) {
128 errorCode = U_INVALID_FORMAT_ERROR; // Reordering table without reordering codes.
129 return;
130 }
131 reorderTable = inBytes + offset;
132 } else {
133 // If we have reorder codes, then build the reorderTable at the end,
134 // when the CollationData is otherwise complete.
135 }
136
137 if(baseData != NULL && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000)) {
138 errorCode = U_INVALID_FORMAT_ERROR;
139 return;
140 }
141 CollationData *data = NULL; // Remains NULL if there are no mappings.
142
143 index = IX_TRIE_OFFSET;
144 offset = getIndex(inIndexes, indexesLength, index);
145 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
146 if(length >= 8) {
147 if(!tailoring.ensureOwnedData(errorCode)) { return; }
148 data = tailoring.ownedData;
149 data->base = baseData;
150 data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000;
151 data->trie = tailoring.trie = utrie2_openFromSerialized(
152 UTRIE2_32_VALUE_BITS, inBytes + offset, length, NULL,
153 &errorCode);
154 if(U_FAILURE(errorCode)) { return; }
155 } else if(baseData != NULL) {
156 // Use the base data. Only the settings are tailored.
157 tailoring.data = baseData;
158 } else {
159 errorCode = U_INVALID_FORMAT_ERROR; // No mappings.
160 return;
161 }
162
163 index = IX_CES_OFFSET;
164 offset = getIndex(inIndexes, indexesLength, index);
165 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
166 if(length >= 8) {
167 if(data == NULL) {
168 errorCode = U_INVALID_FORMAT_ERROR; // Tailored ces without tailored trie.
169 return;
170 }
171 data->ces = reinterpret_cast<const int64_t *>(inBytes + offset);
172 data->cesLength = length / 8;
173 }
174
175 index = IX_CE32S_OFFSET;
176 offset = getIndex(inIndexes, indexesLength, index);
177 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
178 if(length >= 4) {
179 if(data == NULL) {
180 errorCode = U_INVALID_FORMAT_ERROR; // Tailored ce32s without tailored trie.
181 return;
182 }
183 data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset);
184 data->ce32sLength = length / 4;
185 }
186
187 int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_START);
188 if(jamoCE32sStart >= 0) {
189 if(data == NULL || data->ce32s == NULL) {
190 errorCode = U_INVALID_FORMAT_ERROR; // Index into non-existent ce32s[].
191 return;
192 }
193 data->jamoCE32s = data->ce32s + jamoCE32sStart;
194 } else if(data == NULL) {
195 // Nothing to do.
196 } else if(baseData != NULL) {
197 data->jamoCE32s = baseData->jamoCE32s;
198 } else {
199 errorCode = U_INVALID_FORMAT_ERROR; // No Jamo CE32s for Hangul processing.
200 return;
201 }
202
203 index = IX_ROOT_ELEMENTS_OFFSET;
204 offset = getIndex(inIndexes, indexesLength, index);
205 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
206 if(length >= 4) {
207 length /= 4;
208 if(data == NULL || length <= CollationRootElements::IX_SEC_TER_BOUNDARIES) {
209 errorCode = U_INVALID_FORMAT_ERROR;
210 return;
211 }
212 data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset);
213 data->rootElementsLength = length;
214 uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COMMON_SEC_AND_TER_CE];
215 if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) {
216 errorCode = U_INVALID_FORMAT_ERROR;
217 return;
218 }
219 uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX_SEC_TER_BOUNDARIES];
220 if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) {
221 // [fixed last secondary common byte] is too low,
222 // and secondary weights would collide with compressed common secondaries.
223 errorCode = U_INVALID_FORMAT_ERROR;
224 return;
225 }
226 }
227
228 index = IX_CONTEXTS_OFFSET;
229 offset = getIndex(inIndexes, indexesLength, index);
230 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
231 if(length >= 2) {
232 if(data == NULL) {
233 errorCode = U_INVALID_FORMAT_ERROR; // Tailored contexts without tailored trie.
234 return;
235 }
236 data->contexts = reinterpret_cast<const UChar *>(inBytes + offset);
237 data->contextsLength = length / 2;
238 }
239
240 index = IX_UNSAFE_BWD_OFFSET;
241 offset = getIndex(inIndexes, indexesLength, index);
242 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
243 if(length >= 2) {
244 if(data == NULL) {
245 errorCode = U_INVALID_FORMAT_ERROR;
246 return;
247 }
248 if(baseData == NULL) {
249 // Create the unsafe-backward set for the root collator.
250 // Include all non-zero combining marks and trail surrogates.
251 // We do this at load time, rather than at build time,
252 // to simplify Unicode version bootstrapping:
253 // The root data builder only needs the new FractionalUCA.txt data,
254 // but it need not be built with a version of ICU already updated to
255 // the corresponding new Unicode Character Database.
256 //
257 // The following is an optimized version of
258 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
259 // It is faster and requires fewer code dependencies.
260 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // trail surrogates
261 if(tailoring.unsafeBackwardSet == NULL) {
262 errorCode = U_MEMORY_ALLOCATION_ERROR;
263 return;
264 }
265 data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet);
266 } else {
267 // Clone the root collator's set contents.
268 tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>(
269 baseData->unsafeBackwardSet->cloneAsThawed());
270 if(tailoring.unsafeBackwardSet == NULL) {
271 errorCode = U_MEMORY_ALLOCATION_ERROR;
272 return;
273 }
274 }
275 // Add the ranges from the data file to the unsafe-backward set.
276 USerializedSet sset;
277 const uint16_t *unsafeData = reinterpret_cast<const uint16_t *>(inBytes + offset);
278 if(!uset_getSerializedSet(&sset, unsafeData, length / 2)) {
279 errorCode = U_INVALID_FORMAT_ERROR;
280 return;
281 }
282 int32_t count = uset_getSerializedRangeCount(&sset);
283 for(int32_t i = 0; i < count; ++i) {
284 UChar32 start, end;
285 uset_getSerializedRange(&sset, i, &start, &end);
286 tailoring.unsafeBackwardSet->add(start, end);
287 }
288 // Mark each lead surrogate as "unsafe"
289 // if any of its 1024 associated supplementary code points is "unsafe".
290 UChar32 c = 0x10000;
291 for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
292 if(!tailoring.unsafeBackwardSet->containsNone(c, c + 0x3ff)) {
293 tailoring.unsafeBackwardSet->add(lead);
294 }
295 }
296 tailoring.unsafeBackwardSet->freeze();
297 data->unsafeBackwardSet = tailoring.unsafeBackwardSet;
298 } else if(data == NULL) {
299 // Nothing to do.
300 } else if(baseData != NULL) {
301 // No tailoring-specific data: Alias the root collator's set.
302 data->unsafeBackwardSet = baseData->unsafeBackwardSet;
303 } else {
304 errorCode = U_INVALID_FORMAT_ERROR; // No unsafeBackwardSet.
305 return;
306 }
307
308 // If the fast Latin format version is different,
309 // or the version is set to 0 for "no fast Latin table",
310 // then just always use the normal string comparison path.
311 if(data != NULL) {
312 data->fastLatinTable = NULL;
313 data->fastLatinTableLength = 0;
314 if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin::VERSION) {
315 index = IX_FAST_LATIN_TABLE_OFFSET;
316 offset = getIndex(inIndexes, indexesLength, index);
317 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
318 if(length >= 2) {
319 data->fastLatinTable = reinterpret_cast<const uint16_t *>(inBytes + offset);
320 data->fastLatinTableLength = length / 2;
321 if((*data->fastLatinTable >> 8) != CollationFastLatin::VERSION) {
322 errorCode = U_INVALID_FORMAT_ERROR; // header vs. table version mismatch
323 return;
324 }
325 } else if(baseData != NULL) {
326 data->fastLatinTable = baseData->fastLatinTable;
327 data->fastLatinTableLength = baseData->fastLatinTableLength;
328 }
329 }
330 }
331
332 index = IX_SCRIPTS_OFFSET;
333 offset = getIndex(inIndexes, indexesLength, index);
334 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
335 if(length >= 2) {
336 if(data == NULL) {
337 errorCode = U_INVALID_FORMAT_ERROR;
338 return;
339 }
340 data->scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
341 data->scriptsLength = length / 2;
342 } else if(data == NULL) {
343 // Nothing to do.
344 } else if(baseData != NULL) {
345 data->scripts = baseData->scripts;
346 data->scriptsLength = baseData->scriptsLength;
347 }
348
349 index = IX_COMPRESSIBLE_BYTES_OFFSET;
350 offset = getIndex(inIndexes, indexesLength, index);
351 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
352 if(length >= 256) {
353 if(data == NULL) {
354 errorCode = U_INVALID_FORMAT_ERROR;
355 return;
356 }
357 data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offset);
358 } else if(data == NULL) {
359 // Nothing to do.
360 } else if(baseData != NULL) {
361 data->compressibleBytes = baseData->compressibleBytes;
362 } else {
363 errorCode = U_INVALID_FORMAT_ERROR; // No compressibleBytes[].
364 return;
365 }
366
367 const CollationSettings &ts = *tailoring.settings;
368 int32_t options = inIndexes[IX_OPTIONS] & 0xffff;
369 uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];
370 int32_t fastLatinOptions = CollationFastLatin::getOptions(
371 tailoring.data, ts, fastLatinPrimaries, UPRV_LENGTHOF(fastLatinPrimaries));
372 if(options == ts.options && ts.variableTop != 0 &&
373 reorderCodesLength == ts.reorderCodesLength &&
374 uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) == 0 &&
375 fastLatinOptions == ts.fastLatinOptions &&
376 (fastLatinOptions < 0 ||
377 uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,
378 sizeof(fastLatinPrimaries)) == 0)) {
379 return;
380 }
381
382 CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings);
383 if(settings == NULL) {
384 errorCode = U_MEMORY_ALLOCATION_ERROR;
385 return;
386 }
387 settings->options = options;
388 // Set variableTop from options and scripts data.
389 settings->variableTop = tailoring.data->getLastPrimaryForGroup(
390 UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
391 if(settings->variableTop == 0) {
392 errorCode = U_INVALID_FORMAT_ERROR;
393 return;
394 }
395
396 if(reorderCodesLength == 0 || reorderTable != NULL) {
397 settings->aliasReordering(reorderCodes, reorderCodesLength, reorderTable);
398 } else {
399 uint8_t table[256];
400 baseData->makeReorderTable(reorderCodes, reorderCodesLength, table, errorCode);
401 if(U_FAILURE(errorCode)) { return; }
402 if(!settings->setReordering(reorderCodes, reorderCodesLength,table)) {
403 errorCode = U_MEMORY_ALLOCATION_ERROR;
404 return;
405 }
406 }
407
408 settings->fastLatinOptions = CollationFastLatin::getOptions(
409 tailoring.data, *settings,
410 settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries));
411}
412
413UBool U_CALLCONV
414CollationDataReader::isAcceptable(void *context,
415 const char * /* type */, const char * /*name*/,
416 const UDataInfo *pInfo) {
417 if(
418 pInfo->size >= 20 &&
419 pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
420 pInfo->charsetFamily == U_CHARSET_FAMILY &&
421 pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol"
422 pInfo->dataFormat[1] == 0x43 &&
423 pInfo->dataFormat[2] == 0x6f &&
424 pInfo->dataFormat[3] == 0x6c &&
425 pInfo->formatVersion[0] == 4
426 ) {
427 UVersionInfo *version = static_cast<UVersionInfo *>(context);
428 if(version != NULL) {
429 uprv_memcpy(version, pInfo->dataVersion, 4);
430 }
431 return TRUE;
432 } else {
433 return FALSE;
434 }
435}
436
437U_NAMESPACE_END
438
439#endif // !UCONFIG_NO_COLLATION