/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationdatawriter.cpp
*
* created on: 2013aug06
* created by: Markus W. Scherer
*/
| 11 | |
| 12 | #include "unicode/utypes.h" |
| 13 | |
| 14 | #if !UCONFIG_NO_COLLATION |
| 15 | |
| 16 | #include "unicode/tblcoll.h" |
| 17 | #include "unicode/udata.h" |
| 18 | #include "unicode/uniset.h" |
| 19 | #include "cmemory.h" |
| 20 | #include "collationdata.h" |
| 21 | #include "collationdatabuilder.h" |
| 22 | #include "collationdatareader.h" |
| 23 | #include "collationdatawriter.h" |
| 24 | #include "collationfastlatin.h" |
| 25 | #include "collationsettings.h" |
| 26 | #include "collationtailoring.h" |
| 27 | #include "uassert.h" |
| 28 | #include "ucmndata.h" |
| 29 | |
| 30 | U_NAMESPACE_BEGIN |
| 31 | |
| 32 | uint8_t * |
| 33 | RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const { |
| 34 | if(U_FAILURE(errorCode)) { return NULL; } |
| 35 | LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000)); |
| 36 | if(buffer.isNull()) { |
| 37 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 38 | return NULL; |
| 39 | } |
| 40 | length = cloneBinary(buffer.getAlias(), 20000, errorCode); |
| 41 | if(errorCode == U_BUFFER_OVERFLOW_ERROR) { |
| 42 | if(buffer.allocateInsteadAndCopy(length, 0) == NULL) { |
| 43 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 44 | return NULL; |
| 45 | } |
| 46 | errorCode = U_ZERO_ERROR; |
| 47 | length = cloneBinary(buffer.getAlias(), length, errorCode); |
| 48 | } |
| 49 | if(U_FAILURE(errorCode)) { return NULL; } |
| 50 | return buffer.orphan(); |
| 51 | } |
| 52 | |
| 53 | int32_t |
| 54 | RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const { |
| 55 | int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1]; |
| 56 | return CollationDataWriter::writeTailoring( |
| 57 | *tailoring, *settings, indexes, dest, capacity, |
| 58 | errorCode); |
| 59 | } |
| 60 | |
| 61 | static const UDataInfo dataInfo = { |
| 62 | sizeof(UDataInfo), |
| 63 | 0, |
| 64 | |
| 65 | U_IS_BIG_ENDIAN, |
| 66 | U_CHARSET_FAMILY, |
| 67 | U_SIZEOF_UCHAR, |
| 68 | 0, |
| 69 | |
| 70 | { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol" |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 71 | { 5, 0, 0, 0 }, // formatVersion |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 72 | { 6, 3, 0, 0 } // dataVersion |
| 73 | }; |
| 74 | |
| 75 | int32_t |
| 76 | CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings, |
| 77 | const void *rootElements, int32_t rootElementsLength, |
| 78 | int32_t indexes[], uint8_t *dest, int32_t capacity, |
| 79 | UErrorCode &errorCode) { |
| 80 | return write(TRUE, NULL, |
| 81 | data, settings, |
| 82 | rootElements, rootElementsLength, |
| 83 | indexes, dest, capacity, errorCode); |
| 84 | } |
| 85 | |
| 86 | int32_t |
| 87 | CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings, |
| 88 | int32_t indexes[], uint8_t *dest, int32_t capacity, |
| 89 | UErrorCode &errorCode) { |
| 90 | return write(FALSE, t.version, |
| 91 | *t.data, settings, |
| 92 | NULL, 0, |
| 93 | indexes, dest, capacity, errorCode); |
| 94 | } |
| 95 | |
| 96 | int32_t |
| 97 | CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion, |
| 98 | const CollationData &data, const CollationSettings &settings, |
| 99 | const void *rootElements, int32_t rootElementsLength, |
| 100 | int32_t indexes[], uint8_t *dest, int32_t capacity, |
| 101 | UErrorCode &errorCode) { |
| 102 | if(U_FAILURE(errorCode)) { return 0; } |
| 103 | if(capacity < 0 || (capacity > 0 && dest == NULL)) { |
| 104 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 105 | return 0; |
| 106 | } |
| 107 | |
| 108 | // Figure out which data items to write before settling on |
| 109 | // the indexes length and writing offsets. |
| 110 | // For any data item, we need to write the start and limit offsets, |
| 111 | // so the indexes length must be at least index-of-start-offset + 2. |
| 112 | int32_t indexesLength; |
| 113 | UBool hasMappings; |
| 114 | UnicodeSet unsafeBackwardSet; |
| 115 | const CollationData *baseData = data.base; |
| 116 | |
| 117 | int32_t fastLatinVersion; |
| 118 | if(data.fastLatinTable != NULL) { |
| 119 | fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16; |
| 120 | } else { |
| 121 | fastLatinVersion = 0; |
| 122 | } |
| 123 | int32_t fastLatinTableLength = 0; |
| 124 | |
| 125 | if(isBase) { |
| 126 | // For the root collator, we write an even number of indexes |
| 127 | // so that we start with an 8-aligned offset. |
| 128 | indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1; |
| 129 | U_ASSERT(settings.reorderCodesLength == 0); |
| 130 | hasMappings = TRUE; |
| 131 | unsafeBackwardSet = *data.unsafeBackwardSet; |
| 132 | fastLatinTableLength = data.fastLatinTableLength; |
| 133 | } else if(baseData == NULL) { |
| 134 | hasMappings = FALSE; |
| 135 | if(settings.reorderCodesLength == 0) { |
| 136 | // only options |
| 137 | indexesLength = CollationDataReader::IX_OPTIONS + 1; // no limit offset here |
| 138 | } else { |
| 139 | // only options, reorder codes, and the reorder table |
| 140 | indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2; |
| 141 | } |
| 142 | } else { |
| 143 | hasMappings = TRUE; |
| 144 | // Tailored mappings, and what else? |
| 145 | // Check in ascending order of optional tailoring data items. |
| 146 | indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2; |
| 147 | if(data.contextsLength != 0) { |
| 148 | indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2; |
| 149 | } |
| 150 | unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet); |
| 151 | if(!unsafeBackwardSet.isEmpty()) { |
| 152 | indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2; |
| 153 | } |
| 154 | if(data.fastLatinTable != baseData->fastLatinTable) { |
| 155 | fastLatinTableLength = data.fastLatinTableLength; |
| 156 | indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2; |
| 157 | } |
| 158 | } |
| 159 | |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 160 | UVector32 codesAndRanges(errorCode); |
| 161 | const int32_t *reorderCodes = settings.reorderCodes; |
| 162 | int32_t reorderCodesLength = settings.reorderCodesLength; |
| 163 | if(settings.hasReordering() && |
| 164 | CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) { |
| 165 | // Rebuild the full list of reorder ranges. |
| 166 | // The list in the settings is truncated for efficiency. |
| 167 | data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode); |
| 168 | // Write the codes, then the ranges. |
| 169 | for(int32_t i = 0; i < reorderCodesLength; ++i) { |
| 170 | codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode); |
| 171 | } |
| 172 | if(U_FAILURE(errorCode)) { return 0; } |
| 173 | reorderCodes = codesAndRanges.getBuffer(); |
| 174 | reorderCodesLength = codesAndRanges.size(); |
| 175 | } |
| 176 | |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 177 | int32_t headerSize; |
| 178 | if(isBase) { |
| 179 | headerSize = 0; // udata_create() writes the header |
| 180 | } else { |
| 181 | DataHeader header; |
| 182 | header.dataHeader.magic1 = 0xda; |
| 183 | header.dataHeader.magic2 = 0x27; |
| 184 | uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo)); |
| 185 | uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo)); |
| 186 | headerSize = (int32_t)sizeof(header); |
| 187 | U_ASSERT((headerSize & 3) == 0); // multiple of 4 bytes |
| 188 | if(hasMappings && data.cesLength != 0) { |
| 189 | // Sum of the sizes of the data items which are |
| 190 | // not automatically multiples of 8 bytes and which are placed before the CEs. |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 191 | int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4; |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 192 | if((sum & 7) != 0) { |
| 193 | // We need to add padding somewhere so that the 64-bit CEs are 8-aligned. |
| 194 | // We add to the header size here. |
| 195 | // Alternatively, we could increment the indexesLength |
| 196 | // or add a few bytes to the reorderTable. |
| 197 | headerSize += 4; |
| 198 | } |
| 199 | } |
| 200 | header.dataHeader.headerSize = (uint16_t)headerSize; |
| 201 | if(headerSize <= capacity) { |
| 202 | uprv_memcpy(dest, &header, sizeof(header)); |
| 203 | // Write 00 bytes so that the padding is not mistaken for a copyright string. |
| 204 | uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header)); |
| 205 | dest += headerSize; |
| 206 | capacity -= headerSize; |
| 207 | } else { |
| 208 | dest = NULL; |
| 209 | capacity = 0; |
| 210 | } |
| 211 | } |
| 212 | |
| 213 | indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength; |
| 214 | U_ASSERT((settings.options & ~0xffff) == 0); |
| 215 | indexes[CollationDataReader::IX_OPTIONS] = |
| 216 | data.numericPrimary | fastLatinVersion | settings.options; |
| 217 | indexes[CollationDataReader::IX_RESERVED2] = 0; |
| 218 | indexes[CollationDataReader::IX_RESERVED3] = 0; |
| 219 | |
| 220 | // Byte offsets of data items all start from the start of the indexes. |
| 221 | // We add the headerSize at the very end. |
| 222 | int32_t totalSize = indexesLength * 4; |
| 223 | |
| 224 | if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) { |
| 225 | indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s; |
| 226 | } else { |
| 227 | indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1; |
| 228 | } |
| 229 | |
| 230 | indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize; |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 231 | totalSize += reorderCodesLength * 4; |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 232 | |
| 233 | indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize; |
| 234 | if(settings.reorderTable != NULL) { |
| 235 | totalSize += 256; |
| 236 | } |
| 237 | |
| 238 | indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize; |
| 239 | if(hasMappings) { |
| 240 | UErrorCode errorCode2 = U_ZERO_ERROR; |
| 241 | int32_t length; |
| 242 | if(totalSize < capacity) { |
| 243 | length = utrie2_serialize(data.trie, dest + totalSize, |
| 244 | capacity - totalSize, &errorCode2); |
| 245 | } else { |
| 246 | length = utrie2_serialize(data.trie, NULL, 0, &errorCode2); |
| 247 | } |
| 248 | if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { |
| 249 | errorCode = errorCode2; |
| 250 | return 0; |
| 251 | } |
| 252 | // The trie size should be a multiple of 8 bytes due to the way |
| 253 | // compactIndex2(UNewTrie2 *trie) currently works. |
| 254 | U_ASSERT((length & 7) == 0); |
| 255 | totalSize += length; |
| 256 | } |
| 257 | |
| 258 | indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize; |
| 259 | indexes[CollationDataReader::IX_CES_OFFSET] = totalSize; |
| 260 | if(hasMappings && data.cesLength != 0) { |
| 261 | U_ASSERT(((headerSize + totalSize) & 7) == 0); |
| 262 | totalSize += data.cesLength * 8; |
| 263 | } |
| 264 | |
| 265 | indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize; |
| 266 | indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize; |
| 267 | if(hasMappings) { |
| 268 | totalSize += data.ce32sLength * 4; |
| 269 | } |
| 270 | |
| 271 | indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize; |
| 272 | totalSize += rootElementsLength * 4; |
| 273 | |
| 274 | indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize; |
| 275 | if(hasMappings) { |
| 276 | totalSize += data.contextsLength * 2; |
| 277 | } |
| 278 | |
| 279 | indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize; |
| 280 | if(hasMappings && !unsafeBackwardSet.isEmpty()) { |
| 281 | UErrorCode errorCode2 = U_ZERO_ERROR; |
| 282 | int32_t length; |
| 283 | if(totalSize < capacity) { |
| 284 | uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize); |
| 285 | length = unsafeBackwardSet.serialize( |
| 286 | p, (capacity - totalSize) / 2, errorCode2); |
| 287 | } else { |
| 288 | length = unsafeBackwardSet.serialize(NULL, 0, errorCode2); |
| 289 | } |
| 290 | if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { |
| 291 | errorCode = errorCode2; |
| 292 | return 0; |
| 293 | } |
| 294 | totalSize += length * 2; |
| 295 | } |
| 296 | |
| 297 | indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize; |
| 298 | totalSize += fastLatinTableLength * 2; |
| 299 | |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 300 | UnicodeString scripts; |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 301 | indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize; |
| 302 | if(isBase) { |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 303 | scripts.append((UChar)data.numScripts); |
| 304 | scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16); |
| 305 | scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength); |
| 306 | totalSize += scripts.length() * 2; |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 307 | } |
| 308 | |
| 309 | indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize; |
| 310 | if(isBase) { |
| 311 | totalSize += 256; |
| 312 | } |
| 313 | |
| 314 | indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize; |
| 315 | indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize; |
| 316 | |
| 317 | if(totalSize > capacity) { |
| 318 | errorCode = U_BUFFER_OVERFLOW_ERROR; |
| 319 | return headerSize + totalSize; |
| 320 | } |
| 321 | |
| 322 | uprv_memcpy(dest, indexes, indexesLength * 4); |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 323 | copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest); |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 324 | copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest); |
| 325 | // The trie has already been serialized into the dest buffer. |
| 326 | copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest); |
| 327 | copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest); |
| 328 | copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest); |
| 329 | copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest); |
| 330 | // The unsafeBackwardSet has already been serialized into the dest buffer. |
| 331 | copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest); |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame^] | 332 | copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest); |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 333 | copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest); |
| 334 | |
| 335 | return headerSize + totalSize; |
| 336 | } |
| 337 | |
| 338 | void |
| 339 | CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex, |
| 340 | const void *src, uint8_t *dest) { |
| 341 | int32_t start = indexes[startIndex]; |
| 342 | int32_t limit = indexes[startIndex + 1]; |
| 343 | if(start < limit) { |
| 344 | uprv_memcpy(dest + start, src, limit - start); |
| 345 | } |
| 346 | } |
| 347 | |
| 348 | U_NAMESPACE_END |
| 349 | |
| 350 | #endif // !UCONFIG_NO_COLLATION |