Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame^] | 1 | /* |
| 2 | ******************************************************************************* |
| 3 | * Copyright (C) 2013-2014, International Business Machines |
| 4 | * Corporation and others. All Rights Reserved. |
| 5 | ******************************************************************************* |
| 6 | * collationdatawriter.cpp |
| 7 | * |
| 8 | * created on: 2013aug06 |
| 9 | * created by: Markus W. Scherer |
| 10 | */ |
| 11 | |
| 12 | #include "unicode/utypes.h" |
| 13 | |
| 14 | #if !UCONFIG_NO_COLLATION |
| 15 | |
| 16 | #include "unicode/tblcoll.h" |
| 17 | #include "unicode/udata.h" |
| 18 | #include "unicode/uniset.h" |
| 19 | #include "cmemory.h" |
| 20 | #include "collationdata.h" |
| 21 | #include "collationdatabuilder.h" |
| 22 | #include "collationdatareader.h" |
| 23 | #include "collationdatawriter.h" |
| 24 | #include "collationfastlatin.h" |
| 25 | #include "collationsettings.h" |
| 26 | #include "collationtailoring.h" |
| 27 | #include "uassert.h" |
| 28 | #include "ucmndata.h" |
| 29 | |
| 30 | U_NAMESPACE_BEGIN |
| 31 | |
| 32 | uint8_t * |
| 33 | RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const { |
| 34 | if(U_FAILURE(errorCode)) { return NULL; } |
| 35 | LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000)); |
| 36 | if(buffer.isNull()) { |
| 37 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 38 | return NULL; |
| 39 | } |
| 40 | length = cloneBinary(buffer.getAlias(), 20000, errorCode); |
| 41 | if(errorCode == U_BUFFER_OVERFLOW_ERROR) { |
| 42 | if(buffer.allocateInsteadAndCopy(length, 0) == NULL) { |
| 43 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 44 | return NULL; |
| 45 | } |
| 46 | errorCode = U_ZERO_ERROR; |
| 47 | length = cloneBinary(buffer.getAlias(), length, errorCode); |
| 48 | } |
| 49 | if(U_FAILURE(errorCode)) { return NULL; } |
| 50 | return buffer.orphan(); |
| 51 | } |
| 52 | |
| 53 | int32_t |
| 54 | RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const { |
| 55 | int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1]; |
| 56 | return CollationDataWriter::writeTailoring( |
| 57 | *tailoring, *settings, indexes, dest, capacity, |
| 58 | errorCode); |
| 59 | } |
| 60 | |
| 61 | static const UDataInfo dataInfo = { |
| 62 | sizeof(UDataInfo), |
| 63 | 0, |
| 64 | |
| 65 | U_IS_BIG_ENDIAN, |
| 66 | U_CHARSET_FAMILY, |
| 67 | U_SIZEOF_UCHAR, |
| 68 | 0, |
| 69 | |
| 70 | { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol" |
| 71 | { 4, 0, 0, 0 }, // formatVersion |
| 72 | { 6, 3, 0, 0 } // dataVersion |
| 73 | }; |
| 74 | |
| 75 | int32_t |
| 76 | CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings, |
| 77 | const void *rootElements, int32_t rootElementsLength, |
| 78 | int32_t indexes[], uint8_t *dest, int32_t capacity, |
| 79 | UErrorCode &errorCode) { |
| 80 | return write(TRUE, NULL, |
| 81 | data, settings, |
| 82 | rootElements, rootElementsLength, |
| 83 | indexes, dest, capacity, errorCode); |
| 84 | } |
| 85 | |
| 86 | int32_t |
| 87 | CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings, |
| 88 | int32_t indexes[], uint8_t *dest, int32_t capacity, |
| 89 | UErrorCode &errorCode) { |
| 90 | return write(FALSE, t.version, |
| 91 | *t.data, settings, |
| 92 | NULL, 0, |
| 93 | indexes, dest, capacity, errorCode); |
| 94 | } |
| 95 | |
/**
 * Common implementation behind writeBase() and writeTailoring().
 *
 * Computes the byte offset of every data item, stores those offsets in
 * indexes[], and, if everything fits into capacity, writes the indexes and
 * the data items into dest. For a tailoring (isBase is false) a DataHeader
 * is written in front of the indexes.
 *
 * @param isBase             When true, no header is written here
 *                           (headerSize is 0) and the scripts and
 *                           compressible-bytes items are included.
 * @param dataVersion        Copied into the header's dataVersion;
 *                           only read when isBase is false.
 * @param data               The collation data to serialize.
 * @param settings           Supplies the options, the reorder codes and
 *                           the reorder table.
 * @param rootElements       Source bytes for the root elements item.
 * @param rootElementsLength Length of rootElements in 4-byte units.
 * @param indexes            Receives the index values; entries up to
 *                           indexes[IX_TOTAL_SIZE] are assigned, so the
 *                           array presumably needs IX_TOTAL_SIZE + 1
 *                           elements (as cloneBinary() provides).
 * @param dest               Output buffer; may be NULL only if capacity is 0.
 * @param capacity           Size of dest in bytes; must not be negative.
 * @param errorCode          In/out error code. Set to
 *                           U_ILLEGAL_ARGUMENT_ERROR for a bad dest/capacity
 *                           pair, to U_BUFFER_OVERFLOW_ERROR when the data
 *                           does not fit, or to the error of a failed
 *                           serialization step.
 * @return headerSize + totalSize, both on success and when
 *         U_BUFFER_OVERFLOW_ERROR is set; 0 on any other failure.
 */
| 96 | int32_t |
| 97 | CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion, |
| 98 | const CollationData &data, const CollationSettings &settings, |
| 99 | const void *rootElements, int32_t rootElementsLength, |
| 100 | int32_t indexes[], uint8_t *dest, int32_t capacity, |
| 101 | UErrorCode &errorCode) { |
| 102 | if(U_FAILURE(errorCode)) { return 0; } |
| 103 | if(capacity < 0 || (capacity > 0 && dest == NULL)) { |
| 104 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 105 | return 0; |
| 106 | } |
| 107 | |
| 108 | // Figure out which data items to write before settling on |
| 109 | // the indexes length and writing offsets. |
| 110 | // For any data item, we need to write the start and limit offsets, |
| 111 | // so the indexes length must be at least index-of-start-offset + 2. |
| 112 | int32_t indexesLength; |
| 113 | UBool hasMappings; |
| 114 | UnicodeSet unsafeBackwardSet; |
| 115 | const CollationData *baseData = data.base; |
| 116 | |
// The fast Latin version is shifted into the bits above the low 16 so that
// it can be OR-ed with settings.options into indexes[IX_OPTIONS] below.
| 117 | int32_t fastLatinVersion; |
| 118 | if(data.fastLatinTable != NULL) { |
| 119 | fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16; |
| 120 | } else { |
| 121 | fastLatinVersion = 0; |
| 122 | } |
| 123 | int32_t fastLatinTableLength = 0; |
| 124 | |
| 125 | if(isBase) { |
| 126 | // For the root collator, we write an even number of indexes |
| 127 | // so that we start with an 8-aligned offset. |
| 128 | indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1; |
| 129 | U_ASSERT(settings.reorderCodesLength == 0); |
| 130 | hasMappings = TRUE; |
| 131 | unsafeBackwardSet = *data.unsafeBackwardSet; |
| 132 | fastLatinTableLength = data.fastLatinTableLength; |
// Not the base and no base data attached: only settings are written,
// none of the mapping items.
| 133 | } else if(baseData == NULL) { |
| 134 | hasMappings = FALSE; |
| 135 | if(settings.reorderCodesLength == 0) { |
| 136 | // only options |
| 137 | indexesLength = CollationDataReader::IX_OPTIONS + 1; // no limit offset here |
| 138 | } else { |
| 139 | // only options, reorder codes, and the reorder table |
| 140 | indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2; |
| 141 | } |
| 142 | } else { |
| 143 | hasMappings = TRUE; |
| 144 | // Tailored mappings, and what else? |
| 145 | // Check in ascending order of optional tailoring data items. |
| 146 | indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2; |
| 147 | if(data.contextsLength != 0) { |
| 148 | indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2; |
| 149 | } |
// Keep only the code points that are in this data's set but not in the
// base data's set.
| 150 | unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet); |
| 151 | if(!unsafeBackwardSet.isEmpty()) { |
| 152 | indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2; |
| 153 | } |
// Pointer comparison: the fast Latin table is written only when this data
// does not point at the same table as the base data.
| 154 | if(data.fastLatinTable != baseData->fastLatinTable) { |
| 155 | fastLatinTableLength = data.fastLatinTableLength; |
| 156 | indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2; |
| 157 | } |
| 158 | } |
| 159 | |
| 160 | int32_t headerSize; |
| 161 | if(isBase) { |
| 162 | headerSize = 0; // udata_create() writes the header |
| 163 | } else { |
| 164 | DataHeader header; |
| 165 | header.dataHeader.magic1 = 0xda; |
| 166 | header.dataHeader.magic2 = 0x27; |
// Start from the static dataInfo template, then substitute the
// caller-supplied data version.
| 167 | uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo)); |
| 168 | uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo)); |
| 169 | headerSize = (int32_t)sizeof(header); |
| 170 | U_ASSERT((headerSize & 3) == 0); // multiple of 4 bytes |
| 171 | if(hasMappings && data.cesLength != 0) { |
| 172 | // Sum of the sizes of the data items which are |
| 173 | // not automatically multiples of 8 bytes and which are placed before the CEs. |
| 174 | int32_t sum = headerSize + (indexesLength + settings.reorderCodesLength) * 4; |
| 175 | if((sum & 7) != 0) { |
| 176 | // We need to add padding somewhere so that the 64-bit CEs are 8-aligned. |
| 177 | // We add to the header size here. |
| 178 | // Alternatively, we could increment the indexesLength |
| 179 | // or add a few bytes to the reorderTable. |
| 180 | headerSize += 4; |
| 181 | } |
| 182 | } |
| 183 | header.dataHeader.headerSize = (uint16_t)headerSize; |
// If even the header does not fit, continue in measuring-only mode:
// with dest=NULL and capacity=0 nothing further is written, but the
// required size is still computed and returned with an overflow error.
| 184 | if(headerSize <= capacity) { |
| 185 | uprv_memcpy(dest, &header, sizeof(header)); |
| 186 | // Write 00 bytes so that the padding is not mistaken for a copyright string. |
| 187 | uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header)); |
| 188 | dest += headerSize; |
| 189 | capacity -= headerSize; |
| 190 | } else { |
| 191 | dest = NULL; |
| 192 | capacity = 0; |
| 193 | } |
| 194 | } |
| 195 | |
| 196 | indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength; |
| 197 | U_ASSERT((settings.options & ~0xffff) == 0); |
| 198 | indexes[CollationDataReader::IX_OPTIONS] = |
| 199 | data.numericPrimary | fastLatinVersion | settings.options; |
| 200 | indexes[CollationDataReader::IX_RESERVED2] = 0; |
| 201 | indexes[CollationDataReader::IX_RESERVED3] = 0; |
| 202 | |
| 203 | // Byte offsets of data items all start from the start of the indexes. |
| 204 | // We add the headerSize at the very end. |
| 205 | int32_t totalSize = indexesLength * 4; |
| 206 | |
// Store where jamoCE32s starts, as an element offset from the start of
// ce32s, or -1 when there are no mappings or the jamo CE32s pointer is
// the same as the base data's.
| 207 | if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) { |
| 208 | indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s; |
| 209 | } else { |
| 210 | indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1; |
| 211 | } |
| 212 | |
// From here on, each item's start offset is recorded and totalSize is then
// advanced by the item's size in bytes; items that are not written
// contribute zero bytes.
| 213 | indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize; |
| 214 | totalSize += settings.reorderCodesLength * 4; |
| 215 | |
| 216 | indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize; |
| 217 | if(settings.reorderTable != NULL) { |
| 218 | totalSize += 256; |
| 219 | } |
| 220 | |
| 221 | indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize; |
| 222 | if(hasMappings) { |
// The trie is serialized straight into dest if there is any room left,
// otherwise it is only measured. A local error code is used so that a
// buffer overflow at this step does not abort the size computation;
// the final totalSize > capacity check reports it.
| 223 | UErrorCode errorCode2 = U_ZERO_ERROR; |
| 224 | int32_t length; |
| 225 | if(totalSize < capacity) { |
| 226 | length = utrie2_serialize(data.trie, dest + totalSize, |
| 227 | capacity - totalSize, &errorCode2); |
| 228 | } else { |
| 229 | length = utrie2_serialize(data.trie, NULL, 0, &errorCode2); |
| 230 | } |
| 231 | if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { |
| 232 | errorCode = errorCode2; |
| 233 | return 0; |
| 234 | } |
| 235 | // The trie size should be a multiple of 8 bytes due to the way |
| 236 | // compactIndex2(UNewTrie2 *trie) currently works. |
| 237 | U_ASSERT((length & 7) == 0); |
| 238 | totalSize += length; |
| 239 | } |
| 240 | |
// The reserved slot receives the same offset as IX_CES_OFFSET.
| 241 | indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize; |
| 242 | indexes[CollationDataReader::IX_CES_OFFSET] = totalSize; |
| 243 | if(hasMappings && data.cesLength != 0) { |
| 244 | U_ASSERT(((headerSize + totalSize) & 7) == 0); |
| 245 | totalSize += data.cesLength * 8; |
| 246 | } |
| 247 | |
| 248 | indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize; |
| 249 | indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize; |
| 250 | if(hasMappings) { |
| 251 | totalSize += data.ce32sLength * 4; |
| 252 | } |
| 253 | |
| 254 | indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize; |
| 255 | totalSize += rootElementsLength * 4; |
| 256 | |
| 257 | indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize; |
| 258 | if(hasMappings) { |
| 259 | totalSize += data.contextsLength * 2; |
| 260 | } |
| 261 | |
| 262 | indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize; |
| 263 | if(hasMappings && !unsafeBackwardSet.isEmpty()) { |
// Same write-or-measure pattern as for the trie. The set is serialized
// as 16-bit units, hence the division and multiplication by 2.
| 264 | UErrorCode errorCode2 = U_ZERO_ERROR; |
| 265 | int32_t length; |
| 266 | if(totalSize < capacity) { |
| 267 | uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize); |
| 268 | length = unsafeBackwardSet.serialize( |
| 269 | p, (capacity - totalSize) / 2, errorCode2); |
| 270 | } else { |
| 271 | length = unsafeBackwardSet.serialize(NULL, 0, errorCode2); |
| 272 | } |
| 273 | if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { |
| 274 | errorCode = errorCode2; |
| 275 | return 0; |
| 276 | } |
| 277 | totalSize += length * 2; |
| 278 | } |
| 279 | |
| 280 | indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize; |
| 281 | totalSize += fastLatinTableLength * 2; |
| 282 | |
| 283 | indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize; |
| 284 | if(isBase) { |
| 285 | totalSize += data.scriptsLength * 2; |
| 286 | } |
| 287 | |
| 288 | indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize; |
| 289 | if(isBase) { |
| 290 | totalSize += 256; |
| 291 | } |
| 292 | |
| 293 | indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize; |
| 294 | indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize; |
| 295 | |
// Not enough room: report the full size needed (header included) so that
// the caller can retry with a buffer of that size.
| 296 | if(totalSize > capacity) { |
| 297 | errorCode = U_BUFFER_OVERFLOW_ERROR; |
| 298 | return headerSize + totalSize; |
| 299 | } |
| 300 | |
// Everything fits. Write the first indexesLength indexes, then copy each
// remaining item to the offset recorded for it above.
| 301 | uprv_memcpy(dest, indexes, indexesLength * 4); |
| 302 | copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, settings.reorderCodes, dest); |
| 303 | copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest); |
| 304 | // The trie has already been serialized into the dest buffer. |
| 305 | copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest); |
| 306 | copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest); |
| 307 | copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest); |
| 308 | copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest); |
| 309 | // The unsafeBackwardSet has already been serialized into the dest buffer. |
| 310 | copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest); |
| 311 | copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, data.scripts, dest); |
| 312 | copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest); |
| 313 | |
| 314 | return headerSize + totalSize; |
| 315 | } |
| 316 | |
| 317 | void |
| 318 | CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex, |
| 319 | const void *src, uint8_t *dest) { |
| 320 | int32_t start = indexes[startIndex]; |
| 321 | int32_t limit = indexes[startIndex + 1]; |
| 322 | if(start < limit) { |
| 323 | uprv_memcpy(dest + start, src, limit - start); |
| 324 | } |
| 325 | } |
| 326 | |
| 327 | U_NAMESPACE_END |
| 328 | |
| 329 | #endif // !UCONFIG_NO_COLLATION |