Blame - source/i18n/collationdatawriter.cpp - chromium.googlesource.com/chromium/deps/icu

blob: dc600aff524de4d8ba173abc239e3ac65a5c4f48 [file] [log] [blame]

Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	1	/*
				2	*******************************************************************************
				3	* Copyright (C) 2013-2014, International Business Machines
				4	* Corporation and others. All Rights Reserved.
				5	*******************************************************************************
				6	* collationdatawriter.cpp
				7	*
				8	* created on: 2013aug06
				9	* created by: Markus W. Scherer
				10	*/
				11
				12	#include "unicode/utypes.h"
				13
				14	#if !UCONFIG_NO_COLLATION
				15
				16	#include "unicode/tblcoll.h"
				17	#include "unicode/udata.h"
				18	#include "unicode/uniset.h"
				19	#include "cmemory.h"
				20	#include "collationdata.h"
				21	#include "collationdatabuilder.h"
				22	#include "collationdatareader.h"
				23	#include "collationdatawriter.h"
				24	#include "collationfastlatin.h"
				25	#include "collationsettings.h"
				26	#include "collationtailoring.h"
				27	#include "uassert.h"
				28	#include "ucmndata.h"
				29
				30	U_NAMESPACE_BEGIN
				31
				32	uint8_t *
				33	RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
				34	if(U_FAILURE(errorCode)) { return NULL; }
				35	LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000));
				36	if(buffer.isNull()) {
				37	errorCode = U_MEMORY_ALLOCATION_ERROR;
				38	return NULL;
				39	}
				40	length = cloneBinary(buffer.getAlias(), 20000, errorCode);
				41	if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
				42	if(buffer.allocateInsteadAndCopy(length, 0) == NULL) {
				43	errorCode = U_MEMORY_ALLOCATION_ERROR;
				44	return NULL;
				45	}
				46	errorCode = U_ZERO_ERROR;
				47	length = cloneBinary(buffer.getAlias(), length, errorCode);
				48	}
				49	if(U_FAILURE(errorCode)) { return NULL; }
				50	return buffer.orphan();
				51	}
				52
				53	int32_t
				54	RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const {
				55	int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
				56	return CollationDataWriter::writeTailoring(
				57	tailoring, settings, indexes, dest, capacity,
				58	errorCode);
				59	}
				60
				61	static const UDataInfo dataInfo = {
				62	sizeof(UDataInfo),
				63	0,
				64
				65	U_IS_BIG_ENDIAN,
				66	U_CHARSET_FAMILY,
				67	U_SIZEOF_UCHAR,
				68	0,
				69
				70	{ 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol"
				71	{ 4, 0, 0, 0 }, // formatVersion
				72	{ 6, 3, 0, 0 } // dataVersion
				73	};
				74
				75	int32_t
				76	CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
				77	const void *rootElements, int32_t rootElementsLength,
				78	int32_t indexes[], uint8_t *dest, int32_t capacity,
				79	UErrorCode &errorCode) {
				80	return write(TRUE, NULL,
				81	data, settings,
				82	rootElements, rootElementsLength,
				83	indexes, dest, capacity, errorCode);
				84	}
				85
				86	int32_t
				87	CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
				88	int32_t indexes[], uint8_t *dest, int32_t capacity,
				89	UErrorCode &errorCode) {
				90	return write(FALSE, t.version,
				91	*t.data, settings,
				92	NULL, 0,
				93	indexes, dest, capacity, errorCode);
				94	}
				95
				96	int32_t
				97	CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
				98	const CollationData &data, const CollationSettings &settings,
				99	const void *rootElements, int32_t rootElementsLength,
				100	int32_t indexes[], uint8_t *dest, int32_t capacity,
				101	UErrorCode &errorCode) {
				102	if(U_FAILURE(errorCode)) { return 0; }
				103	if(capacity < 0 \|\| (capacity > 0 && dest == NULL)) {
				104	errorCode = U_ILLEGAL_ARGUMENT_ERROR;
				105	return 0;
				106	}
				107
				108	// Figure out which data items to write before settling on
				109	// the indexes length and writing offsets.
				110	// For any data item, we need to write the start and limit offsets,
				111	// so the indexes length must be at least index-of-start-offset + 2.
				112	int32_t indexesLength;
				113	UBool hasMappings;
				114	UnicodeSet unsafeBackwardSet;
				115	const CollationData *baseData = data.base;
				116
				117	int32_t fastLatinVersion;
				118	if(data.fastLatinTable != NULL) {
				119	fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;
				120	} else {
				121	fastLatinVersion = 0;
				122	}
				123	int32_t fastLatinTableLength = 0;
				124
				125	if(isBase) {
				126	// For the root collator, we write an even number of indexes
				127	// so that we start with an 8-aligned offset.
				128	indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
				129	U_ASSERT(settings.reorderCodesLength == 0);
				130	hasMappings = TRUE;
				131	unsafeBackwardSet = *data.unsafeBackwardSet;
				132	fastLatinTableLength = data.fastLatinTableLength;
				133	} else if(baseData == NULL) {
				134	hasMappings = FALSE;
				135	if(settings.reorderCodesLength == 0) {
				136	// only options
				137	indexesLength = CollationDataReader::IX_OPTIONS + 1; // no limit offset here
				138	} else {
				139	// only options, reorder codes, and the reorder table
				140	indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
				141	}
				142	} else {
				143	hasMappings = TRUE;
				144	// Tailored mappings, and what else?
				145	// Check in ascending order of optional tailoring data items.
				146	indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
				147	if(data.contextsLength != 0) {
				148	indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
				149	}
				150	unsafeBackwardSet.addAll(data.unsafeBackwardSet).removeAll(baseData->unsafeBackwardSet);
				151	if(!unsafeBackwardSet.isEmpty()) {
				152	indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
				153	}
				154	if(data.fastLatinTable != baseData->fastLatinTable) {
				155	fastLatinTableLength = data.fastLatinTableLength;
				156	indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
				157	}
				158	}
				159
				160	int32_t headerSize;
				161	if(isBase) {
				162	headerSize = 0; // udata_create() writes the header
				163	} else {
				164	DataHeader header;
				165	header.dataHeader.magic1 = 0xda;
				166	header.dataHeader.magic2 = 0x27;
				167	uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
				168	uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
				169	headerSize = (int32_t)sizeof(header);
				170	U_ASSERT((headerSize & 3) == 0); // multiple of 4 bytes
				171	if(hasMappings && data.cesLength != 0) {
				172	// Sum of the sizes of the data items which are
				173	// not automatically multiples of 8 bytes and which are placed before the CEs.
				174	int32_t sum = headerSize + (indexesLength + settings.reorderCodesLength) * 4;
				175	if((sum & 7) != 0) {
				176	// We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
				177	// We add to the header size here.
				178	// Alternatively, we could increment the indexesLength
				179	// or add a few bytes to the reorderTable.
				180	headerSize += 4;
				181	}
				182	}
				183	header.dataHeader.headerSize = (uint16_t)headerSize;
				184	if(headerSize <= capacity) {
				185	uprv_memcpy(dest, &header, sizeof(header));
				186	// Write 00 bytes so that the padding is not mistaken for a copyright string.
				187	uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
				188	dest += headerSize;
				189	capacity -= headerSize;
				190	} else {
				191	dest = NULL;
				192	capacity = 0;
				193	}
				194	}
				195
				196	indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
				197	U_ASSERT((settings.options & ~0xffff) == 0);
				198	indexes[CollationDataReader::IX_OPTIONS] =
				199	data.numericPrimary \| fastLatinVersion \| settings.options;
				200	indexes[CollationDataReader::IX_RESERVED2] = 0;
				201	indexes[CollationDataReader::IX_RESERVED3] = 0;
				202
				203	// Byte offsets of data items all start from the start of the indexes.
				204	// We add the headerSize at the very end.
				205	int32_t totalSize = indexesLength * 4;
				206
				207	if(hasMappings && (isBase \|\| data.jamoCE32s != baseData->jamoCE32s)) {
				208	indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s;
				209	} else {
				210	indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
				211	}
				212
				213	indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
				214	totalSize += settings.reorderCodesLength * 4;
				215
				216	indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
				217	if(settings.reorderTable != NULL) {
				218	totalSize += 256;
				219	}
				220
				221	indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
				222	if(hasMappings) {
				223	UErrorCode errorCode2 = U_ZERO_ERROR;
				224	int32_t length;
				225	if(totalSize < capacity) {
				226	length = utrie2_serialize(data.trie, dest + totalSize,
				227	capacity - totalSize, &errorCode2);
				228	} else {
				229	length = utrie2_serialize(data.trie, NULL, 0, &errorCode2);
				230	}
				231	if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
				232	errorCode = errorCode2;
				233	return 0;
				234	}
				235	// The trie size should be a multiple of 8 bytes due to the way
				236	// compactIndex2(UNewTrie2 *trie) currently works.
				237	U_ASSERT((length & 7) == 0);
				238	totalSize += length;
				239	}
				240
				241	indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
				242	indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
				243	if(hasMappings && data.cesLength != 0) {
				244	U_ASSERT(((headerSize + totalSize) & 7) == 0);
				245	totalSize += data.cesLength * 8;
				246	}
				247
				248	indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
				249	indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
				250	if(hasMappings) {
				251	totalSize += data.ce32sLength * 4;
				252	}
				253
				254	indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
				255	totalSize += rootElementsLength * 4;
				256
				257	indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
				258	if(hasMappings) {
				259	totalSize += data.contextsLength * 2;
				260	}
				261
				262	indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
				263	if(hasMappings && !unsafeBackwardSet.isEmpty()) {
				264	UErrorCode errorCode2 = U_ZERO_ERROR;
				265	int32_t length;
				266	if(totalSize < capacity) {
				267	uint16_t p = reinterpret_cast<uint16_t >(dest + totalSize);
				268	length = unsafeBackwardSet.serialize(
				269	p, (capacity - totalSize) / 2, errorCode2);
				270	} else {
				271	length = unsafeBackwardSet.serialize(NULL, 0, errorCode2);
				272	}
				273	if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
				274	errorCode = errorCode2;
				275	return 0;
				276	}
				277	totalSize += length * 2;
				278	}
				279
				280	indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
				281	totalSize += fastLatinTableLength * 2;
				282
				283	indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
				284	if(isBase) {
				285	totalSize += data.scriptsLength * 2;
				286	}
				287
				288	indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
				289	if(isBase) {
				290	totalSize += 256;
				291	}
				292
				293	indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
				294	indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;
				295
				296	if(totalSize > capacity) {
				297	errorCode = U_BUFFER_OVERFLOW_ERROR;
				298	return headerSize + totalSize;
				299	}
				300
				301	uprv_memcpy(dest, indexes, indexesLength * 4);
				302	copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, settings.reorderCodes, dest);
				303	copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
				304	// The trie has already been serialized into the dest buffer.
				305	copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
				306	copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
				307	copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
				308	copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
				309	// The unsafeBackwardSet has already been serialized into the dest buffer.
				310	copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
				311	copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, data.scripts, dest);
				312	copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
				313
				314	return headerSize + totalSize;
				315	}
				316
				317	void
				318	CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
				319	const void src, uint8_t dest) {
				320	int32_t start = indexes[startIndex];
				321	int32_t limit = indexes[startIndex + 1];
				322	if(start < limit) {
				323	uprv_memcpy(dest + start, src, limit - start);
				324	}
				325	}
				326
				327	U_NAMESPACE_END
				328
				329	#endif // !UCONFIG_NO_COLLATION