Blame - source/i18n/collationdatareader.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 519b5422ef6588dfc237323592245912d6955012 [file] [log] [blame]

Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	1	/*
				2	*******************************************************************************
				3	* Copyright (C) 2013-2014, International Business Machines
				4	* Corporation and others. All Rights Reserved.
				5	*******************************************************************************
				6	* collationdatareader.cpp
				7	*
				8	* created on: 2013feb07
				9	* created by: Markus W. Scherer
				10	*/
				11
				12	#include "unicode/utypes.h"
				13
				14	#if !UCONFIG_NO_COLLATION
				15
				16	#include "unicode/ucol.h"
				17	#include "unicode/udata.h"
				18	#include "unicode/uscript.h"
				19	#include "cmemory.h"
				20	#include "collation.h"
				21	#include "collationdata.h"
				22	#include "collationdatareader.h"
				23	#include "collationfastlatin.h"
				24	#include "collationkeys.h"
				25	#include "collationrootelements.h"
				26	#include "collationsettings.h"
				27	#include "collationtailoring.h"
				28	#include "normalizer2impl.h"
				29	#include "uassert.h"
				30	#include "ucmndata.h"
				31	#include "utrie2.h"
				32
				33	U_NAMESPACE_BEGIN
				34
				35	namespace {
				36
				/**
				 * Bounds-checked read of one slot of an indexes array.
				 *
				 * @param indexes the array to read from
				 * @param length  number of valid slots in indexes
				 * @param i       slot to read
				 * @return indexes[i] if 0 <= i < length, otherwise -1
				 */
				int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
				    // Reject negative slots too: i < length alone would let a negative i
				    // read before the start of the array.
				    return (0 <= i && i < length) ? indexes[i] : -1;
				}
				40
				41	} // namespace
				42
				43	void
				44	CollationDataReader::read(const CollationTailoring base, const uint8_t inBytes, int32_t inLength,
				45	CollationTailoring &tailoring, UErrorCode &errorCode) {
				46	if(U_FAILURE(errorCode)) { return; }
				47	if(base != NULL) {
				48	if(inBytes == NULL \|\| (0 <= inLength && inLength < 24)) {
				49	errorCode = U_ILLEGAL_ARGUMENT_ERROR;
				50	return;
				51	}
				52	const DataHeader header = reinterpret_cast<const DataHeader >(inBytes);
				53	if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0x27 &&
				54	isAcceptable(tailoring.version, NULL, NULL, &header->info))) {
				55	errorCode = U_INVALID_FORMAT_ERROR;
				56	return;
				57	}
				58	if(base->getUCAVersion() != tailoring.getUCAVersion()) {
				59	errorCode = U_COLLATOR_VERSION_MISMATCH;
				60	return;
				61	}
				62	int32_t headerLength = header->dataHeader.headerSize;
				63	inBytes += headerLength;
				64	if(inLength >= 0) {
				65	inLength -= headerLength;
				66	}
				67	}
				68
				69	if(inBytes == NULL \|\| (0 <= inLength && inLength < 8)) {
				70	errorCode = U_ILLEGAL_ARGUMENT_ERROR;
				71	return;
				72	}
				73	const int32_t inIndexes = reinterpret_cast<const int32_t >(inBytes);
				74	int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH];
				75	if(indexesLength < 2 \|\| (0 <= inLength && inLength < indexesLength * 4)) {
				76	errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes.
				77	return;
				78	}
				79
				80	// Assume that the tailoring data is in initial state,
				81	// with NULL pointers and 0 lengths.
				82
				83	// Set pointers to non-empty data parts.
				84	// Do this in order of their byte offsets. (Should help porting to Java.)
				85
				86	int32_t index; // one of the indexes[] slots
				87	int32_t offset; // byte offset for the index part
				88	int32_t length; // number of bytes in the index part
				89
				90	if(indexesLength > IX_TOTAL_SIZE) {
				91	length = inIndexes[IX_TOTAL_SIZE];
				92	} else if(indexesLength > IX_REORDER_CODES_OFFSET) {
				93	length = inIndexes[indexesLength - 1];
				94	} else {
				95	length = 0; // only indexes, and inLength was already checked for them
				96	}
				97	if(0 <= inLength && inLength < length) {
				98	errorCode = U_INVALID_FORMAT_ERROR;
				99	return;
				100	}
				101
				102	const CollationData *baseData = base == NULL ? NULL : base->data;
				103	const int32_t *reorderCodes = NULL;
				104	int32_t reorderCodesLength = 0;
				105	index = IX_REORDER_CODES_OFFSET;
				106	offset = getIndex(inIndexes, indexesLength, index);
				107	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
				108	if(length >= 4) {
				109	if(baseData == NULL) {
				110	// We assume for collation settings that
				111	// the base data does not have a reordering.
				112	errorCode = U_INVALID_FORMAT_ERROR;
				113	return;
				114	}
				115	reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
				116	reorderCodesLength = length / 4;
				117	}
				118
				119	// There should be a reorder table only if there are reorder codes.
				120	// However, when there are reorder codes the reorder table may be omitted to reduce
				121	// the data size.
				122	const uint8_t *reorderTable = NULL;
				123	index = IX_REORDER_TABLE_OFFSET;
				124	offset = getIndex(inIndexes, indexesLength, index);
				125	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
				126	if(length >= 256) {
				127	if(reorderCodesLength == 0) {
				128	errorCode = U_INVALID_FORMAT_ERROR; // Reordering table without reordering codes.
				129	return;
				130	}
				131	reorderTable = inBytes + offset;
				132	} else {
				133	// If we have reorder codes, then build the reorderTable at the end,
				134	// when the CollationData is otherwise complete.
				135	}
				136
				137	if(baseData != NULL && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000)) {
				138	errorCode = U_INVALID_FORMAT_ERROR;
				139	return;
				140	}
				141	CollationData *data = NULL; // Remains NULL if there are no mappings.
				142
				143	index = IX_TRIE_OFFSET;
				144	offset = getIndex(inIndexes, indexesLength, index);
				145	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
				146	if(length >= 8) {
				147	if(!tailoring.ensureOwnedData(errorCode)) { return; }
				148	data = tailoring.ownedData;
				149	data->base = baseData;
				150	data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000;
				151	data->trie = tailoring.trie = utrie2_openFromSerialized(
				152	UTRIE2_32_VALUE_BITS, inBytes + offset, length, NULL,
				153	&errorCode);
				154	if(U_FAILURE(errorCode)) { return; }
				155	} else if(baseData != NULL) {
				156	// Use the base data. Only the settings are tailored.
				157	tailoring.data = baseData;
				158	} else {
				159	errorCode = U_INVALID_FORMAT_ERROR; // No mappings.
				160	return;
				161	}
				162
				163	index = IX_CES_OFFSET;
				164	offset = getIndex(inIndexes, indexesLength, index);
				165	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
				166	if(length >= 8) {
				167	if(data == NULL) {
				168	errorCode = U_INVALID_FORMAT_ERROR; // Tailored ces without tailored trie.
				169	return;
				170	}
				171	data->ces = reinterpret_cast<const int64_t *>(inBytes + offset);
				172	data->cesLength = length / 8;
				173	}
				174
				175	index = IX_CE32S_OFFSET;
				176	offset = getIndex(inIndexes, indexesLength, index);
				177	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
				178	if(length >= 4) {
				179	if(data == NULL) {
				180	errorCode = U_INVALID_FORMAT_ERROR; // Tailored ce32s without tailored trie.
				181	return;
				182	}
				183	data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset);
				184	data->ce32sLength = length / 4;
				185	}
				186
				187	int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_START);
				188	if(jamoCE32sStart >= 0) {
				189	if(data == NULL \|\| data->ce32s == NULL) {
				190	errorCode = U_INVALID_FORMAT_ERROR; // Index into non-existent ce32s[].
				191	return;
				192	}
				193	data->jamoCE32s = data->ce32s + jamoCE32sStart;
				194	} else if(data == NULL) {
				195	// Nothing to do.
				196	} else if(baseData != NULL) {
				197	data->jamoCE32s = baseData->jamoCE32s;
				198	} else {
				199	errorCode = U_INVALID_FORMAT_ERROR; // No Jamo CE32s for Hangul processing.
				200	return;
				201	}
				202
				203	index = IX_ROOT_ELEMENTS_OFFSET;
				204	offset = getIndex(inIndexes, indexesLength, index);
				205	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
				206	if(length >= 4) {
				207	length /= 4;
				208	if(data == NULL \|\| length <= CollationRootElements::IX_SEC_TER_BOUNDARIES) {
				209	errorCode = U_INVALID_FORMAT_ERROR;
				210	return;
				211	}
				212	data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset);
				213	data->rootElementsLength = length;
				214	uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COMMON_SEC_AND_TER_CE];
				215	if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) {
				216	errorCode = U_INVALID_FORMAT_ERROR;
				217	return;
				218	}
				219	uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX_SEC_TER_BOUNDARIES];
				220	if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) {
				221	// [fixed last secondary common byte] is too low,
				222	// and secondary weights would collide with compressed common secondaries.
				223	errorCode = U_INVALID_FORMAT_ERROR;
				224	return;
				225	}
				226	}
				227
				228	index = IX_CONTEXTS_OFFSET;
				229	offset = getIndex(inIndexes, indexesLength, index);
				230	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
				231	if(length >= 2) {
				232	if(data == NULL) {
				233	errorCode = U_INVALID_FORMAT_ERROR; // Tailored contexts without tailored trie.
				234	return;
				235	}
				236	data->contexts = reinterpret_cast<const UChar *>(inBytes + offset);
				237	data->contextsLength = length / 2;
				238	}
				239
				240	index = IX_UNSAFE_BWD_OFFSET;
				241	offset = getIndex(inIndexes, indexesLength, index);
				242	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
				243	if(length >= 2) {
				244	if(data == NULL) {
				245	errorCode = U_INVALID_FORMAT_ERROR;
				246	return;
				247	}
				248	if(baseData == NULL) {
				249	// Create the unsafe-backward set for the root collator.
				250	// Include all non-zero combining marks and trail surrogates.
				251	// We do this at load time, rather than at build time,
				252	// to simplify Unicode version bootstrapping:
				253	// The root data builder only needs the new FractionalUCA.txt data,
				254	// but it need not be built with a version of ICU already updated to
				255	// the corresponding new Unicode Character Database.
				256	//
				257	// The following is an optimized version of
				258	// new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
				259	// It is faster and requires fewer code dependencies.
				260	tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // trail surrogates
				261	if(tailoring.unsafeBackwardSet == NULL) {
				262	errorCode = U_MEMORY_ALLOCATION_ERROR;
				263	return;
				264	}
				265	data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet);
				266	} else {
				267	// Clone the root collator's set contents.
				268	tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>(
				269	baseData->unsafeBackwardSet->cloneAsThawed());
				270	if(tailoring.unsafeBackwardSet == NULL) {
				271	errorCode = U_MEMORY_ALLOCATION_ERROR;
				272	return;
				273	}
				274	}
				275	// Add the ranges from the data file to the unsafe-backward set.
				276	USerializedSet sset;
				277	const uint16_t unsafeData = reinterpret_cast<const uint16_t >(inBytes + offset);
				278	if(!uset_getSerializedSet(&sset, unsafeData, length / 2)) {
				279	errorCode = U_INVALID_FORMAT_ERROR;
				280	return;
				281	}
				282	int32_t count = uset_getSerializedRangeCount(&sset);
				283	for(int32_t i = 0; i < count; ++i) {
				284	UChar32 start, end;
				285	uset_getSerializedRange(&sset, i, &start, &end);
				286	tailoring.unsafeBackwardSet->add(start, end);
				287	}
				288	// Mark each lead surrogate as "unsafe"
				289	// if any of its 1024 associated supplementary code points is "unsafe".
				290	UChar32 c = 0x10000;
				291	for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
				292	if(!tailoring.unsafeBackwardSet->containsNone(c, c + 0x3ff)) {
				293	tailoring.unsafeBackwardSet->add(lead);
				294	}
				295	}
				296	tailoring.unsafeBackwardSet->freeze();
				297	data->unsafeBackwardSet = tailoring.unsafeBackwardSet;
				298	} else if(data == NULL) {
				299	// Nothing to do.
				300	} else if(baseData != NULL) {
				301	// No tailoring-specific data: Alias the root collator's set.
				302	data->unsafeBackwardSet = baseData->unsafeBackwardSet;
				303	} else {
				304	errorCode = U_INVALID_FORMAT_ERROR; // No unsafeBackwardSet.
				305	return;
				306	}
				307
				308	// If the fast Latin format version is different,
				309	// or the version is set to 0 for "no fast Latin table",
				310	// then just always use the normal string comparison path.
				311	if(data != NULL) {
				312	data->fastLatinTable = NULL;
				313	data->fastLatinTableLength = 0;
				314	if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin::VERSION) {
				315	index = IX_FAST_LATIN_TABLE_OFFSET;
				316	offset = getIndex(inIndexes, indexesLength, index);
				317	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
				318	if(length >= 2) {
				319	data->fastLatinTable = reinterpret_cast<const uint16_t *>(inBytes + offset);
				320	data->fastLatinTableLength = length / 2;
				321	if((*data->fastLatinTable >> 8) != CollationFastLatin::VERSION) {
				322	errorCode = U_INVALID_FORMAT_ERROR; // header vs. table version mismatch
				323	return;
				324	}
				325	} else if(baseData != NULL) {
				326	data->fastLatinTable = baseData->fastLatinTable;
				327	data->fastLatinTableLength = baseData->fastLatinTableLength;
				328	}
				329	}
				330	}
				331
				332	index = IX_SCRIPTS_OFFSET;
				333	offset = getIndex(inIndexes, indexesLength, index);
				334	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
				335	if(length >= 2) {
				336	if(data == NULL) {
				337	errorCode = U_INVALID_FORMAT_ERROR;
				338	return;
				339	}
				340	data->scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
				341	data->scriptsLength = length / 2;
				342	} else if(data == NULL) {
				343	// Nothing to do.
				344	} else if(baseData != NULL) {
				345	data->scripts = baseData->scripts;
				346	data->scriptsLength = baseData->scriptsLength;
				347	}
				348
				349	index = IX_COMPRESSIBLE_BYTES_OFFSET;
				350	offset = getIndex(inIndexes, indexesLength, index);
				351	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
				352	if(length >= 256) {
				353	if(data == NULL) {
				354	errorCode = U_INVALID_FORMAT_ERROR;
				355	return;
				356	}
				357	data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offset);
				358	} else if(data == NULL) {
				359	// Nothing to do.
				360	} else if(baseData != NULL) {
				361	data->compressibleBytes = baseData->compressibleBytes;
				362	} else {
				363	errorCode = U_INVALID_FORMAT_ERROR; // No compressibleBytes[].
				364	return;
				365	}
				366
				367	const CollationSettings &ts = *tailoring.settings;
				368	int32_t options = inIndexes[IX_OPTIONS] & 0xffff;
				369	uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];
				370	int32_t fastLatinOptions = CollationFastLatin::getOptions(
				371	tailoring.data, ts, fastLatinPrimaries, UPRV_LENGTHOF(fastLatinPrimaries));
				372	if(options == ts.options && ts.variableTop != 0 &&
				373	reorderCodesLength == ts.reorderCodesLength &&
				374	uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) == 0 &&
				375	fastLatinOptions == ts.fastLatinOptions &&
				376	(fastLatinOptions < 0 \|\|
				377	uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,
				378	sizeof(fastLatinPrimaries)) == 0)) {
				379	return;
				380	}
				381
				382	CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings);
				383	if(settings == NULL) {
				384	errorCode = U_MEMORY_ALLOCATION_ERROR;
				385	return;
				386	}
				387	settings->options = options;
				388	// Set variableTop from options and scripts data.
				389	settings->variableTop = tailoring.data->getLastPrimaryForGroup(
				390	UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
				391	if(settings->variableTop == 0) {
				392	errorCode = U_INVALID_FORMAT_ERROR;
				393	return;
				394	}
				395
				396	if(reorderCodesLength == 0 \|\| reorderTable != NULL) {
				397	settings->aliasReordering(reorderCodes, reorderCodesLength, reorderTable);
				398	} else {
				399	uint8_t table[256];
				400	baseData->makeReorderTable(reorderCodes, reorderCodesLength, table, errorCode);
				401	if(U_FAILURE(errorCode)) { return; }
				402	if(!settings->setReordering(reorderCodes, reorderCodesLength,table)) {
				403	errorCode = U_MEMORY_ALLOCATION_ERROR;
				404	return;
				405	}
				406	}
				407
				408	settings->fastLatinOptions = CollationFastLatin::getOptions(
				409	tailoring.data, *settings,
				410	settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries));
				411	}
				412
				413	UBool U_CALLCONV
				414	CollationDataReader::isAcceptable(void *context,
				415	const char * /* type /, const char /name/,
				416	const UDataInfo *pInfo) {
				417	if(
				418	pInfo->size >= 20 &&
				419	pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
				420	pInfo->charsetFamily == U_CHARSET_FAMILY &&
				421	pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol"
				422	pInfo->dataFormat[1] == 0x43 &&
				423	pInfo->dataFormat[2] == 0x6f &&
				424	pInfo->dataFormat[3] == 0x6c &&
				425	pInfo->formatVersion[0] == 4
				426	) {
				427	UVersionInfo version = static_cast<UVersionInfo >(context);
				428	if(version != NULL) {
				429	uprv_memcpy(version, pInfo->dataVersion, 4);
				430	}
				431	return TRUE;
				432	} else {
				433	return FALSE;
				434	}
				435	}
				436
				437	U_NAMESPACE_END
				438
				439	#endif // !UCONFIG_NO_COLLATION