Blame - source/i18n/uspoof_impl.cpp - chromium.googlesource.com/chromium/deps/icu

blob: e9077d3ac3414027735b4cbc8880fb92a6e9a996 [file] [log] [blame]

jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1	/*
				2	**********************************************************************
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	3	* Copyright (C) 2008-2015, International Business Machines
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	4	* Corporation and others. All Rights Reserved.
				5	**********************************************************************
				6	*/
				7
				8	#include "unicode/utypes.h"
				9	#include "unicode/uspoof.h"
				10	#include "unicode/uchar.h"
				11	#include "unicode/uniset.h"
				12	#include "unicode/utf16.h"
				13	#include "utrie2.h"
				14	#include "cmemory.h"
				15	#include "cstring.h"
				16	#include "identifier_info.h"
				17	#include "scriptset.h"
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	18	#include "umutex.h"
				19	#include "udataswp.h"
				20	#include "uassert.h"
				21	#include "uspoof_impl.h"
				22
				23	#if !UCONFIG_NO_NORMALIZATION
				24
				25
				26	U_NAMESPACE_BEGIN
				27
				28	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
				29
				30	SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	31	fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(data), fAllowedCharsSet(NULL) ,
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	32	fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
				33	if (U_FAILURE(status)) {
				34	return;
				35	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	36	fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
				37
				38	UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
				39	allowedCharsSet->freeze();
				40	fAllowedCharsSet = allowedCharsSet;
				41	fAllowedLocales = uprv_strdup("");
				42	if (fAllowedCharsSet == NULL \|\| fAllowedLocales == NULL) {
				43	status = U_MEMORY_ALLOCATION_ERROR;
				44	return;
				45	}
				46	fMagic = USPOOF_MAGIC;
				47	}
				48
				49
				50	SpoofImpl::SpoofImpl() :
				51	fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
				52	fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
				53	UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
				54	allowedCharsSet->freeze();
				55	fAllowedCharsSet = allowedCharsSet;
				56	fAllowedLocales = uprv_strdup("");
				57	fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
				58	}
				59
				60
				61	// Copy Constructor, used by the user level clone() function.
				62	SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
				63	fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
				64	fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
				65	if (U_FAILURE(status)) {
				66	return;
				67	}
				68	fMagic = src.fMagic;
				69	fChecks = src.fChecks;
				70	if (src.fSpoofData != NULL) {
				71	fSpoofData = src.fSpoofData->addReference();
				72	}
				73	fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
				74	if (fAllowedCharsSet == NULL) {
				75	status = U_MEMORY_ALLOCATION_ERROR;
				76	}
				77	fAllowedLocales = uprv_strdup(src.fAllowedLocales);
				78	fRestrictionLevel = src.fRestrictionLevel;
				79	}
				80
				81	SpoofImpl::~SpoofImpl() {
				82	fMagic = 0; // head off application errors by preventing use of
				83	// of deleted objects.
				84	if (fSpoofData != NULL) {
				85	fSpoofData->removeReference(); // Will delete if refCount goes to zero.
				86	}
				87	delete fAllowedCharsSet;
				88	uprv_free((void *)fAllowedLocales);
				89	delete fCachedIdentifierInfo;
				90	}
				91
				92	//
				93	// Incoming parameter check on Status and the SpoofChecker object
				94	// received from the C API.
				95	//
				96	const SpoofImpl SpoofImpl::validateThis(const USpoofChecker sc, UErrorCode &status) {
				97	if (U_FAILURE(status)) {
				98	return NULL;
				99	}
				100	if (sc == NULL) {
				101	status = U_ILLEGAL_ARGUMENT_ERROR;
				102	return NULL;
				103	}
				104	SpoofImpl This = (SpoofImpl )sc;
				105	if (This->fMagic != USPOOF_MAGIC \|\|
				106	This->fSpoofData == NULL) {
				107	status = U_INVALID_FORMAT_ERROR;
				108	return NULL;
				109	}
				110	if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {
				111	return NULL;
				112	}
				113	return This;
				114	}
				115
				116	SpoofImpl SpoofImpl::validateThis(USpoofChecker sc, UErrorCode &status) {
				117	return const_cast<SpoofImpl *>
				118	(SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
				119	}
				120
				121
				122
				123	//--------------------------------------------------------------------------------------
				124	//
				125	// confusableLookup() This is the heart of the confusable skeleton generation
				126	// implementation.
				127	//
				128	// Given a source character, produce the corresponding
				129	// replacement character(s), appending them to the dest string.
				130	//
				131	//---------------------------------------------------------------------------------------
				132	int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
				133
				134	// Binary search the spoof data key table for the inChar
				135	int32_t *low = fSpoofData->fCFUKeys;
				136	int32_t *mid = NULL;
				137	int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize;
				138	UChar32 midc;
				139	do {
				140	int32_t delta = ((int32_t)(limit-low))/2;
				141	mid = low + delta;
				142	midc = *mid & 0x1fffff;
				143	if (inChar == midc) {
				144	goto foundChar;
				145	} else if (inChar < midc) {
				146	limit = mid;
				147	} else {
				148	low = mid;
				149	}
				150	} while (low < limit-1);
				151	mid = low;
				152	midc = *mid & 0x1fffff;
				153	if (inChar != midc) {
				154	// Char not found. It maps to itself.
				155	int i = 0;
				156	dest.append(inChar);
				157	return i;
				158	}
				159	foundChar:
				160	int32_t keyFlags = *mid & 0xff000000;
				161	if ((keyFlags & tableMask) == 0) {
				162	// We found the right key char, but the entry doesn't pertain to the
				163	// table we need. See if there is an adjacent key that does
				164	if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
				165	int32_t *altMid;
				166	for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {
				167	keyFlags = *altMid & 0xff000000;
				168	if (keyFlags & tableMask) {
				169	mid = altMid;
				170	goto foundKey;
				171	}
				172	}
				173	for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {
				174	keyFlags = *altMid & 0xff000000;
				175	if (keyFlags & tableMask) {
				176	mid = altMid;
				177	goto foundKey;
				178	}
				179	}
				180	}
				181	// No key entry for this char & table.
				182	// The input char maps to itself.
				183	int i = 0;
				184	dest.append(inChar);
				185	return i;
				186	}
				187
				188	foundKey:
				189	int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
				190	int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);
				191
				192	// Value is either a UChar (for strings of length 1) or
				193	// an index into the string table (for longer strings)
				194	uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
				195	if (stringLen == 1) {
				196	dest.append((UChar)value);
				197	return 1;
				198	}
				199
				200	// String length of 4 from the above lookup is used for all strings of length >= 4.
				201	// For these, get the real length from the string lengths table,
				202	// which maps string table indexes to lengths.
				203	// All strings of the same length are stored contiguously in the string table.
				204	// 'value' from the lookup above is the starting index for the desired string.
				205
				206	int32_t ix;
				207	if (stringLen == 4) {
				208	int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
				209	for (ix = 0; ix < stringLengthsLimit; ix++) {
				210	if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
				211	stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
				212	break;
				213	}
				214	}
				215	U_ASSERT(ix < stringLengthsLimit);
				216	}
				217
				218	U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
				219	UChar *src = &fSpoofData->fCFUStrings[value];
				220	dest.append(src, stringLen);
				221	return stringLen;
				222	}
				223
				224
				225	//---------------------------------------------------------------------------------------
				226	//
				227	// wholeScriptCheck()
				228	//
				229	// Input text is already normalized to NFD
				230	// Return the set of scripts, each of which can represent something that is
				231	// confusable with the input text. The script of the input text
				232	// is included; input consisting of characters from a single script will
				233	// always produce a result consisting of a set containing that script.
				234	//
				235	//---------------------------------------------------------------------------------------
				236	void SpoofImpl::wholeScriptCheck(
				237	const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
				238
				239	UTrie2 *table =
				240	(fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
				241	result->setAll();
				242	int32_t length = text.length();
				243	for (int32_t inputIdx=0; inputIdx < length;) {
				244	UChar32 c = text.char32At(inputIdx);
				245	inputIdx += U16_LENGTH(c);
				246	uint32_t index = utrie2_get32(table, c);
				247	if (index == 0) {
				248	// No confusables in another script for this char.
				249	// TODO: we should change the data to have sets with just the single script
				250	// bit for the script of this char. Gets rid of this special case.
				251	// Until then, grab the script from the char and intersect it with the set.
				252	UScriptCode cpScript = uscript_getScript(c, &status);
				253	U_ASSERT(cpScript > USCRIPT_INHERITED);
				254	result->intersect(cpScript, status);
				255	} else if (index == 1) {
				256	// Script == Common or Inherited. Nothing to do.
				257	} else {
				258	result->intersect(fSpoofData->fScriptSets[index]);
				259	}
				260	}
				261	}
				262
				263
				264	void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
				265	UnicodeSet allowedChars;
				266	UnicodeSet *tmpSet = NULL;
				267	const char *locStart = localesList;
				268	const char *locEnd = NULL;
				269	const char *localesListEnd = localesList + uprv_strlen(localesList);
				270	int32_t localeListCount = 0; // Number of locales provided by caller.
				271
				272	// Loop runs once per locale from the localesList, a comma separated list of locales.
				273	do {
				274	locEnd = uprv_strchr(locStart, ',');
				275	if (locEnd == NULL) {
				276	locEnd = localesListEnd;
				277	}
				278	while (*locStart == ' ') {
				279	locStart++;
				280	}
				281	const char *trimmedEnd = locEnd-1;
				282	while (trimmedEnd > locStart && *trimmedEnd == ' ') {
				283	trimmedEnd--;
				284	}
				285	if (trimmedEnd <= locStart) {
				286	break;
				287	}
				288	const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
				289	localeListCount++;
				290
				291	// We have one locale from the locales list.
				292	// Add the script chars for this locale to the accumulating set of allowed chars.
				293	// If the locale is no good, we will be notified back via status.
				294	addScriptChars(locale, &allowedChars, status);
				295	uprv_free((void *)locale);
				296	if (U_FAILURE(status)) {
				297	break;
				298	}
				299	locStart = locEnd + 1;
				300	} while (locStart < localesListEnd);
				301
				302	// If our caller provided an empty list of locales, we disable the allowed characters checking
				303	if (localeListCount == 0) {
				304	uprv_free((void *)fAllowedLocales);
				305	fAllowedLocales = uprv_strdup("");
				306	tmpSet = new UnicodeSet(0, 0x10ffff);
				307	if (fAllowedLocales == NULL \|\| tmpSet == NULL) {
				308	status = U_MEMORY_ALLOCATION_ERROR;
				309	return;
				310	}
				311	tmpSet->freeze();
				312	delete fAllowedCharsSet;
				313	fAllowedCharsSet = tmpSet;
				314	fChecks &= ~USPOOF_CHAR_LIMIT;
				315	return;
				316	}
				317
				318
				319	// Add all common and inherited characters to the set of allowed chars.
				320	UnicodeSet tempSet;
				321	tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
				322	allowedChars.addAll(tempSet);
				323	tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
				324	allowedChars.addAll(tempSet);
				325
				326	// If anything went wrong, we bail out without changing
				327	// the state of the spoof checker.
				328	if (U_FAILURE(status)) {
				329	return;
				330	}
				331
				332	// Store the updated spoof checker state.
				333	tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
				334	const char *tmpLocalesList = uprv_strdup(localesList);
				335	if (tmpSet == NULL \|\| tmpLocalesList == NULL) {
				336	status = U_MEMORY_ALLOCATION_ERROR;
				337	return;
				338	}
				339	uprv_free((void *)fAllowedLocales);
				340	fAllowedLocales = tmpLocalesList;
				341	tmpSet->freeze();
				342	delete fAllowedCharsSet;
				343	fAllowedCharsSet = tmpSet;
				344	fChecks \|= USPOOF_CHAR_LIMIT;
				345	}
				346
				347
				348	const char * SpoofImpl::getAllowedLocales(UErrorCode &/status/) {
				349	return fAllowedLocales;
				350	}
				351
				352
				353	// Given a locale (a language), add all the characters from all of the scripts used with that language
				354	// to the allowedChars UnicodeSet
				355
				356	void SpoofImpl::addScriptChars(const char locale, UnicodeSet allowedChars, UErrorCode &status) {
				357	UScriptCode scripts[30];
				358
				359	int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status);
				360	if (U_FAILURE(status)) {
				361	return;
				362	}
				363	if (status == U_USING_DEFAULT_WARNING) {
				364	status = U_ILLEGAL_ARGUMENT_ERROR;
				365	return;
				366	}
				367	UnicodeSet tmpSet;
				368	int32_t i;
				369	for (i=0; i<numScripts; i++) {
				370	tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
				371	allowedChars->addAll(tmpSet);
				372	}
				373	}
				374
				375
				376	// Convert a text format hex number. Utility function used by builder code. Static.
				377	// Input: UChar *string text. Output: a UChar32
				378	// Input has been pre-checked, and will have no non-hex chars.
				379	// The number must fall in the code point range of 0..0x10ffff
				380	// Static Function.
				381	UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
				382	if (U_FAILURE(status)) {
				383	return 0;
				384	}
				385	U_ASSERT(limit-start > 0);
				386	uint32_t val = 0;
				387	int i;
				388	for (i=start; i<limit; i++) {
				389	int digitVal = s[i] - 0x30;
				390	if (digitVal>9) {
				391	digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A'
				392	}
				393	if (digitVal>15) {
				394	digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a'
				395	}
				396	U_ASSERT(digitVal <= 0xf);
				397	val <<= 4;
				398	val += digitVal;
				399	}
				400	if (val > 0x10ffff) {
				401	status = U_PARSE_ERROR;
				402	val = 0;
				403	}
				404	return (UChar32)val;
				405	}
				406
				407	// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
				408	// Maintain a one-element cache, which is sufficient to avoid repeatedly
				409	// creating new ones unless we get multi-thread concurrency in spoof
				410	// check operations, which should be statistically uncommon.
				411
				412	// These functions are used in place of new & delete of an IdentifierInfo.
				413	// They will recycle the IdentifierInfo when possible.
				414	// They are logically const, and used within const functions that must be thread safe.
				415	IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
				416	IdentifierInfo *returnIdInfo = NULL;
				417	if (U_FAILURE(status)) {
				418	return returnIdInfo;
				419	}
				420	SpoofImpl nonConstThis = const_cast<SpoofImpl >(this);
				421	{
				422	Mutex m;
				423	returnIdInfo = nonConstThis->fCachedIdentifierInfo;
				424	nonConstThis->fCachedIdentifierInfo = NULL;
				425	}
				426	if (returnIdInfo == NULL) {
				427	returnIdInfo = new IdentifierInfo(status);
				428	if (U_SUCCESS(status) && returnIdInfo == NULL) {
				429	status = U_MEMORY_ALLOCATION_ERROR;
				430	}
				431	if (U_FAILURE(status) && returnIdInfo != NULL) {
				432	delete returnIdInfo;
				433	returnIdInfo = NULL;
				434	}
				435	}
				436	return returnIdInfo;
				437	}
				438
				439
				440	void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
				441	if (idInfo != NULL) {
				442	SpoofImpl nonConstThis = const_cast<SpoofImpl >(this);
				443	{
				444	Mutex m;
				445	if (nonConstThis->fCachedIdentifierInfo == NULL) {
				446	nonConstThis->fCachedIdentifierInfo = idInfo;
				447	idInfo = NULL;
				448	}
				449	}
				450	delete idInfo;
				451	}
				452	}
				453
				454
				455
				456
				457	//----------------------------------------------------------------------------------------------
				458	//
				459	// class SpoofData Implementation
				460	//
				461	//----------------------------------------------------------------------------------------------
				462
				463
				464	UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
				465	if (U_FAILURE(status) \|\|
				466	rawData == NULL \|\|
				467	rawData->fMagic != USPOOF_MAGIC \|\|
				468	rawData->fFormatVersion[0] > 1 \|\|
				469	rawData->fFormatVersion[1] > 0) {
				470	status = U_INVALID_FORMAT_ERROR;
				471	return FALSE;
				472	}
				473	return TRUE;
				474	}
				475
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	476	static UBool U_CALLCONV
				477	spoofDataIsAcceptable(void *context,
				478	const char * /* type /, const char /name/,
				479	const UDataInfo *pInfo) {
				480	if(
				481	pInfo->size >= 20 &&
				482	pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
				483	pInfo->charsetFamily == U_CHARSET_FAMILY &&
				484	pInfo->dataFormat[0] == 0x43 && // dataFormat="Cfu "
				485	pInfo->dataFormat[1] == 0x66 &&
				486	pInfo->dataFormat[2] == 0x75 &&
				487	pInfo->dataFormat[3] == 0x20 &&
				488	pInfo->formatVersion[0] == 1
				489	) {
				490	UVersionInfo version = static_cast<UVersionInfo >(context);
				491	if(version != NULL) {
				492	uprv_memcpy(version, pInfo->dataVersion, 4);
				493	}
				494	return TRUE;
				495	} else {
				496	return FALSE;
				497	}
				498	}
				499
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	500	//
				501	// SpoofData::getDefault() - return a wrapper around the spoof data that is
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	502	// baked into the default ICU data.
				503	//
				504	// Called once, from the initOnce() function in uspoof_impl.cpp; the resulting
				505	// SpoofData is shared by all spoof checkers using the default data.
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	506	//
				507	SpoofData *SpoofData::getDefault(UErrorCode &status) {
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	508	UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
				509	spoofDataIsAcceptable,
				510	NULL, // context, would receive dataVersion if supplied.
				511	&status);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	512	if (U_FAILURE(status)) {
				513	return NULL;
				514	}
				515	SpoofData *This = new SpoofData(udm, status);
				516	if (U_FAILURE(status)) {
				517	delete This;
				518	return NULL;
				519	}
				520	if (This == NULL) {
				521	status = U_MEMORY_ALLOCATION_ERROR;
				522	}
				523	return This;
				524	}
				525
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	526	SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
				527	{
				528	reset();
				529	if (U_FAILURE(status)) {
				530	return;
				531	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	532	fUDM = udm;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	533	// fRawData is non-const because it may be constructed by the data builder.
				534	fRawData = reinterpret_cast<SpoofDataHeader *>(
				535	const_cast<void *>(udata_getMemory(udm)));
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	536	validateDataVersion(fRawData, status);
				537	initPtrs(status);
				538	}
				539
				540
				541	SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
				542	{
				543	reset();
				544	if (U_FAILURE(status)) {
				545	return;
				546	}
				547	if ((size_t)length < sizeof(SpoofDataHeader)) {
				548	status = U_INVALID_FORMAT_ERROR;
				549	return;
				550	}
				551	void ncData = const_cast<void >(data);
				552	fRawData = static_cast<SpoofDataHeader *>(ncData);
				553	if (length < fRawData->fLength) {
				554	status = U_INVALID_FORMAT_ERROR;
				555	return;
				556	}
				557	validateDataVersion(fRawData, status);
				558	initPtrs(status);
				559	}
				560
				561
				562	// Spoof Data constructor for use from data builder.
				563	// Initializes a new, empty data area that will be populated later.
				564	SpoofData::SpoofData(UErrorCode &status) {
				565	reset();
				566	if (U_FAILURE(status)) {
				567	return;
				568	}
				569	fDataOwned = true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	570
				571	// The spoof header should already be sized to be a multiple of 16 bytes.
				572	// Just in case it's not, round it up.
				573	uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
				574	U_ASSERT(initialSize == sizeof(SpoofDataHeader));
				575
				576	fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
				577	fMemLimit = initialSize;
				578	if (fRawData == NULL) {
				579	status = U_MEMORY_ALLOCATION_ERROR;
				580	return;
				581	}
				582	uprv_memset(fRawData, 0, initialSize);
				583
				584	fRawData->fMagic = USPOOF_MAGIC;
				585	fRawData->fFormatVersion[0] = 1;
				586	fRawData->fFormatVersion[1] = 0;
				587	fRawData->fFormatVersion[2] = 0;
				588	fRawData->fFormatVersion[3] = 0;
				589	initPtrs(status);
				590	}
				591
				592	// reset() - initialize all fields.
				593	// Should be updated if any new fields are added.
				594	// Called by constructors to put things in a known initial state.
				595	void SpoofData::reset() {
				596	fRawData = NULL;
				597	fDataOwned = FALSE;
				598	fUDM = NULL;
				599	fMemLimit = 0;
				600	fRefCount = 1;
				601	fCFUKeys = NULL;
				602	fCFUValues = NULL;
				603	fCFUStringLengths = NULL;
				604	fCFUStrings = NULL;
				605	fAnyCaseTrie = NULL;
				606	fLowerCaseTrie = NULL;
				607	fScriptSets = NULL;
				608	}
				609
				610
				611	// SpoofData::initPtrs()
				612	// Initialize the pointers to the various sections of the raw data.
				613	//
				614	// This function is used both during the Trie building process (multiple
				615	// times, as the individual data sections are added), and
				616	// during the opening of a Spoof Checker from prebuilt data.
				617	//
				618	// The pointers for non-existent data sections (identified by an offset of 0)
				619	// are set to NULL.
				620	//
				621	// Note: During building the data, adding each new data section
				622	// reallocs the raw data area, which likely relocates it, which
				623	// in turn requires reinitializing all of the pointers into it, hence
				624	// multiple calls to this function during building.
				625	//
				626	void SpoofData::initPtrs(UErrorCode &status) {
				627	fCFUKeys = NULL;
				628	fCFUValues = NULL;
				629	fCFUStringLengths = NULL;
				630	fCFUStrings = NULL;
				631	if (U_FAILURE(status)) {
				632	return;
				633	}
				634	if (fRawData->fCFUKeys != 0) {
				635	fCFUKeys = (int32_t )((char )fRawData + fRawData->fCFUKeys);
				636	}
				637	if (fRawData->fCFUStringIndex != 0) {
				638	fCFUValues = (uint16_t )((char )fRawData + fRawData->fCFUStringIndex);
				639	}
				640	if (fRawData->fCFUStringLengths != 0) {
				641	fCFUStringLengths = (SpoofStringLengthsElement )((char )fRawData + fRawData->fCFUStringLengths);
				642	}
				643	if (fRawData->fCFUStringTable != 0) {
				644	fCFUStrings = (UChar )((char )fRawData + fRawData->fCFUStringTable);
				645	}
				646
				647	if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) {
				648	fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
				649	(char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
				650	}
				651	if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) {
				652	fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
				653	(char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
				654	}
				655
				656	if (fRawData->fScriptSets != 0) {
				657	fScriptSets = (ScriptSet )((char )fRawData + fRawData->fScriptSets);
				658	}
				659	}
				660
				661
				662	SpoofData::~SpoofData() {
				663	utrie2_close(fAnyCaseTrie);
				664	fAnyCaseTrie = NULL;
				665	utrie2_close(fLowerCaseTrie);
				666	fLowerCaseTrie = NULL;
				667	if (fDataOwned) {
				668	uprv_free(fRawData);
				669	}
				670	fRawData = NULL;
				671	if (fUDM != NULL) {
				672	udata_close(fUDM);
				673	}
				674	fUDM = NULL;
				675	}
				676
				677
				678	void SpoofData::removeReference() {
				679	if (umtx_atomic_dec(&fRefCount) == 0) {
				680	delete this;
				681	}
				682	}
				683
				684
				685	SpoofData *SpoofData::addReference() {
				686	umtx_atomic_inc(&fRefCount);
				687	return this;
				688	}
				689
				690
				691	void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
				692	if (U_FAILURE(status)) {
				693	return NULL;
				694	}
				695	if (!fDataOwned) {
				696	U_ASSERT(FALSE);
				697	status = U_INTERNAL_PROGRAM_ERROR;
				698	return NULL;
				699	}
				700
				701	numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16
				702	uint32_t returnOffset = fMemLimit;
				703	fMemLimit += numBytes;
				704	fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
				705	fRawData->fLength = fMemLimit;
				706	uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
				707	initPtrs(status);
				708	return (char *)fRawData + returnOffset;
				709	}
				710
				711
				712	U_NAMESPACE_END
				713
				714	U_NAMESPACE_USE
				715
				716	//-----------------------------------------------------------------------------
				717	//
				718	// uspoof_swap - byte swap and char encoding swap of spoof data
				719	//
				720	//-----------------------------------------------------------------------------
				721	U_CAPI int32_t U_EXPORT2
				722	uspoof_swap(const UDataSwapper ds, const void inData, int32_t length, void *outData,
				723	UErrorCode *status) {
				724
				725	if (status == NULL \|\| U_FAILURE(*status)) {
				726	return 0;
				727	}
				728	if(ds==NULL \|\| inData==NULL \|\| length<-1 \|\| (length>0 && outData==NULL)) {
				729	*status=U_ILLEGAL_ARGUMENT_ERROR;
				730	return 0;
				731	}
				732
				733	//
				734	// Check that the data header is for spoof data.
				735	// (Header contents are defined in gencfu.cpp)
				736	//
				737	const UDataInfo pInfo = (const UDataInfo )((const char *)inData+4);
				738	if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */
				739	pInfo->dataFormat[1]==0x66 &&
				740	pInfo->dataFormat[2]==0x75 &&
				741	pInfo->dataFormat[3]==0x20 &&
				742	pInfo->formatVersion[0]==1 )) {
				743	udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
				744	"(format version %02x %02x %02x %02x) is not recognized\n",
				745	pInfo->dataFormat[0], pInfo->dataFormat[1],
				746	pInfo->dataFormat[2], pInfo->dataFormat[3],
				747	pInfo->formatVersion[0], pInfo->formatVersion[1],
				748	pInfo->formatVersion[2], pInfo->formatVersion[3]);
				749	*status=U_UNSUPPORTED_ERROR;
				750	return 0;
				751	}
				752
				753	//
				754	// Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific
				755	// header). This swap also conveniently gets us
				756	// the size of the ICU d.h., which lets us locate the start
				757	// of the uspoof specific data.
				758	//
				759	int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
				760
				761
				762	//
				763	// Get the Spoof Data Header, and check that it appears to be OK.
				764	//
				765	//
				766	const uint8_t inBytes =(const uint8_t )inData+headerSize;
				767	SpoofDataHeader spoofDH = (SpoofDataHeader )inBytes;
				768	if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC \|\|
				769	ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader))
				770	{
				771	udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
				772	*status=U_UNSUPPORTED_ERROR;
				773	return 0;
				774	}
				775
				776	//
				777	// Prefight operation? Just return the size
				778	//
				779	int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
				780	int32_t totalSize = headerSize + spoofDataLength;
				781	if (length < 0) {
				782	return totalSize;
				783	}
				784
				785	//
				786	// Check that length passed in is consistent with length from Spoof data header.
				787	//
				788	if (length < totalSize) {
				789	udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
				790	spoofDataLength);
				791	*status=U_INDEX_OUTOFBOUNDS_ERROR;
				792	return 0;
				793	}
				794
				795
				796	//
				797	// Swap the Data. Do the data itself first, then the Spoof Data Header, because
				798	// we need to reference the header to locate the data, and an
				799	// inplace swap of the header leaves it unusable.
				800	//
				801	uint8_t outBytes = (uint8_t )outData + headerSize;
				802	SpoofDataHeader outputDH = (SpoofDataHeader )outBytes;
				803
				804	int32_t sectionStart;
				805	int32_t sectionLength;
				806
				807	//
				808	// If not swapping in place, zero out the output buffer before starting.
				809	// Gaps may exist between the individual sections, and these must be zeroed in
				810	// the output buffer. The simplest way to do that is to just zero the whole thing.
				811	//
				812	if (inBytes != outBytes) {
				813	uprv_memset(outBytes, 0, spoofDataLength);
				814	}
				815
				816	// Confusables Keys Section (fCFUKeys)
				817	sectionStart = ds->readUInt32(spoofDH->fCFUKeys);
				818	sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
				819	ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
				820
				821	// String Index Section
				822	sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex);
				823	sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
				824	ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
				825
				826	// String Table Section
				827	sectionStart = ds->readUInt32(spoofDH->fCFUStringTable);
				828	sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
				829	ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
				830
				831	// String Lengths Section
				832	sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths);
				833	sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
				834	ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
				835
				836	// Any Case Trie
				837	sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie);
				838	sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
				839	utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
				840
				841	// Lower Case Trie
				842	sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie);
				843	sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
				844	utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
				845
				846	// Script Sets. The data is an array of int32_t
				847	sectionStart = ds->readUInt32(spoofDH->fScriptSets);
				848	sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
				849	ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
				850
				851	// And, last, swap the header itself.
				852	// int32_t fMagic // swap this
				853	// uint8_t fFormatVersion[4] // Do not swap this, just copy
				854	// int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.
				855	//
				856	uint32_t magic = ds->readUInt32(spoofDH->fMagic);
				857	ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
				858
				859	if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
				860	uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
				861	}
				862	// swap starting at fLength
				863	ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
				864
				865	return totalSize;
				866	}
				867
				868	#endif
				869
				870