Blame - source/i18n/uspoof_impl.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 9c662f8048336e70581300ba3f418c4a662ae945 [file] [log] [blame]

jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1	/*
				2	**********************************************************************
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	3	* Copyright (C) 2008-2014, International Business Machines
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	4	* Corporation and others. All Rights Reserved.
				5	**********************************************************************
				6	*/
				7
				8	#include "unicode/utypes.h"
				9	#include "unicode/uspoof.h"
				10	#include "unicode/uchar.h"
				11	#include "unicode/uniset.h"
				12	#include "unicode/utf16.h"
				13	#include "utrie2.h"
				14	#include "cmemory.h"
				15	#include "cstring.h"
				16	#include "identifier_info.h"
				17	#include "scriptset.h"
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	18	#include "umutex.h"
				19	#include "udataswp.h"
				20	#include "uassert.h"
				21	#include "uspoof_impl.h"
				22
				23	#if !UCONFIG_NO_NORMALIZATION
				24
				25
				26	U_NAMESPACE_BEGIN
				27
				28	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
				29
				30	SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
				31	fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
				32	fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
				33	if (U_FAILURE(status)) {
				34	return;
				35	}
				36	fSpoofData = data;
				37	fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
				38
				39	UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
				40	allowedCharsSet->freeze();
				41	fAllowedCharsSet = allowedCharsSet;
				42	fAllowedLocales = uprv_strdup("");
				43	if (fAllowedCharsSet == NULL \|\| fAllowedLocales == NULL) {
				44	status = U_MEMORY_ALLOCATION_ERROR;
				45	return;
				46	}
				47	fMagic = USPOOF_MAGIC;
				48	}
				49
				50
				51	SpoofImpl::SpoofImpl() :
				52	fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
				53	fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
				54	UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
				55	allowedCharsSet->freeze();
				56	fAllowedCharsSet = allowedCharsSet;
				57	fAllowedLocales = uprv_strdup("");
				58	fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
				59	}
				60
				61
				62	// Copy Constructor, used by the user level clone() function.
				63	SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
				64	fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
				65	fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
				66	if (U_FAILURE(status)) {
				67	return;
				68	}
				69	fMagic = src.fMagic;
				70	fChecks = src.fChecks;
				71	if (src.fSpoofData != NULL) {
				72	fSpoofData = src.fSpoofData->addReference();
				73	}
				74	fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
				75	if (fAllowedCharsSet == NULL) {
				76	status = U_MEMORY_ALLOCATION_ERROR;
				77	}
				78	fAllowedLocales = uprv_strdup(src.fAllowedLocales);
				79	fRestrictionLevel = src.fRestrictionLevel;
				80	}
				81
				82	SpoofImpl::~SpoofImpl() {
				83	fMagic = 0; // head off application errors by preventing use of
				84	// of deleted objects.
				85	if (fSpoofData != NULL) {
				86	fSpoofData->removeReference(); // Will delete if refCount goes to zero.
				87	}
				88	delete fAllowedCharsSet;
				89	uprv_free((void *)fAllowedLocales);
				90	delete fCachedIdentifierInfo;
				91	}
				92
				93	//
				94	// Incoming parameter check on Status and the SpoofChecker object
				95	// received from the C API.
				96	//
				97	const SpoofImpl SpoofImpl::validateThis(const USpoofChecker sc, UErrorCode &status) {
				98	if (U_FAILURE(status)) {
				99	return NULL;
				100	}
				101	if (sc == NULL) {
				102	status = U_ILLEGAL_ARGUMENT_ERROR;
				103	return NULL;
				104	}
				105	SpoofImpl This = (SpoofImpl )sc;
				106	if (This->fMagic != USPOOF_MAGIC \|\|
				107	This->fSpoofData == NULL) {
				108	status = U_INVALID_FORMAT_ERROR;
				109	return NULL;
				110	}
				111	if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {
				112	return NULL;
				113	}
				114	return This;
				115	}
				116
				117	SpoofImpl SpoofImpl::validateThis(USpoofChecker sc, UErrorCode &status) {
				118	return const_cast<SpoofImpl *>
				119	(SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
				120	}
				121
				122
				123
				124	//--------------------------------------------------------------------------------------
				125	//
				126	// confusableLookup() This is the heart of the confusable skeleton generation
				127	// implementation.
				128	//
				129	// Given a source character, produce the corresponding
				130	// replacement character(s), appending them to the dest string.
				131	//
				132	//---------------------------------------------------------------------------------------
				133	int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
				134
				135	// Binary search the spoof data key table for the inChar
				136	int32_t *low = fSpoofData->fCFUKeys;
				137	int32_t *mid = NULL;
				138	int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize;
				139	UChar32 midc;
				140	do {
				141	int32_t delta = ((int32_t)(limit-low))/2;
				142	mid = low + delta;
				143	midc = *mid & 0x1fffff;
				144	if (inChar == midc) {
				145	goto foundChar;
				146	} else if (inChar < midc) {
				147	limit = mid;
				148	} else {
				149	low = mid;
				150	}
				151	} while (low < limit-1);
				152	mid = low;
				153	midc = *mid & 0x1fffff;
				154	if (inChar != midc) {
				155	// Char not found. It maps to itself.
				156	int i = 0;
				157	dest.append(inChar);
				158	return i;
				159	}
				160	foundChar:
				161	int32_t keyFlags = *mid & 0xff000000;
				162	if ((keyFlags & tableMask) == 0) {
				163	// We found the right key char, but the entry doesn't pertain to the
				164	// table we need. See if there is an adjacent key that does
				165	if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
				166	int32_t *altMid;
				167	for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {
				168	keyFlags = *altMid & 0xff000000;
				169	if (keyFlags & tableMask) {
				170	mid = altMid;
				171	goto foundKey;
				172	}
				173	}
				174	for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {
				175	keyFlags = *altMid & 0xff000000;
				176	if (keyFlags & tableMask) {
				177	mid = altMid;
				178	goto foundKey;
				179	}
				180	}
				181	}
				182	// No key entry for this char & table.
				183	// The input char maps to itself.
				184	int i = 0;
				185	dest.append(inChar);
				186	return i;
				187	}
				188
				189	foundKey:
				190	int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
				191	int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);
				192
				193	// Value is either a UChar (for strings of length 1) or
				194	// an index into the string table (for longer strings)
				195	uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
				196	if (stringLen == 1) {
				197	dest.append((UChar)value);
				198	return 1;
				199	}
				200
				201	// String length of 4 from the above lookup is used for all strings of length >= 4.
				202	// For these, get the real length from the string lengths table,
				203	// which maps string table indexes to lengths.
				204	// All strings of the same length are stored contiguously in the string table.
				205	// 'value' from the lookup above is the starting index for the desired string.
				206
				207	int32_t ix;
				208	if (stringLen == 4) {
				209	int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
				210	for (ix = 0; ix < stringLengthsLimit; ix++) {
				211	if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
				212	stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
				213	break;
				214	}
				215	}
				216	U_ASSERT(ix < stringLengthsLimit);
				217	}
				218
				219	U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
				220	UChar *src = &fSpoofData->fCFUStrings[value];
				221	dest.append(src, stringLen);
				222	return stringLen;
				223	}
				224
				225
				226	//---------------------------------------------------------------------------------------
				227	//
				228	// wholeScriptCheck()
				229	//
				230	// Input text is already normalized to NFD
				231	// Return the set of scripts, each of which can represent something that is
				232	// confusable with the input text. The script of the input text
				233	// is included; input consisting of characters from a single script will
				234	// always produce a result consisting of a set containing that script.
				235	//
				236	//---------------------------------------------------------------------------------------
				237	void SpoofImpl::wholeScriptCheck(
				238	const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
				239
				240	UTrie2 *table =
				241	(fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
				242	result->setAll();
				243	int32_t length = text.length();
				244	for (int32_t inputIdx=0; inputIdx < length;) {
				245	UChar32 c = text.char32At(inputIdx);
				246	inputIdx += U16_LENGTH(c);
				247	uint32_t index = utrie2_get32(table, c);
				248	if (index == 0) {
				249	// No confusables in another script for this char.
				250	// TODO: we should change the data to have sets with just the single script
				251	// bit for the script of this char. Gets rid of this special case.
				252	// Until then, grab the script from the char and intersect it with the set.
				253	UScriptCode cpScript = uscript_getScript(c, &status);
				254	U_ASSERT(cpScript > USCRIPT_INHERITED);
				255	result->intersect(cpScript, status);
				256	} else if (index == 1) {
				257	// Script == Common or Inherited. Nothing to do.
				258	} else {
				259	result->intersect(fSpoofData->fScriptSets[index]);
				260	}
				261	}
				262	}
				263
				264
				265	void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
				266	UnicodeSet allowedChars;
				267	UnicodeSet *tmpSet = NULL;
				268	const char *locStart = localesList;
				269	const char *locEnd = NULL;
				270	const char *localesListEnd = localesList + uprv_strlen(localesList);
				271	int32_t localeListCount = 0; // Number of locales provided by caller.
				272
				273	// Loop runs once per locale from the localesList, a comma separated list of locales.
				274	do {
				275	locEnd = uprv_strchr(locStart, ',');
				276	if (locEnd == NULL) {
				277	locEnd = localesListEnd;
				278	}
				279	while (*locStart == ' ') {
				280	locStart++;
				281	}
				282	const char *trimmedEnd = locEnd-1;
				283	while (trimmedEnd > locStart && *trimmedEnd == ' ') {
				284	trimmedEnd--;
				285	}
				286	if (trimmedEnd <= locStart) {
				287	break;
				288	}
				289	const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
				290	localeListCount++;
				291
				292	// We have one locale from the locales list.
				293	// Add the script chars for this locale to the accumulating set of allowed chars.
				294	// If the locale is no good, we will be notified back via status.
				295	addScriptChars(locale, &allowedChars, status);
				296	uprv_free((void *)locale);
				297	if (U_FAILURE(status)) {
				298	break;
				299	}
				300	locStart = locEnd + 1;
				301	} while (locStart < localesListEnd);
				302
				303	// If our caller provided an empty list of locales, we disable the allowed characters checking
				304	if (localeListCount == 0) {
				305	uprv_free((void *)fAllowedLocales);
				306	fAllowedLocales = uprv_strdup("");
				307	tmpSet = new UnicodeSet(0, 0x10ffff);
				308	if (fAllowedLocales == NULL \|\| tmpSet == NULL) {
				309	status = U_MEMORY_ALLOCATION_ERROR;
				310	return;
				311	}
				312	tmpSet->freeze();
				313	delete fAllowedCharsSet;
				314	fAllowedCharsSet = tmpSet;
				315	fChecks &= ~USPOOF_CHAR_LIMIT;
				316	return;
				317	}
				318
				319
				320	// Add all common and inherited characters to the set of allowed chars.
				321	UnicodeSet tempSet;
				322	tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
				323	allowedChars.addAll(tempSet);
				324	tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
				325	allowedChars.addAll(tempSet);
				326
				327	// If anything went wrong, we bail out without changing
				328	// the state of the spoof checker.
				329	if (U_FAILURE(status)) {
				330	return;
				331	}
				332
				333	// Store the updated spoof checker state.
				334	tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
				335	const char *tmpLocalesList = uprv_strdup(localesList);
				336	if (tmpSet == NULL \|\| tmpLocalesList == NULL) {
				337	status = U_MEMORY_ALLOCATION_ERROR;
				338	return;
				339	}
				340	uprv_free((void *)fAllowedLocales);
				341	fAllowedLocales = tmpLocalesList;
				342	tmpSet->freeze();
				343	delete fAllowedCharsSet;
				344	fAllowedCharsSet = tmpSet;
				345	fChecks \|= USPOOF_CHAR_LIMIT;
				346	}
				347
				348
				349	const char * SpoofImpl::getAllowedLocales(UErrorCode &/status/) {
				350	return fAllowedLocales;
				351	}
				352
				353
				354	// Given a locale (a language), add all the characters from all of the scripts used with that language
				355	// to the allowedChars UnicodeSet
				356
				357	void SpoofImpl::addScriptChars(const char locale, UnicodeSet allowedChars, UErrorCode &status) {
				358	UScriptCode scripts[30];
				359
				360	int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status);
				361	if (U_FAILURE(status)) {
				362	return;
				363	}
				364	if (status == U_USING_DEFAULT_WARNING) {
				365	status = U_ILLEGAL_ARGUMENT_ERROR;
				366	return;
				367	}
				368	UnicodeSet tmpSet;
				369	int32_t i;
				370	for (i=0; i<numScripts; i++) {
				371	tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
				372	allowedChars->addAll(tmpSet);
				373	}
				374	}
				375
				376
				377	// Convert a text format hex number. Utility function used by builder code. Static.
				378	// Input: UChar *string text. Output: a UChar32
				379	// Input has been pre-checked, and will have no non-hex chars.
				380	// The number must fall in the code point range of 0..0x10ffff
				381	// Static Function.
				382	UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
				383	if (U_FAILURE(status)) {
				384	return 0;
				385	}
				386	U_ASSERT(limit-start > 0);
				387	uint32_t val = 0;
				388	int i;
				389	for (i=start; i<limit; i++) {
				390	int digitVal = s[i] - 0x30;
				391	if (digitVal>9) {
				392	digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A'
				393	}
				394	if (digitVal>15) {
				395	digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a'
				396	}
				397	U_ASSERT(digitVal <= 0xf);
				398	val <<= 4;
				399	val += digitVal;
				400	}
				401	if (val > 0x10ffff) {
				402	status = U_PARSE_ERROR;
				403	val = 0;
				404	}
				405	return (UChar32)val;
				406	}
				407
				408	// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
				409	// Maintain a one-element cache, which is sufficient to avoid repeatedly
				410	// creating new ones unless we get multi-thread concurrency in spoof
				411	// check operations, which should be statistically uncommon.
				412
				413	// These functions are used in place of new & delete of an IdentifierInfo.
				414	// They will recycle the IdentifierInfo when possible.
				415	// They are logically const, and used within const functions that must be thread safe.
				416	IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
				417	IdentifierInfo *returnIdInfo = NULL;
				418	if (U_FAILURE(status)) {
				419	return returnIdInfo;
				420	}
				421	SpoofImpl nonConstThis = const_cast<SpoofImpl >(this);
				422	{
				423	Mutex m;
				424	returnIdInfo = nonConstThis->fCachedIdentifierInfo;
				425	nonConstThis->fCachedIdentifierInfo = NULL;
				426	}
				427	if (returnIdInfo == NULL) {
				428	returnIdInfo = new IdentifierInfo(status);
				429	if (U_SUCCESS(status) && returnIdInfo == NULL) {
				430	status = U_MEMORY_ALLOCATION_ERROR;
				431	}
				432	if (U_FAILURE(status) && returnIdInfo != NULL) {
				433	delete returnIdInfo;
				434	returnIdInfo = NULL;
				435	}
				436	}
				437	return returnIdInfo;
				438	}
				439
				440
				441	void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
				442	if (idInfo != NULL) {
				443	SpoofImpl nonConstThis = const_cast<SpoofImpl >(this);
				444	{
				445	Mutex m;
				446	if (nonConstThis->fCachedIdentifierInfo == NULL) {
				447	nonConstThis->fCachedIdentifierInfo = idInfo;
				448	idInfo = NULL;
				449	}
				450	}
				451	delete idInfo;
				452	}
				453	}
				454
				455
				456
				457
				458	//----------------------------------------------------------------------------------------------
				459	//
				460	// class SpoofData Implementation
				461	//
				462	//----------------------------------------------------------------------------------------------
				463
				464
				465	UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
				466	if (U_FAILURE(status) \|\|
				467	rawData == NULL \|\|
				468	rawData->fMagic != USPOOF_MAGIC \|\|
				469	rawData->fFormatVersion[0] > 1 \|\|
				470	rawData->fFormatVersion[1] > 0) {
				471	status = U_INVALID_FORMAT_ERROR;
				472	return FALSE;
				473	}
				474	return TRUE;
				475	}
				476
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	477	static UBool U_CALLCONV
				478	spoofDataIsAcceptable(void *context,
				479	const char * /* type /, const char /name/,
				480	const UDataInfo *pInfo) {
				481	if(
				482	pInfo->size >= 20 &&
				483	pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
				484	pInfo->charsetFamily == U_CHARSET_FAMILY &&
				485	pInfo->dataFormat[0] == 0x43 && // dataFormat="Cfu "
				486	pInfo->dataFormat[1] == 0x66 &&
				487	pInfo->dataFormat[2] == 0x75 &&
				488	pInfo->dataFormat[3] == 0x20 &&
				489	pInfo->formatVersion[0] == 1
				490	) {
				491	UVersionInfo version = static_cast<UVersionInfo >(context);
				492	if(version != NULL) {
				493	uprv_memcpy(version, pInfo->dataVersion, 4);
				494	}
				495	return TRUE;
				496	} else {
				497	return FALSE;
				498	}
				499	}
				500
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	501	//
				502	// SpoofData::getDefault() - return a wrapper around the spoof data that is
				503	// baked into the default ICU data.
				504	//
				505	SpoofData *SpoofData::getDefault(UErrorCode &status) {
				506	// TODO: Cache it. Lazy create, keep until cleanup.
				507
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	508	UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
				509	spoofDataIsAcceptable,
				510	NULL, // context, would receive dataVersion if supplied.
				511	&status);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	512	if (U_FAILURE(status)) {
				513	return NULL;
				514	}
				515	SpoofData *This = new SpoofData(udm, status);
				516	if (U_FAILURE(status)) {
				517	delete This;
				518	return NULL;
				519	}
				520	if (This == NULL) {
				521	status = U_MEMORY_ALLOCATION_ERROR;
				522	}
				523	return This;
				524	}
				525
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	526	SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
				527	{
				528	reset();
				529	if (U_FAILURE(status)) {
				530	return;
				531	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	532	fUDM = udm;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	533	// fRawData is non-const because it may be constructed by the data builder.
				534	fRawData = reinterpret_cast<SpoofDataHeader *>(
				535	const_cast<void *>(udata_getMemory(udm)));
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	536	validateDataVersion(fRawData, status);
				537	initPtrs(status);
				538	}
				539
				540
				541	SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
				542	{
				543	reset();
				544	if (U_FAILURE(status)) {
				545	return;
				546	}
				547	if ((size_t)length < sizeof(SpoofDataHeader)) {
				548	status = U_INVALID_FORMAT_ERROR;
				549	return;
				550	}
				551	void ncData = const_cast<void >(data);
				552	fRawData = static_cast<SpoofDataHeader *>(ncData);
				553	if (length < fRawData->fLength) {
				554	status = U_INVALID_FORMAT_ERROR;
				555	return;
				556	}
				557	validateDataVersion(fRawData, status);
				558	initPtrs(status);
				559	}
				560
				561
				562	// Spoof Data constructor for use from data builder.
				563	// Initializes a new, empty data area that will be populated later.
				564	SpoofData::SpoofData(UErrorCode &status) {
				565	reset();
				566	if (U_FAILURE(status)) {
				567	return;
				568	}
				569	fDataOwned = true;
				570	fRefCount = 1;
				571
				572	// The spoof header should already be sized to be a multiple of 16 bytes.
				573	// Just in case it's not, round it up.
				574	uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
				575	U_ASSERT(initialSize == sizeof(SpoofDataHeader));
				576
				577	fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
				578	fMemLimit = initialSize;
				579	if (fRawData == NULL) {
				580	status = U_MEMORY_ALLOCATION_ERROR;
				581	return;
				582	}
				583	uprv_memset(fRawData, 0, initialSize);
				584
				585	fRawData->fMagic = USPOOF_MAGIC;
				586	fRawData->fFormatVersion[0] = 1;
				587	fRawData->fFormatVersion[1] = 0;
				588	fRawData->fFormatVersion[2] = 0;
				589	fRawData->fFormatVersion[3] = 0;
				590	initPtrs(status);
				591	}
				592
				593	// reset() - initialize all fields.
				594	// Should be updated if any new fields are added.
				595	// Called by constructors to put things in a known initial state.
				596	void SpoofData::reset() {
				597	fRawData = NULL;
				598	fDataOwned = FALSE;
				599	fUDM = NULL;
				600	fMemLimit = 0;
				601	fRefCount = 1;
				602	fCFUKeys = NULL;
				603	fCFUValues = NULL;
				604	fCFUStringLengths = NULL;
				605	fCFUStrings = NULL;
				606	fAnyCaseTrie = NULL;
				607	fLowerCaseTrie = NULL;
				608	fScriptSets = NULL;
				609	}
				610
				611
				612	// SpoofData::initPtrs()
				613	// Initialize the pointers to the various sections of the raw data.
				614	//
				615	// This function is used both during the Trie building process (multiple
				616	// times, as the individual data sections are added), and
				617	// during the opening of a Spoof Checker from prebuilt data.
				618	//
				619	// The pointers for non-existent data sections (identified by an offset of 0)
				620	// are set to NULL.
				621	//
				622	// Note: During building the data, adding each new data section
				623	// reallocs the raw data area, which likely relocates it, which
				624	// in turn requires reinitializing all of the pointers into it, hence
				625	// multiple calls to this function during building.
				626	//
				627	void SpoofData::initPtrs(UErrorCode &status) {
				628	fCFUKeys = NULL;
				629	fCFUValues = NULL;
				630	fCFUStringLengths = NULL;
				631	fCFUStrings = NULL;
				632	if (U_FAILURE(status)) {
				633	return;
				634	}
				635	if (fRawData->fCFUKeys != 0) {
				636	fCFUKeys = (int32_t )((char )fRawData + fRawData->fCFUKeys);
				637	}
				638	if (fRawData->fCFUStringIndex != 0) {
				639	fCFUValues = (uint16_t )((char )fRawData + fRawData->fCFUStringIndex);
				640	}
				641	if (fRawData->fCFUStringLengths != 0) {
				642	fCFUStringLengths = (SpoofStringLengthsElement )((char )fRawData + fRawData->fCFUStringLengths);
				643	}
				644	if (fRawData->fCFUStringTable != 0) {
				645	fCFUStrings = (UChar )((char )fRawData + fRawData->fCFUStringTable);
				646	}
				647
				648	if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) {
				649	fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
				650	(char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
				651	}
				652	if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) {
				653	fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
				654	(char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
				655	}
				656
				657	if (fRawData->fScriptSets != 0) {
				658	fScriptSets = (ScriptSet )((char )fRawData + fRawData->fScriptSets);
				659	}
				660	}
				661
				662
				663	SpoofData::~SpoofData() {
				664	utrie2_close(fAnyCaseTrie);
				665	fAnyCaseTrie = NULL;
				666	utrie2_close(fLowerCaseTrie);
				667	fLowerCaseTrie = NULL;
				668	if (fDataOwned) {
				669	uprv_free(fRawData);
				670	}
				671	fRawData = NULL;
				672	if (fUDM != NULL) {
				673	udata_close(fUDM);
				674	}
				675	fUDM = NULL;
				676	}
				677
				678
				679	void SpoofData::removeReference() {
				680	if (umtx_atomic_dec(&fRefCount) == 0) {
				681	delete this;
				682	}
				683	}
				684
				685
				686	SpoofData *SpoofData::addReference() {
				687	umtx_atomic_inc(&fRefCount);
				688	return this;
				689	}
				690
				691
				692	void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
				693	if (U_FAILURE(status)) {
				694	return NULL;
				695	}
				696	if (!fDataOwned) {
				697	U_ASSERT(FALSE);
				698	status = U_INTERNAL_PROGRAM_ERROR;
				699	return NULL;
				700	}
				701
				702	numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16
				703	uint32_t returnOffset = fMemLimit;
				704	fMemLimit += numBytes;
				705	fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
				706	fRawData->fLength = fMemLimit;
				707	uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
				708	initPtrs(status);
				709	return (char *)fRawData + returnOffset;
				710	}
				711
				712
				713	U_NAMESPACE_END
				714
				715	U_NAMESPACE_USE
				716
				717	//-----------------------------------------------------------------------------
				718	//
				719	// uspoof_swap - byte swap and char encoding swap of spoof data
				720	//
				721	//-----------------------------------------------------------------------------
				722	U_CAPI int32_t U_EXPORT2
				723	uspoof_swap(const UDataSwapper ds, const void inData, int32_t length, void *outData,
				724	UErrorCode *status) {
				725
				726	if (status == NULL \|\| U_FAILURE(*status)) {
				727	return 0;
				728	}
				729	if(ds==NULL \|\| inData==NULL \|\| length<-1 \|\| (length>0 && outData==NULL)) {
				730	*status=U_ILLEGAL_ARGUMENT_ERROR;
				731	return 0;
				732	}
				733
				734	//
				735	// Check that the data header is for spoof data.
				736	// (Header contents are defined in gencfu.cpp)
				737	//
				738	const UDataInfo pInfo = (const UDataInfo )((const char *)inData+4);
				739	if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */
				740	pInfo->dataFormat[1]==0x66 &&
				741	pInfo->dataFormat[2]==0x75 &&
				742	pInfo->dataFormat[3]==0x20 &&
				743	pInfo->formatVersion[0]==1 )) {
				744	udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
				745	"(format version %02x %02x %02x %02x) is not recognized\n",
				746	pInfo->dataFormat[0], pInfo->dataFormat[1],
				747	pInfo->dataFormat[2], pInfo->dataFormat[3],
				748	pInfo->formatVersion[0], pInfo->formatVersion[1],
				749	pInfo->formatVersion[2], pInfo->formatVersion[3]);
				750	*status=U_UNSUPPORTED_ERROR;
				751	return 0;
				752	}
				753
				754	//
				755	// Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific
				756	// header). This swap also conveniently gets us
				757	// the size of the ICU d.h., which lets us locate the start
				758	// of the uspoof specific data.
				759	//
				760	int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
				761
				762
				763	//
				764	// Get the Spoof Data Header, and check that it appears to be OK.
				765	//
				766	//
				767	const uint8_t inBytes =(const uint8_t )inData+headerSize;
				768	SpoofDataHeader spoofDH = (SpoofDataHeader )inBytes;
				769	if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC \|\|
				770	ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader))
				771	{
				772	udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
				773	*status=U_UNSUPPORTED_ERROR;
				774	return 0;
				775	}
				776
				777	//
				778	// Prefight operation? Just return the size
				779	//
				780	int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
				781	int32_t totalSize = headerSize + spoofDataLength;
				782	if (length < 0) {
				783	return totalSize;
				784	}
				785
				786	//
				787	// Check that length passed in is consistent with length from Spoof data header.
				788	//
				789	if (length < totalSize) {
				790	udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
				791	spoofDataLength);
				792	*status=U_INDEX_OUTOFBOUNDS_ERROR;
				793	return 0;
				794	}
				795
				796
				797	//
				798	// Swap the Data. Do the data itself first, then the Spoof Data Header, because
				799	// we need to reference the header to locate the data, and an
				800	// inplace swap of the header leaves it unusable.
				801	//
				802	uint8_t outBytes = (uint8_t )outData + headerSize;
				803	SpoofDataHeader outputDH = (SpoofDataHeader )outBytes;
				804
				805	int32_t sectionStart;
				806	int32_t sectionLength;
				807
				808	//
				809	// If not swapping in place, zero out the output buffer before starting.
				810	// Gaps may exist between the individual sections, and these must be zeroed in
				811	// the output buffer. The simplest way to do that is to just zero the whole thing.
				812	//
				813	if (inBytes != outBytes) {
				814	uprv_memset(outBytes, 0, spoofDataLength);
				815	}
				816
				817	// Confusables Keys Section (fCFUKeys)
				818	sectionStart = ds->readUInt32(spoofDH->fCFUKeys);
				819	sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
				820	ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
				821
				822	// String Index Section
				823	sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex);
				824	sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
				825	ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
				826
				827	// String Table Section
				828	sectionStart = ds->readUInt32(spoofDH->fCFUStringTable);
				829	sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
				830	ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
				831
				832	// String Lengths Section
				833	sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths);
				834	sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
				835	ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
				836
				837	// Any Case Trie
				838	sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie);
				839	sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
				840	utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
				841
				842	// Lower Case Trie
				843	sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie);
				844	sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
				845	utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
				846
				847	// Script Sets. The data is an array of int32_t
				848	sectionStart = ds->readUInt32(spoofDH->fScriptSets);
				849	sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
				850	ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
				851
				852	// And, last, swap the header itself.
				853	// int32_t fMagic // swap this
				854	// uint8_t fFormatVersion[4] // Do not swap this, just copy
				855	// int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.
				856	//
				857	uint32_t magic = ds->readUInt32(spoofDH->fMagic);
				858	ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
				859
				860	if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
				861	uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
				862	}
				863	// swap starting at fLength
				864	ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
				865
				866	return totalSize;
				867	}
				868
				869	#endif
				870
				871