Blame - source/common/unames.cpp - chromium.googlesource.com/chromium/deps/icu

blob: a2035c48dfc9c6bf4a658ed4b2cdbee06bdb3776 [file] [log] [blame]

jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1	/*
				2	******************************************************************************
				3	*
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	4	* Copyright (C) 1999-2014, International Business Machines
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	5	* Corporation and others. All Rights Reserved.
				6	*
				7	******************************************************************************
				8	* file name: unames.c
				9	* encoding: US-ASCII
				10	* tab size: 8 (not used)
				11	* indentation:4
				12	*
				13	* created on: 1999oct04
				14	* created by: Markus W. Scherer
				15	*/
				16
				17	#include "unicode/utypes.h"
				18	#include "unicode/putil.h"
				19	#include "unicode/uchar.h"
				20	#include "unicode/udata.h"
				21	#include "unicode/utf.h"
				22	#include "unicode/utf16.h"
				23	#include "uassert.h"
				24	#include "ustr_imp.h"
				25	#include "umutex.h"
				26	#include "cmemory.h"
				27	#include "cstring.h"
				28	#include "ucln_cmn.h"
				29	#include "udataswp.h"
				30	#include "uprops.h"
				31
				32	U_NAMESPACE_BEGIN
				33
				34	/* prototypes ------------------------------------------------------------- */
				35
/* Item name and type passed to udata_openChoice() when the names data is loaded. */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";

/*
 * A code point is split into a group number (code >> GROUP_SHIFT) and a
 * line number inside the group (code & GROUP_MASK).
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1L << GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP - 1)
				42
				43	/*
				44	* This struct was replaced by explicitly accessing equivalent
				45	* fields from triples of uint16_t.
				46	* The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
				47	* which broke the assumption that sizeof(Group)==6 and that the ++ operator
				48	* would advance by 6 bytes (3 uint16_t).
				49	*
				50	* We can't just change the data structure because it's loaded from a data file,
				51	* and we don't want to make it less compact, so we changed the access code.
				52	*
				53	* For details see ICU tickets 6331 and 6008.
				54	typedef struct {
				55	uint16_t groupMSB,
				56	offsetHigh, offsetLow; / * avoid padding * /
				57	} Group;
				58	*/
				/* Indexes of the uint16_t fields inside one group entry. */
				enum {
				    GROUP_MSB,           /* code point >> GROUP_SHIFT for this group */
				    GROUP_OFFSET_HIGH,   /* upper 16 bits of the group offset */
				    GROUP_OFFSET_LOW,    /* lower 16 bits of the group offset */
				    GROUP_LENGTH         /* number of uint16_t per group entry */
				};

				/*
				 * Get the 32-bit group offset.
				 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
				 * @return group offset (int32_t)
				 */
				#define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW])

				/* Step forward / backward by one group entry (GROUP_LENGTH uint16_t). */
				#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
				#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
				75
				/* Header of one algorithmic-name range; range-specific data follows it in memory. */
				typedef struct {
				    uint32_t start;    /* first code point; getAlgName() uses code-start for type 1 */
				    uint32_t end;      /* presumably the last code point of the range - not read in the visible code */
				    uint8_t type;      /* 0: prefix + hex digits, 1: prefix + factorized elements */
				    uint8_t variant;   /* type 0: number of hex digits, type 1: number of factors */
				    uint16_t size;     /* NOTE(review): presumably the byte size of this range record - confirm */
				} AlgorithmicRange;

				/*
				 * Index header at the start of the names data.
				 * tokenStringOffset, groupsOffset and groupStringOffset are used as byte
				 * offsets from the start of this struct by the code in this file.
				 */
				typedef struct {
				    uint32_t tokenStringOffset;
				    uint32_t groupsOffset;
				    uint32_t groupStringOffset;
				    uint32_t algNamesOffset;   /* not referenced in the visible part of the file */
				} UCharNames;
				85
				86	/*
				87	* Get the groups table from a UCharNames struct.
				88	* The groups table consists of one uint16_t groupCount followed by
				89	* groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
				90	* and the comment for the old struct Group above.
				91	*
				92	* @param names (const UCharNames *) pointer to the UCharNames indexes
				93	* @return (const uint16_t *) pointer to the groups table
				94	*/
				/* The casts need pointer types; the argument and the expansion are parenthesized for macro hygiene. */
				#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
				96
				97	typedef struct {
				98	const char *otherName;
				99	UChar32 code;
				100	} FindName;
				101
				102	#define DO_FIND_NAME NULL
				103
				104	static UDataMemory *uCharNamesData=NULL;
				105	static UCharNames *uCharNames=NULL;
				106	static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER;
				107
				108	/*
				109	* Maximum length of character names (regular & 1.0).
				110	*/
				111	static int32_t gMaxNameLength=0;
				112
				113	/*
				114	* Set of chars used in character names (regular & 1.0).
				115	* Chars are platform-dependent (can be EBCDIC).
				116	*/
				117	static uint32_t gNameSet[8]={ 0 };
				118
				/*
				 * Extra pseudo-categories appended after the regular general categories;
				 * they index the last three entries of charCatNames.
				 * Each expansion is parenthesized so it stays a single operand in any expression.
				 */
				#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
				#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
				#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

				#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
				124
				125	static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
				126	"unassigned",
				127	"uppercase letter",
				128	"lowercase letter",
				129	"titlecase letter",
				130	"modifier letter",
				131	"other letter",
				132	"non spacing mark",
				133	"enclosing mark",
				134	"combining spacing mark",
				135	"decimal digit number",
				136	"letter number",
				137	"other number",
				138	"space separator",
				139	"line separator",
				140	"paragraph separator",
				141	"control",
				142	"format",
				143	"private use area",
				144	"surrogate",
				145	"dash punctuation",
				146	"start punctuation",
				147	"end punctuation",
				148	"connector punctuation",
				149	"other punctuation",
				150	"math symbol",
				151	"currency symbol",
				152	"modifier symbol",
				153	"other symbol",
				154	"initial punctuation",
				155	"final punctuation",
				156	"noncharacter",
				157	"lead surrogate",
				158	"trail surrogate"
				159	};
				160
				161	/* implementation ----------------------------------------------------------- */
				162
				163	static UBool U_CALLCONV unames_cleanup(void)
				164	{
				165	if(uCharNamesData) {
				166	udata_close(uCharNamesData);
				167	uCharNamesData = NULL;
				168	}
				169	if(uCharNames) {
				170	uCharNames = NULL;
				171	}
				172	gCharNamesInitOnce.reset();
				173	gMaxNameLength=0;
				174	return TRUE;
				175	}
				176
				177	static UBool U_CALLCONV
				178	isAcceptable(void * /context/,
				179	const char * /type/, const char * /name/,
				180	const UDataInfo *pInfo) {
				181	return (UBool)(
				182	pInfo->size>=20 &&
				183	pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
				184	pInfo->charsetFamily==U_CHARSET_FAMILY &&
				185	pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
				186	pInfo->dataFormat[1]==0x6e &&
				187	pInfo->dataFormat[2]==0x61 &&
				188	pInfo->dataFormat[3]==0x6d &&
				189	pInfo->formatVersion[0]==1);
				190	}
				191
				192	static void U_CALLCONV
				193	loadCharNames(UErrorCode &status) {
				194	U_ASSERT(uCharNamesData == NULL);
				195	U_ASSERT(uCharNames == NULL);
				196
				197	uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status);
				198	if(U_FAILURE(status)) {
				199	uCharNamesData = NULL;
				200	} else {
				201	uCharNames = (UCharNames *)udata_getMemory(uCharNamesData);
				202	}
				203	ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
				204	}
				205
				206
				207	static UBool
				208	isDataLoaded(UErrorCode *pErrorCode) {
				209	umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode);
				210	return U_SUCCESS(*pErrorCode);
				211	}
				212
				/*
				 * Append one char to an output buffer with preflighting semantics:
				 * the char is stored only while capacity remains, but bufferPos is
				 * always incremented so that it ends up as the full required length.
				 * buffer, bufferLength and bufferPos must be modifiable lvalues.
				 * The value argument is parenthesized so that any expression can be passed.
				 */
				#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
				    if((bufferLength)>0) { \
				        *(buffer)++=(c); \
				        --(bufferLength); \
				    } \
				    ++(bufferPos); \
				}

				/* Internal name choice value one past the public UCharNameChoice values. */
				#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
				222
				223	/*
				224	* Important: expandName() and compareName() are almost the same -
				225	* apply fixes to both.
				226	*
				227	* UnicodeData.txt uses ';' as a field separator, so no
				228	* field can contain ';' as part of its contents.
				229	* In unames.dat, it is marked as token[';']==-1 only if the
				230	* semicolon is used in the data file - which is iff we
				231	* have Unicode 1.0 names or ISO comments or aliases.
				232	* So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
				233	* although we know that it will never be part of a name.
				234	*/
				235	static uint16_t
				236	expandName(UCharNames *names,
				237	const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
				238	char *buffer, uint16_t bufferLength) {
				239	uint16_t tokens=(uint16_t )names+8;
				240	uint16_t token, tokenCount=*tokens++, bufferPos=0;
				241	uint8_t tokenStrings=(uint8_t )names+names->tokenStringOffset;
				242	uint8_t c;
				243
				244	if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
				245	/*
				246	* skip the modern name if it is not requested _and_
				247	* if the semicolon byte value is a character, not a token number
				248	*/
				249	if((uint8_t)';'>=tokenCount \|\| tokens[(uint8_t)';']==(uint16_t)(-1)) {
				250	int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
				251	do {
				252	while(nameLength>0) {
				253	--nameLength;
				254	if(*name++==';') {
				255	break;
				256	}
				257	}
				258	} while(--fieldIndex>0);
				259	} else {
				260	/*
				261	* the semicolon byte value is a token number, therefore
				262	* only modern names are stored in unames.dat and there is no
				263	* such requested alternate name here
				264	*/
				265	nameLength=0;
				266	}
				267	}
				268
				269	/* write each letter directly, and write a token word per token */
				270	while(nameLength>0) {
				271	--nameLength;
				272	c=*name++;
				273
				274	if(c>=tokenCount) {
				275	if(c!=';') {
				276	/* implicit letter */
				277	WRITE_CHAR(buffer, bufferLength, bufferPos, c);
				278	} else {
				279	/* finished */
				280	break;
				281	}
				282	} else {
				283	token=tokens[c];
				284	if(token==(uint16_t)(-2)) {
				285	/* this is a lead byte for a double-byte token */
				286	token=tokens[c<<8\|*name++];
				287	--nameLength;
				288	}
				289	if(token==(uint16_t)(-1)) {
				290	if(c!=';') {
				291	/* explicit letter */
				292	WRITE_CHAR(buffer, bufferLength, bufferPos, c);
				293	} else {
				294	/* stop, but skip the semicolon if we are seeking
				295	extended names and there was no 2.0 name but there
				296	is a 1.0 name. */
				297	if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
				298	if ((uint8_t)';'>=tokenCount \|\| tokens[(uint8_t)';']==(uint16_t)(-1)) {
				299	continue;
				300	}
				301	}
				302	/* finished */
				303	break;
				304	}
				305	} else {
				306	/* write token word */
				307	uint8_t *tokenString=tokenStrings+token;
				308	while((c=*tokenString++)!=0) {
				309	WRITE_CHAR(buffer, bufferLength, bufferPos, c);
				310	}
				311	}
				312	}
				313	}
				314
				315	/* zero-terminate */
				316	if(bufferLength>0) {
				317	*buffer=0;
				318	}
				319
				320	return bufferPos;
				321	}
				322
				323	/*
				324	* compareName() is almost the same as expandName() except that it compares
				325	* the currently expanded name to an input name.
				326	* It returns the match/no match result as soon as possible.
				327	*/
				328	static UBool
				329	compareName(UCharNames *names,
				330	const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
				331	const char *otherName) {
				332	uint16_t tokens=(uint16_t )names+8;
				333	uint16_t token, tokenCount=*tokens++;
				334	uint8_t tokenStrings=(uint8_t )names+names->tokenStringOffset;
				335	uint8_t c;
				336	const char *origOtherName = otherName;
				337
				338	if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
				339	/*
				340	* skip the modern name if it is not requested _and_
				341	* if the semicolon byte value is a character, not a token number
				342	*/
				343	if((uint8_t)';'>=tokenCount \|\| tokens[(uint8_t)';']==(uint16_t)(-1)) {
				344	int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
				345	do {
				346	while(nameLength>0) {
				347	--nameLength;
				348	if(*name++==';') {
				349	break;
				350	}
				351	}
				352	} while(--fieldIndex>0);
				353	} else {
				354	/*
				355	* the semicolon byte value is a token number, therefore
				356	* only modern names are stored in unames.dat and there is no
				357	* such requested alternate name here
				358	*/
				359	nameLength=0;
				360	}
				361	}
				362
				363	/* compare each letter directly, and compare a token word per token */
				364	while(nameLength>0) {
				365	--nameLength;
				366	c=*name++;
				367
				368	if(c>=tokenCount) {
				369	if(c!=';') {
				370	/* implicit letter */
				371	if((char)c!=*otherName++) {
				372	return FALSE;
				373	}
				374	} else {
				375	/* finished */
				376	break;
				377	}
				378	} else {
				379	token=tokens[c];
				380	if(token==(uint16_t)(-2)) {
				381	/* this is a lead byte for a double-byte token */
				382	token=tokens[c<<8\|*name++];
				383	--nameLength;
				384	}
				385	if(token==(uint16_t)(-1)) {
				386	if(c!=';') {
				387	/* explicit letter */
				388	if((char)c!=*otherName++) {
				389	return FALSE;
				390	}
				391	} else {
				392	/* stop, but skip the semicolon if we are seeking
				393	extended names and there was no 2.0 name but there
				394	is a 1.0 name. */
				395	if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
				396	if ((uint8_t)';'>=tokenCount \|\| tokens[(uint8_t)';']==(uint16_t)(-1)) {
				397	continue;
				398	}
				399	}
				400	/* finished */
				401	break;
				402	}
				403	} else {
				404	/* write token word */
				405	uint8_t *tokenString=tokenStrings+token;
				406	while((c=*tokenString++)!=0) {
				407	if((char)c!=*otherName++) {
				408	return FALSE;
				409	}
				410	}
				411	}
				412	}
				413	}
				414
				415	/* complete match? */
				416	return (UBool)(*otherName==0);
				417	}
				418
				419	static uint8_t getCharCat(UChar32 cp) {
				420	uint8_t cat;
				421
				422	if (U_IS_UNICODE_NONCHAR(cp)) {
				423	return U_NONCHARACTER_CODE_POINT;
				424	}
				425
				426	if ((cat = u_charType(cp)) == U_SURROGATE) {
				427	cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
				428	}
				429
				430	return cat;
				431	}
				432
				433	static const char *getCharCatName(UChar32 cp) {
				434	uint8_t cat = getCharCat(cp);
				435
				436	/* Return unknown if the table of names above is not up to
				437	date. */
				438
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	439	if (cat >= UPRV_LENGTHOF(charCatNames)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	440	return "unknown";
				441	} else {
				442	return charCatNames[cat];
				443	}
				444	}
				445
				446	static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
				447	const char *catname = getCharCatName(code);
				448	uint16_t length = 0;
				449
				450	UChar32 cp;
				451	int ndigits, i;
				452
				453	WRITE_CHAR(buffer, bufferLength, length, '<');
				454	while (catname[length - 1]) {
				455	WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
				456	}
				457	WRITE_CHAR(buffer, bufferLength, length, '-');
				458	for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
				459	;
				460	if (ndigits < 4)
				461	ndigits = 4;
				462	for (cp = code, i = ndigits; (cp \|\| i > 0) && bufferLength; cp >>= 4, bufferLength--) {
				463	uint8_t v = (uint8_t)(cp & 0xf);
				464	buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
				465	}
				466	buffer += ndigits;
				467	length += ndigits;
				468	WRITE_CHAR(buffer, bufferLength, length, '>');
				469
				470	return length;
				471	}
				472
				473	/*
				474	* getGroup() does a binary search for the group that contains the
				475	* Unicode code point "code".
				476	* The return value is always a valid Group* that may contain "code"
				477	* or else is the highest group before "code".
				478	* If the lowest group is after "code", then that one is returned.
				479	*/
				480	static const uint16_t *
				481	getGroup(UCharNames *names, uint32_t code) {
				482	const uint16_t *groups=GET_GROUPS(names);
				483	uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
				484	start=0,
				485	limit=*groups++,
				486	number;
				487
				488	/* binary search for the group of names that contains the one for code */
				489	while(start<limit-1) {
				490	number=(uint16_t)((start+limit)/2);
				491	if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
				492	limit=number;
				493	} else {
				494	start=number;
				495	}
				496	}
				497
				498	/* return this regardless of whether it is an exact match */
				499	return groups+start*GROUP_LENGTH;
				500	}
				501
				502	/*
				503	* expandGroupLengths() reads a block of compressed lengths of 32 strings and
				504	* expands them into offsets and lengths for each string.
				505	* Lengths are stored with a variable-width encoding in consecutive nibbles:
				506	* If a nibble<0xc, then it is the length itself (0=empty string).
				507	* If a nibble>=0xc, then it forms a length value with the following nibble.
				508	* Calculation see below.
				509	* The offsets and lengths arrays must be at least 33 (one more) long because
				510	* there is no check here at the end if the last nibble is still used.
				511	*/
				512	static const uint8_t *
				513	expandGroupLengths(const uint8_t *s,
				514	uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
				515	/* read the lengths of the 32 strings in this group and get each string's offset */
				516	uint16_t i=0, offset=0, length=0;
				517	uint8_t lengthByte;
				518
				519	/* all 32 lengths must be read to get the offset of the first group string */
				520	while(i<LINES_PER_GROUP) {
				521	lengthByte=*s++;
				522
				523	/* read even nibble - MSBs of lengthByte */
				524	if(length>=12) {
				525	/* double-nibble length spread across two bytes */
				526	length=(uint16_t)(((length&0x3)<<4\|lengthByte>>4)+12);
				527	lengthByte&=0xf;
				528	} else if((lengthByte /* &0xf0 */)>=0xc0) {
				529	/* double-nibble length spread across this one byte */
				530	length=(uint16_t)((lengthByte&0x3f)+12);
				531	} else {
				532	/* single-nibble length in MSBs */
				533	length=(uint16_t)(lengthByte>>4);
				534	lengthByte&=0xf;
				535	}
				536
				537	*offsets++=offset;
				538	*lengths++=length;
				539
				540	offset+=length;
				541	++i;
				542
				543	/* read odd nibble - LSBs of lengthByte */
				544	if((lengthByte&0xf0)==0) {
				545	/* this nibble was not consumed for a double-nibble length above */
				546	length=lengthByte;
				547	if(length<12) {
				548	/* single-nibble length in LSBs */
				549	*offsets++=offset;
				550	*lengths++=length;
				551
				552	offset+=length;
				553	++i;
				554	}
				555	} else {
				556	length=0; /* prevent double-nibble detection in the next iteration */
				557	}
				558	}
				559
				560	/* now, s is at the first group string */
				561	return s;
				562	}
				563
				564	static uint16_t
				565	expandGroupName(UCharNames names, const uint16_t group,
				566	uint16_t lineNumber, UCharNameChoice nameChoice,
				567	char *buffer, uint16_t bufferLength) {
				568	uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
				569	const uint8_t s=(uint8_t )names+names->groupStringOffset+GET_GROUP_OFFSET(group);
				570	s=expandGroupLengths(s, offsets, lengths);
				571	return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
				572	buffer, bufferLength);
				573	}
				574
				575	static uint16_t
				576	getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
				577	char *buffer, uint16_t bufferLength) {
				578	const uint16_t *group=getGroup(names, code);
				579	if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
				580	return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
				581	buffer, bufferLength);
				582	} else {
				583	/* group not found */
				584	/* zero-terminate */
				585	if(bufferLength>0) {
				586	*buffer=0;
				587	}
				588	return 0;
				589	}
				590	}
				591
				592	/*
				593	* enumGroupNames() enumerates all the names in a 32-group
				594	* and either calls the enumerator function or finds a given input name.
				595	*/
				596	static UBool
				597	enumGroupNames(UCharNames names, const uint16_t group,
				598	UChar32 start, UChar32 end,
				599	UEnumCharNamesFn fn, void context,
				600	UCharNameChoice nameChoice) {
				601	uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
				602	const uint8_t s=(uint8_t )names+names->groupStringOffset+GET_GROUP_OFFSET(group);
				603
				604	s=expandGroupLengths(s, offsets, lengths);
				605	if(fn!=DO_FIND_NAME) {
				606	char buffer[200];
				607	uint16_t length;
				608
				609	while(start<=end) {
				610	length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
				611	if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
				612	buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
				613	}
				614	/* here, we assume that the buffer is large enough */
				615	if(length>0) {
				616	if(!fn(context, start, nameChoice, buffer, length)) {
				617	return FALSE;
				618	}
				619	}
				620	++start;
				621	}
				622	} else {
				623	const char otherName=((FindName )context)->otherName;
				624	while(start<=end) {
				625	if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
				626	((FindName *)context)->code=start;
				627	return FALSE;
				628	}
				629	++start;
				630	}
				631	}
				632	return TRUE;
				633	}
				634
				635	/*
				636	* enumExtNames enumerate extended names.
				637	* It only needs to do it if it is called with a real function and not
				638	* with the dummy DO_FIND_NAME, because u_charFromName() does a check
				639	* for extended names by itself.
				640	*/
				641	static UBool
				642	enumExtNames(UChar32 start, UChar32 end,
				643	UEnumCharNamesFn fn, void context)
				644	{
				645	if(fn!=DO_FIND_NAME) {
				646	char buffer[200];
				647	uint16_t length;
				648
				649	while(start<=end) {
				650	buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
				651	/* here, we assume that the buffer is large enough */
				652	if(length>0) {
				653	if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
				654	return FALSE;
				655	}
				656	}
				657	++start;
				658	}
				659	}
				660
				661	return TRUE;
				662	}
				663
				664	static UBool
				665	enumNames(UCharNames *names,
				666	UChar32 start, UChar32 limit,
				667	UEnumCharNamesFn fn, void context,
				668	UCharNameChoice nameChoice) {
				669	uint16_t startGroupMSB, endGroupMSB, groupCount;
				670	const uint16_t group, groupLimit;
				671
				672	startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
				673	endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);
				674
				675	/* find the group that contains start, or the highest before it */
				676	group=getGroup(names, start);
				677
				678	if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
				679	/* enumerate synthetic names between start and the group start */
				680	UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT);
				681	if(extLimit>limit) {
				682	extLimit=limit;
				683	}
				684	if(!enumExtNames(start, extLimit-1, fn, context)) {
				685	return FALSE;
				686	}
				687	start=extLimit;
				688	}
				689
				690	if(startGroupMSB==endGroupMSB) {
				691	if(startGroupMSB==group[GROUP_MSB]) {
				692	/* if start and limit-1 are in the same group, then enumerate only in that one */
				693	return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
				694	}
				695	} else {
				696	const uint16_t *groups=GET_GROUPS(names);
				697	groupCount=*groups++;
				698	groupLimit=groups+groupCount*GROUP_LENGTH;
				699
				700	if(startGroupMSB==group[GROUP_MSB]) {
				701	/* enumerate characters in the partial start group */
				702	if((start&GROUP_MASK)!=0) {
				703	if(!enumGroupNames(names, group,
				704	start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
				705	fn, context, nameChoice)) {
				706	return FALSE;
				707	}
				708	group=NEXT_GROUP(group); /* continue with the next group */
				709	}
				710	} else if(startGroupMSB>group[GROUP_MSB]) {
				711	/* make sure that we start enumerating with the first group after start */
				712	const uint16_t *nextGroup=NEXT_GROUP(group);
				713	if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
				714	UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
				715	if (end > limit) {
				716	end = limit;
				717	}
				718	if (!enumExtNames(start, end - 1, fn, context)) {
				719	return FALSE;
				720	}
				721	}
				722	group=nextGroup;
				723	}
				724
				725	/* enumerate entire groups between the start- and end-groups */
				726	while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
				727	const uint16_t *nextGroup;
				728	start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
				729	if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
				730	return FALSE;
				731	}
				732	nextGroup=NEXT_GROUP(group);
				733	if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
				734	UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
				735	if (end > limit) {
				736	end = limit;
				737	}
				738	if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
				739	return FALSE;
				740	}
				741	}
				742	group=nextGroup;
				743	}
				744
				745	/* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
				746	if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
				747	return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
				748	} else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
				749	UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
				750	if (next > start) {
				751	start = next;
				752	}
				753	} else {
				754	return TRUE;
				755	}
				756	}
				757
				758	/* we have not found a group, which means everything is made of
				759	extended names. */
				760	if (nameChoice == U_EXTENDED_CHAR_NAME) {
				761	if (limit > UCHAR_MAX_VALUE + 1) {
				762	limit = UCHAR_MAX_VALUE + 1;
				763	}
				764	return enumExtNames(start, limit - 1, fn, context);
				765	}
				766
				767	return TRUE;
				768	}
				769
				770	static uint16_t
				771	writeFactorSuffix(const uint16_t *factors, uint16_t count,
				772	const char s, / suffix elements */
				773	uint32_t code,
				774	uint16_t indexes[8], /* output fields from here */
				775	const char elementBases[8], const char elements[8],
				776	char *buffer, uint16_t bufferLength) {
				777	uint16_t i, factor, bufferPos=0;
				778	char c;
				779
				780	/* write elements according to the factors */
				781
				782	/*
				783	* the factorized elements are determined by modulo arithmetic
				784	* with the factors of this algorithm
				785	*
				786	* note that for fewer operations, count is decremented here
				787	*/
				788	--count;
				789	for(i=count; i>0; --i) {
				790	factor=factors[i];
				791	indexes[i]=(uint16_t)(code%factor);
				792	code/=factor;
				793	}
				794	/*
				795	* we don't need to calculate the last modulus because start<=code<=end
				796	* guarantees here that code<=factors[0]
				797	*/
				798	indexes[0]=(uint16_t)code;
				799
				800	/* write each element */
				801	for(;;) {
				802	if(elementBases!=NULL) {
				803	*elementBases++=s;
				804	}
				805
				806	/* skip indexes[i] strings */
				807	factor=indexes[i];
				808	while(factor>0) {
				809	while(*s++!=0) {}
				810	--factor;
				811	}
				812	if(elements!=NULL) {
				813	*elements++=s;
				814	}
				815
				816	/* write element */
				817	while((c=*s++)!=0) {
				818	WRITE_CHAR(buffer, bufferLength, bufferPos, c);
				819	}
				820
				821	/* we do not need to perform the rest of this loop for i==count - break here */
				822	if(i>=count) {
				823	break;
				824	}
				825
				826	/* skip the rest of the strings for this factors[i] */
				827	factor=(uint16_t)(factors[i]-indexes[i]-1);
				828	while(factor>0) {
				829	while(*s++!=0) {}
				830	--factor;
				831	}
				832
				833	++i;
				834	}
				835
				836	/* zero-terminate */
				837	if(bufferLength>0) {
				838	*buffer=0;
				839	}
				840
				841	return bufferPos;
				842	}
				843
				844	/*
				845	* Important:
				846	* Parts of findAlgName() are almost the same as some of getAlgName().
				847	* Fixes must be applied to both.
				848	*/
				849	static uint16_t
				850	getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
				851	char *buffer, uint16_t bufferLength) {
				852	uint16_t bufferPos=0;
				853
				854	/* Only the normative character name can be algorithmic. */
				855	if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
				856	/* zero-terminate */
				857	if(bufferLength>0) {
				858	*buffer=0;
				859	}
				860	return 0;
				861	}
				862
				863	switch(range->type) {
				864	case 0: {
				865	/* name = prefix hex-digits */
				866	const char s=(const char )(range+1);
				867	char c;
				868
				869	uint16_t i, count;
				870
				871	/* copy prefix */
				872	while((c=*s++)!=0) {
				873	WRITE_CHAR(buffer, bufferLength, bufferPos, c);
				874	}
				875
				876	/* write hexadecimal code point value */
				877	count=range->variant;
				878
				879	/* zero-terminate */
				880	if(count<bufferLength) {
				881	buffer[count]=0;
				882	}
				883
				884	for(i=count; i>0;) {
				885	if(--i<bufferLength) {
				886	c=(char)(code&0xf);
				887	if(c<10) {
				888	c+='0';
				889	} else {
				890	c+='A'-10;
				891	}
				892	buffer[i]=c;
				893	}
				894	code>>=4;
				895	}
				896
				897	bufferPos+=count;
				898	break;
				899	}
				900	case 1: {
				901	/* name = prefix factorized-elements */
				902	uint16_t indexes[8];
				903	const uint16_t factors=(const uint16_t )(range+1);
				904	uint16_t count=range->variant;
				905	const char s=(const char )(factors+count);
				906	char c;
				907
				908	/* copy prefix */
				909	while((c=*s++)!=0) {
				910	WRITE_CHAR(buffer, bufferLength, bufferPos, c);
				911	}
				912
				913	bufferPos+=writeFactorSuffix(factors, count,
				914	s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
				915	break;
				916	}
				917	default:
				918	/* undefined type */
				919	/* zero-terminate */
				920	if(bufferLength>0) {
				921	*buffer=0;
				922	}
				923	break;
				924	}
				925
				926	return bufferPos;
				927	}
				928
				929	/*
				930	* Important: enumAlgNames() and findAlgName() are almost the same.
				931	* Any fix must be applied to both.
				932	*/
				933	static UBool
				934	enumAlgNames(AlgorithmicRange *range,
				935	UChar32 start, UChar32 limit,
				936	UEnumCharNamesFn fn, void context,
				937	UCharNameChoice nameChoice) {
				938	char buffer[200];
				939	uint16_t length;
				940
				941	if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
				942	return TRUE;
				943	}
				944
				945	switch(range->type) {
				946	case 0: {
				947	char s, end;
				948	char c;
				949
				950	/* get the full name of the start character */
				951	length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
				952	if(length<=0) {
				953	return TRUE;
				954	}
				955
				956	/* call the enumerator function with this first character */
				957	if(!fn(context, start, nameChoice, buffer, length)) {
				958	return FALSE;
				959	}
				960
				961	/* go to the end of the name; all these names have the same length */
				962	end=buffer;
				963	while(*end!=0) {
				964	++end;
				965	}
				966
				967	/* enumerate the rest of the names */
				968	while(++start<limit) {
				969	/* increment the hexadecimal number on a character-basis */
				970	s=end;
				971	for (;;) {
				972	c=*--s;
				973	if(('0'<=c && c<'9') \|\| ('A'<=c && c<'F')) {
				974	*s=(char)(c+1);
				975	break;
				976	} else if(c=='9') {
				977	*s='A';
				978	break;
				979	} else if(c=='F') {
				980	*s='0';
				981	}
				982	}
				983
				984	if(!fn(context, start, nameChoice, buffer, length)) {
				985	return FALSE;
				986	}
				987	}
				988	break;
				989	}
				990	case 1: {
				991	uint16_t indexes[8];
				992	const char elementBases[8], elements[8];
				993	const uint16_t factors=(const uint16_t )(range+1);
				994	uint16_t count=range->variant;
				995	const char s=(const char )(factors+count);
				996	char suffix, t;
				997	uint16_t prefixLength, i, idx;
				998
				999	char c;
				1000
				1001	/* name = prefix factorized-elements */
				1002
				1003	/* copy prefix */
				1004	suffix=buffer;
				1005	prefixLength=0;
				1006	while((c=*s++)!=0) {
				1007	*suffix++=c;
				1008	++prefixLength;
				1009	}
				1010
				1011	/* append the suffix of the start character */
				1012	length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
				1013	s, (uint32_t)start-range->start,
				1014	indexes, elementBases, elements,
				1015	suffix, (uint16_t)(sizeof(buffer)-prefixLength)));
				1016
				1017	/* call the enumerator function with this first character */
				1018	if(!fn(context, start, nameChoice, buffer, length)) {
				1019	return FALSE;
				1020	}
				1021
				1022	/* enumerate the rest of the names */
				1023	while(++start<limit) {
				1024	/* increment the indexes in lexical order bound by the factors */
				1025	i=count;
				1026	for (;;) {
				1027	idx=(uint16_t)(indexes[--i]+1);
				1028	if(idx<factors[i]) {
				1029	/* skip one index and its element string */
				1030	indexes[i]=idx;
				1031	s=elements[i];
				1032	while(*s++!=0) {
				1033	}
				1034	elements[i]=s;
				1035	break;
				1036	} else {
				1037	/* reset this index to 0 and its element string to the first one */
				1038	indexes[i]=0;
				1039	elements[i]=elementBases[i];
				1040	}
				1041	}
				1042
				1043	/* to make matters a little easier, just append all elements to the suffix */
				1044	t=suffix;
				1045	length=prefixLength;
				1046	for(i=0; i<count; ++i) {
				1047	s=elements[i];
				1048	while((c=*s++)!=0) {
				1049	*t++=c;
				1050	++length;
				1051	}
				1052	}
				1053	/* zero-terminate */
				1054	*t=0;
				1055
				1056	if(!fn(context, start, nameChoice, buffer, length)) {
				1057	return FALSE;
				1058	}
				1059	}
				1060	break;
				1061	}
				1062	default:
				1063	/* undefined type */
				1064	break;
				1065	}
				1066
				1067	return TRUE;
				1068	}
				1069
				1070	/*
				1071	* findAlgName() is almost the same as enumAlgNames() except that it
				1072	* returns the code point for a name if it fits into the range.
				1073	* It returns 0xffff otherwise.
				1074	*/
				1075	static UChar32
				1076	findAlgName(AlgorithmicRange range, UCharNameChoice nameChoice, const char otherName) {
				1077	UChar32 code;
				1078
				1079	if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
				1080	return 0xffff;
				1081	}
				1082
				1083	switch(range->type) {
				1084	case 0: {
				1085	/* name = prefix hex-digits */
				1086	const char s=(const char )(range+1);
				1087	char c;
				1088
				1089	uint16_t i, count;
				1090
				1091	/* compare prefix */
				1092	while((c=*s++)!=0) {
				1093	if((char)c!=*otherName++) {
				1094	return 0xffff;
				1095	}
				1096	}
				1097
				1098	/* read hexadecimal code point value */
				1099	count=range->variant;
				1100	code=0;
				1101	for(i=0; i<count; ++i) {
				1102	c=*otherName++;
				1103	if('0'<=c && c<='9') {
				1104	code=(code<<4)\|(c-'0');
				1105	} else if('A'<=c && c<='F') {
				1106	code=(code<<4)\|(c-'A'+10);
				1107	} else {
				1108	return 0xffff;
				1109	}
				1110	}
				1111
				1112	/* does it fit into the range? */
				1113	if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
				1114	return code;
				1115	}
				1116	break;
				1117	}
				1118	case 1: {
				1119	char buffer[64];
				1120	uint16_t indexes[8];
				1121	const char elementBases[8], elements[8];
				1122	const uint16_t factors=(const uint16_t )(range+1);
				1123	uint16_t count=range->variant;
				1124	const char s=(const char )(factors+count), *t;
				1125	UChar32 start, limit;
				1126	uint16_t i, idx;
				1127
				1128	char c;
				1129
				1130	/* name = prefix factorized-elements */
				1131
				1132	/* compare prefix */
				1133	while((c=*s++)!=0) {
				1134	if((char)c!=*otherName++) {
				1135	return 0xffff;
				1136	}
				1137	}
				1138
				1139	start=(UChar32)range->start;
				1140	limit=(UChar32)(range->end+1);
				1141
				1142	/* initialize the suffix elements for enumeration; indexes should all be set to 0 */
				1143	writeFactorSuffix(factors, count, s, 0,
				1144	indexes, elementBases, elements, buffer, sizeof(buffer));
				1145
				1146	/* compare the first suffix */
				1147	if(0==uprv_strcmp(otherName, buffer)) {
				1148	return start;
				1149	}
				1150
				1151	/* enumerate and compare the rest of the suffixes */
				1152	while(++start<limit) {
				1153	/* increment the indexes in lexical order bound by the factors */
				1154	i=count;
				1155	for (;;) {
				1156	idx=(uint16_t)(indexes[--i]+1);
				1157	if(idx<factors[i]) {
				1158	/* skip one index and its element string */
				1159	indexes[i]=idx;
				1160	s=elements[i];
				1161	while(*s++!=0) {}
				1162	elements[i]=s;
				1163	break;
				1164	} else {
				1165	/* reset this index to 0 and its element string to the first one */
				1166	indexes[i]=0;
				1167	elements[i]=elementBases[i];
				1168	}
				1169	}
				1170
				1171	/* to make matters a little easier, just compare all elements of the suffix */
				1172	t=otherName;
				1173	for(i=0; i<count; ++i) {
				1174	s=elements[i];
				1175	while((c=*s++)!=0) {
				1176	if(c!=*t++) {
				1177	s=""; /* does not match */
				1178	i=99;
				1179	}
				1180	}
				1181	}
				1182	if(i<99 && *t==0) {
				1183	return start;
				1184	}
				1185	}
				1186	break;
				1187	}
				1188	default:
				1189	/* undefined type */
				1190	break;
				1191	}
				1192
				1193	return 0xffff;
				1194	}
				1195
				1196	/* sets of name characters, maximum name lengths ---------------------------- */
				1197
				/*
				 * Bit-set helpers for a set of 256 byte values stored in uint32_t[8]:
				 * the top 3 bits of the byte select the word, the low 5 bits select the bit.
				 * The argument c is evaluated twice, so it must not have side effects.
				 * It is parenthesized so that expression arguments (e.g. base+2) are
				 * converted to uint8_t as a whole.
				 */
				#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
				#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
				1200
				1201	static int32_t
				1202	calcStringSetLength(uint32_t set[8], const char *s) {
				1203	int32_t length=0;
				1204	char c;
				1205
				1206	while((c=*s++)!=0) {
				1207	SET_ADD(set, c);
				1208	++length;
				1209	}
				1210	return length;
				1211	}
				1212
				1213	static int32_t
				1214	calcAlgNameSetsLengths(int32_t maxNameLength) {
				1215	AlgorithmicRange *range;
				1216	uint32_t *p;
				1217	uint32_t rangeCount;
				1218	int32_t length;
				1219
				1220	/* enumerate algorithmic ranges */
				1221	p=(uint32_t )((uint8_t )uCharNames+uCharNames->algNamesOffset);
				1222	rangeCount=*p;
				1223	range=(AlgorithmicRange *)(p+1);
				1224	while(rangeCount>0) {
				1225	switch(range->type) {
				1226	case 0:
				1227	/* name = prefix + (range->variant times) hex-digits */
				1228	/* prefix */
				1229	length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
				1230	if(length>maxNameLength) {
				1231	maxNameLength=length;
				1232	}
				1233	break;
				1234	case 1: {
				1235	/* name = prefix factorized-elements */
				1236	const uint16_t factors=(const uint16_t )(range+1);
				1237	const char *s;
				1238	int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
				1239
				1240	/* prefix length */
				1241	s=(const char *)(factors+count);
				1242	length=calcStringSetLength(gNameSet, s);
				1243	s+=length+1; /* start of factor suffixes */
				1244
				1245	/* get the set and maximum factor suffix length for each factor */
				1246	for(i=0; i<count; ++i) {
				1247	maxFactorLength=0;
				1248	for(factor=factors[i]; factor>0; --factor) {
				1249	factorLength=calcStringSetLength(gNameSet, s);
				1250	s+=factorLength+1;
				1251	if(factorLength>maxFactorLength) {
				1252	maxFactorLength=factorLength;
				1253	}
				1254	}
				1255	length+=maxFactorLength;
				1256	}
				1257
				1258	if(length>maxNameLength) {
				1259	maxNameLength=length;
				1260	}
				1261	break;
				1262	}
				1263	default:
				1264	/* unknown type */
				1265	break;
				1266	}
				1267
				1268	range=(AlgorithmicRange )((uint8_t )range+range->size);
				1269	--rangeCount;
				1270	}
				1271	return maxNameLength;
				1272	}
				1273
				1274	static int32_t
				1275	calcExtNameSetsLengths(int32_t maxNameLength) {
				1276	int32_t i, length;
				1277
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	1278	for(i=0; i<UPRV_LENGTHOF(charCatNames); ++i) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1279	/*
				1280	* for each category, count the length of the category name
				1281	* plus 9=
				1282	* 2 for <>
				1283	* 1 for -
				1284	* 6 for most hex digits per code point
				1285	*/
				1286	length=9+calcStringSetLength(gNameSet, charCatNames[i]);
				1287	if(length>maxNameLength) {
				1288	maxNameLength=length;
				1289	}
				1290	}
				1291	return maxNameLength;
				1292	}
				1293
				1294	static int32_t
				1295	calcNameSetLength(const uint16_t tokens, uint16_t tokenCount, const uint8_t tokenStrings, int8_t *tokenLengths,
				1296	uint32_t set[8],
				1297	const uint8_t *pLine, const uint8_t lineLimit) {
				1298	const uint8_t line=pLine;
				1299	int32_t length=0, tokenLength;
				1300	uint16_t c, token;
				1301
				1302	while(line!=lineLimit && (c=*line++)!=(uint8_t)';') {
				1303	if(c>=tokenCount) {
				1304	/* implicit letter */
				1305	SET_ADD(set, c);
				1306	++length;
				1307	} else {
				1308	token=tokens[c];
				1309	if(token==(uint16_t)(-2)) {
				1310	/* this is a lead byte for a double-byte token */
				1311	c=c<<8\|*line++;
				1312	token=tokens[c];
				1313	}
				1314	if(token==(uint16_t)(-1)) {
				1315	/* explicit letter */
				1316	SET_ADD(set, c);
				1317	++length;
				1318	} else {
				1319	/* count token word */
				1320	if(tokenLengths!=NULL) {
				1321	/* use cached token length */
				1322	tokenLength=tokenLengths[c];
				1323	if(tokenLength==0) {
				1324	tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
				1325	tokenLengths[c]=(int8_t)tokenLength;
				1326	}
				1327	} else {
				1328	tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
				1329	}
				1330	length+=tokenLength;
				1331	}
				1332	}
				1333	}
				1334
				1335	*pLine=line;
				1336	return length;
				1337	}
				1338
				1339	static void
				1340	calcGroupNameSetsLengths(int32_t maxNameLength) {
				1341	uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
				1342
				1343	uint16_t tokens=(uint16_t )uCharNames+8;
				1344	uint16_t tokenCount=*tokens++;
				1345	uint8_t tokenStrings=(uint8_t )uCharNames+uCharNames->tokenStringOffset;
				1346
				1347	int8_t *tokenLengths;
				1348
				1349	const uint16_t *group;
				1350	const uint8_t s, line, *lineLimit;
				1351
				1352	int32_t groupCount, lineNumber, length;
				1353
				1354	tokenLengths=(int8_t *)uprv_malloc(tokenCount);
				1355	if(tokenLengths!=NULL) {
				1356	uprv_memset(tokenLengths, 0, tokenCount);
				1357	}
				1358
				1359	group=GET_GROUPS(uCharNames);
				1360	groupCount=*group++;
				1361
				1362	/* enumerate all groups */
				1363	while(groupCount>0) {
				1364	s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
				1365	s=expandGroupLengths(s, offsets, lengths);
				1366
				1367	/* enumerate all lines in each group */
				1368	for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
				1369	line=s+offsets[lineNumber];
				1370	length=lengths[lineNumber];
				1371	if(length==0) {
				1372	continue;
				1373	}
				1374
				1375	lineLimit=line+length;
				1376
				1377	/* read regular name */
				1378	length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
				1379	if(length>maxNameLength) {
				1380	maxNameLength=length;
				1381	}
				1382	if(line==lineLimit) {
				1383	continue;
				1384	}
				1385
				1386	/* read Unicode 1.0 name */
				1387	length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
				1388	if(length>maxNameLength) {
				1389	maxNameLength=length;
				1390	}
				1391	if(line==lineLimit) {
				1392	continue;
				1393	}
				1394
				1395	/* read ISO comment */
				1396	/length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);/
				1397	}
				1398
				1399	group=NEXT_GROUP(group);
				1400	--groupCount;
				1401	}
				1402
				1403	if(tokenLengths!=NULL) {
				1404	uprv_free(tokenLengths);
				1405	}
				1406
				1407	/* set gMax... - name length last for threading */
				1408	gMaxNameLength=maxNameLength;
				1409	}
				1410
				1411	static UBool
				1412	calcNameSetsLengths(UErrorCode *pErrorCode) {
				1413	static const char extChars[]="0123456789ABCDEF<>-";
				1414	int32_t i, maxNameLength;
				1415
				1416	if(gMaxNameLength!=0) {
				1417	return TRUE;
				1418	}
				1419
				1420	if(!isDataLoaded(pErrorCode)) {
				1421	return FALSE;
				1422	}
				1423
				1424	/* set hex digits, used in various names, and <>-, used in extended names */
				1425	for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) {
				1426	SET_ADD(gNameSet, extChars[i]);
				1427	}
				1428
				1429	/* set sets and lengths from algorithmic names */
				1430	maxNameLength=calcAlgNameSetsLengths(0);
				1431
				1432	/* set sets and lengths from extended names */
				1433	maxNameLength=calcExtNameSetsLengths(maxNameLength);
				1434
				1435	/* set sets and lengths from group names, set global maximum values */
				1436	calcGroupNameSetsLengths(maxNameLength);
				1437
				1438	return TRUE;
				1439	}
				1440
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	1441	U_NAMESPACE_END
				1442
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1443	/* public API --------------------------------------------------------------- */
				1444
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	1445	U_NAMESPACE_USE
				1446
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1447	U_CAPI int32_t U_EXPORT2
				1448	u_charName(UChar32 code, UCharNameChoice nameChoice,
				1449	char *buffer, int32_t bufferLength,
				1450	UErrorCode *pErrorCode) {
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	1451	AlgorithmicRange *algRange;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1452	uint32_t *p;
				1453	uint32_t i;
				1454	int32_t length;
				1455
				1456	/* check the argument values */
				1457	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
				1458	return 0;
				1459	} else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT \|\|
				1460	bufferLength<0 \|\| (bufferLength>0 && buffer==NULL)
				1461	) {
				1462	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
				1463	return 0;
				1464	}
				1465
				1466	if((uint32_t)code>UCHAR_MAX_VALUE \|\| !isDataLoaded(pErrorCode)) {
				1467	return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
				1468	}
				1469
				1470	length=0;
				1471
				1472	/* try algorithmic names first */
				1473	p=(uint32_t )((uint8_t )uCharNames+uCharNames->algNamesOffset);
				1474	i=*p;
				1475	algRange=(AlgorithmicRange *)(p+1);
				1476	while(i>0) {
				1477	if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
				1478	length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
				1479	break;
				1480	}
				1481	algRange=(AlgorithmicRange )((uint8_t )algRange+algRange->size);
				1482	--i;
				1483	}
				1484
				1485	if(i==0) {
				1486	if (nameChoice == U_EXTENDED_CHAR_NAME) {
				1487	length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
				1488	if (!length) {
				1489	/* extended character name */
				1490	length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
				1491	}
				1492	} else {
				1493	/* normal character name */
				1494	length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
				1495	}
				1496	}
				1497
				1498	return u_terminateChars(buffer, bufferLength, length, pErrorCode);
				1499	}
				1500
				1501	U_CAPI int32_t U_EXPORT2
				1502	u_getISOComment(UChar32 /c/,
				1503	char *dest, int32_t destCapacity,
				1504	UErrorCode *pErrorCode) {
				1505	/* check the argument values */
				1506	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
				1507	return 0;
				1508	} else if(destCapacity<0 \|\| (destCapacity>0 && dest==NULL)) {
				1509	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
				1510	return 0;
				1511	}
				1512
				1513	return u_terminateChars(dest, destCapacity, 0, pErrorCode);
				1514	}
				1515
				1516	U_CAPI UChar32 U_EXPORT2
				1517	u_charFromName(UCharNameChoice nameChoice,
				1518	const char *name,
				1519	UErrorCode *pErrorCode) {
				1520	char upper[120], lower[120];
				1521	FindName findName;
				1522	AlgorithmicRange *algRange;
				1523	uint32_t *p;
				1524	uint32_t i;
				1525	UChar32 cp = 0;
				1526	char c0;
				1527	UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */
				1528
				1529	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
				1530	return error;
				1531	}
				1532
				1533	if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT \|\| name==NULL \|\| *name==0) {
				1534	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
				1535	return error;
				1536	}
				1537
				1538	if(!isDataLoaded(pErrorCode)) {
				1539	return error;
				1540	}
				1541
				1542	/* construct the uppercase and lowercase of the name first */
				1543	for(i=0; i<sizeof(upper); ++i) {
				1544	if((c0=*name++)!=0) {
				1545	upper[i]=uprv_toupper(c0);
				1546	lower[i]=uprv_tolower(c0);
				1547	} else {
				1548	upper[i]=lower[i]=0;
				1549	break;
				1550	}
				1551	}
				1552	if(i==sizeof(upper)) {
				1553	/* name too long, there is no such character */
				1554	*pErrorCode = U_ILLEGAL_CHAR_FOUND;
				1555	return error;
				1556	}
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	1557	// i==strlen(name)==strlen(lower)==strlen(upper)
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1558
				1559	/* try extended names first */
				1560	if (lower[0] == '<') {
				1561	if (nameChoice == U_EXTENDED_CHAR_NAME) {
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	1562	// Parse a string like "<category-HHHH>" where HHHH is a hex code point.
				1563	if (lower[--i] == '>' && i >= 3 && lower[--i] != '-') {
				1564	while (i >= 3 && lower[--i] != '-') {}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1565
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	1566	if (i >= 2 && lower[i] == '-') {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1567	uint32_t cIdx;
				1568
				1569	lower[i] = 0;
				1570
				1571	for (++i; lower[i] != '>'; ++i) {
				1572	if (lower[i] >= '0' && lower[i] <= '9') {
				1573	cp = (cp << 4) + lower[i] - '0';
				1574	} else if (lower[i] >= 'a' && lower[i] <= 'f') {
				1575	cp = (cp << 4) + lower[i] - 'a' + 10;
				1576	} else {
				1577	*pErrorCode = U_ILLEGAL_CHAR_FOUND;
				1578	return error;
				1579	}
				1580	}
				1581
				1582	/* Now validate the category name.
				1583	We could use a binary search, or a trie, if
				1584	we really wanted to. */
				1585
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	1586	for (lower[i] = 0, cIdx = 0; cIdx < UPRV_LENGTHOF(charCatNames); ++cIdx) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1587
				1588	if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
				1589	if (getCharCat(cp) == cIdx) {
				1590	return cp;
				1591	}
				1592	break;
				1593	}
				1594	}
				1595	}
				1596	}
				1597	}
				1598
				1599	*pErrorCode = U_ILLEGAL_CHAR_FOUND;
				1600	return error;
				1601	}
				1602
				1603	/* try algorithmic names now */
				1604	p=(uint32_t )((uint8_t )uCharNames+uCharNames->algNamesOffset);
				1605	i=*p;
				1606	algRange=(AlgorithmicRange *)(p+1);
				1607	while(i>0) {
				1608	if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
				1609	return cp;
				1610	}
				1611	algRange=(AlgorithmicRange )((uint8_t )algRange+algRange->size);
				1612	--i;
				1613	}
				1614
				1615	/* normal character name */
				1616	findName.otherName=upper;
				1617	findName.code=error;
				1618	enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
				1619	if (findName.code == error) {
				1620	*pErrorCode = U_ILLEGAL_CHAR_FOUND;
				1621	}
				1622	return findName.code;
				1623	}
				1624
				1625	U_CAPI void U_EXPORT2
				1626	u_enumCharNames(UChar32 start, UChar32 limit,
				1627	UEnumCharNamesFn *fn,
				1628	void *context,
				1629	UCharNameChoice nameChoice,
				1630	UErrorCode *pErrorCode) {
				1631	AlgorithmicRange *algRange;
				1632	uint32_t *p;
				1633	uint32_t i;
				1634
				1635	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
				1636	return;
				1637	}
				1638
				1639	if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT \|\| fn==NULL) {
				1640	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
				1641	return;
				1642	}
				1643
				1644	if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
				1645	limit = UCHAR_MAX_VALUE + 1;
				1646	}
				1647	if((uint32_t)start>=(uint32_t)limit) {
				1648	return;
				1649	}
				1650
				1651	if(!isDataLoaded(pErrorCode)) {
				1652	return;
				1653	}
				1654
				1655	/* interleave the data-driven ones with the algorithmic ones */
				1656	/* iterate over all algorithmic ranges; assume that they are in ascending order */
				1657	p=(uint32_t )((uint8_t )uCharNames+uCharNames->algNamesOffset);
				1658	i=*p;
				1659	algRange=(AlgorithmicRange *)(p+1);
				1660	while(i>0) {
				1661	/* enumerate the character names before the current algorithmic range */
				1662	/* here: start<limit */
				1663	if((uint32_t)start<algRange->start) {
				1664	if((uint32_t)limit<=algRange->start) {
				1665	enumNames(uCharNames, start, limit, fn, context, nameChoice);
				1666	return;
				1667	}
				1668	if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
				1669	return;
				1670	}
				1671	start=(UChar32)algRange->start;
				1672	}
				1673	/* enumerate the character names in the current algorithmic range */
				1674	/* here: algRange->start<=start<limit */
				1675	if((uint32_t)start<=algRange->end) {
				1676	if((uint32_t)limit<=(algRange->end+1)) {
				1677	enumAlgNames(algRange, start, limit, fn, context, nameChoice);
				1678	return;
				1679	}
				1680	if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
				1681	return;
				1682	}
				1683	start=(UChar32)algRange->end+1;
				1684	}
				1685	/* continue to the next algorithmic range (here: start<limit) */
				1686	algRange=(AlgorithmicRange )((uint8_t )algRange+algRange->size);
				1687	--i;
				1688	}
				1689	/* enumerate the character names after the last algorithmic range */
				1690	enumNames(uCharNames, start, limit, fn, context, nameChoice);
				1691	}
				1692
				1693	U_CAPI int32_t U_EXPORT2
				1694	uprv_getMaxCharNameLength() {
				1695	UErrorCode errorCode=U_ZERO_ERROR;
				1696	if(calcNameSetsLengths(&errorCode)) {
				1697	return gMaxNameLength;
				1698	} else {
				1699	return 0;
				1700	}
				1701	}
				1702
				1703	/**
				1704	* Converts the char set cset into a Unicode set uset.
				1705	* @param cset Set of 256 bit flags corresponding to a set of chars.
				1706	* @param uset USet to receive characters. Existing contents are deleted.
				1707	*/
				1708	static void
				1709	charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
				1710	UChar us[256];
				1711	char cs[256];
				1712
				1713	int32_t i, length;
				1714	UErrorCode errorCode;
				1715
				1716	errorCode=U_ZERO_ERROR;
				1717
				1718	if(!calcNameSetsLengths(&errorCode)) {
				1719	return;
				1720	}
				1721
				1722	/* build a char string with all chars that are used in character names */
				1723	length=0;
				1724	for(i=0; i<256; ++i) {
				1725	if(SET_CONTAINS(cset, i)) {
				1726	cs[length++]=(char)i;
				1727	}
				1728	}
				1729
				1730	/* convert the char string to a UChar string */
				1731	u_charsToUChars(cs, us, length);
				1732
				1733	/* add each UChar to the USet */
				1734	for(i=0; i<length; ++i) {
				1735	if(us[i]!=0 \|\| cs[i]==0) { /* non-invariant chars become (UChar)0 */
				1736	sa->add(sa->set, us[i]);
				1737	}
				1738	}
				1739	}
				1740
				1741	/**
				1742	* Fills set with characters that are used in Unicode character names.
				1743	* @param set USet to receive characters.
				1744	*/
				1745	U_CAPI void U_EXPORT2
				1746	uprv_getCharNameCharacters(const USetAdder *sa) {
				1747	charSetToUSet(gNameSet, sa);
				1748	}
				1749
				1750	/* data swapping ------------------------------------------------------------ */
				1751
				1752	/*
				1753	* The token table contains non-negative entries for token bytes,
				1754	* and -1 for bytes that represent themselves in the data file's charset.
				1755	* -2 entries are used for lead bytes.
				1756	*
				1757	* Direct bytes (-1 entries) must be translated from the input charset family
				1758	* to the output charset family.
				1759	* makeTokenMap() writes a permutation mapping for this.
				1760	* Use it once for single-/lead-byte tokens and once more for all trail byte
				1761	* tokens. (';' is an unused trail byte marked with -1.)
				1762	*/
				1763	static void
				1764	makeTokenMap(const UDataSwapper *ds,
				1765	int16_t tokens[], uint16_t tokenCount,
				1766	uint8_t map[256],
				1767	UErrorCode *pErrorCode) {
				1768	UBool usedOutChar[256];
				1769	uint16_t i, j;
				1770	uint8_t c1, c2;
				1771
				1772	if(U_FAILURE(*pErrorCode)) {
				1773	return;
				1774	}
				1775
				1776	if(ds->inCharset==ds->outCharset) {
				1777	/* Same charset family: identity permutation */
				1778	for(i=0; i<256; ++i) {
				1779	map[i]=(uint8_t)i;
				1780	}
				1781	} else {
				1782	uprv_memset(map, 0, 256);
				1783	uprv_memset(usedOutChar, 0, 256);
				1784
				1785	if(tokenCount>256) {
				1786	tokenCount=256;
				1787	}
				1788
				1789	/* set the direct bytes (byte 0 always maps to itself) */
				1790	for(i=1; i<tokenCount; ++i) {
				1791	if(tokens[i]==-1) {
				1792	/* convert the direct byte character */
				1793	c1=(uint8_t)i;
				1794	ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
				1795	if(U_FAILURE(*pErrorCode)) {
				1796	udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
				1797	i, ds->inCharset);
				1798	return;
				1799	}
				1800
				1801	/* enter the converted character into the map and mark it used */
				1802	map[c1]=c2;
				1803	usedOutChar[c2]=TRUE;
				1804	}
				1805	}
				1806
				1807	/* set the mappings for the rest of the permutation */
				1808	for(i=j=1; i<tokenCount; ++i) {
				1809	/* set mappings that were not set for direct bytes */
				1810	if(map[i]==0) {
				1811	/* set an output byte value that was not used as an output byte above */
				1812	while(usedOutChar[j]) {
				1813	++j;
				1814	}
				1815	map[i]=(uint8_t)j++;
				1816	}
				1817	}
				1818
				1819	/*
				1820	* leave mappings at tokenCount and above unset if tokenCount<256
				1821	* because they won't be used
				1822	*/
				1823	}
				1824	}
				1825
				1826	U_CAPI int32_t U_EXPORT2
				1827	uchar_swapNames(const UDataSwapper *ds,
				1828	const void inData, int32_t length, void outData,
				1829	UErrorCode *pErrorCode) {
				1830	const UDataInfo *pInfo;
				1831	int32_t headerSize;
				1832
				1833	const uint8_t *inBytes;
				1834	uint8_t *outBytes;
				1835
				1836	uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
				1837	offset, i, count, stringsCount;
				1838
				1839	const AlgorithmicRange *inRange;
				1840	AlgorithmicRange *outRange;
				1841
				1842	/* udata_swapDataHeader checks the arguments */
				1843	headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
				1844	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
				1845	return 0;
				1846	}
				1847
				1848	/* check data format and format version */
				1849	pInfo=(const UDataInfo )((const char )inData+4);
				1850	if(!(
				1851	pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
				1852	pInfo->dataFormat[1]==0x6e &&
				1853	pInfo->dataFormat[2]==0x61 &&
				1854	pInfo->dataFormat[3]==0x6d &&
				1855	pInfo->formatVersion[0]==1
				1856	)) {
				1857	udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
				1858	pInfo->dataFormat[0], pInfo->dataFormat[1],
				1859	pInfo->dataFormat[2], pInfo->dataFormat[3],
				1860	pInfo->formatVersion[0]);
				1861	*pErrorCode=U_UNSUPPORTED_ERROR;
				1862	return 0;
				1863	}
				1864
				1865	inBytes=(const uint8_t *)inData+headerSize;
				1866	outBytes=(uint8_t *)outData+headerSize;
				1867	if(length<0) {
				1868	algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
				1869	} else {
				1870	length-=headerSize;
				1871	if( length<20 \|\|
				1872	(uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
				1873	) {
				1874	udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
				1875	length);
				1876	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
				1877	return 0;
				1878	}
				1879	}
				1880
				1881	if(length<0) {
				1882	/* preflighting: iterate through algorithmic ranges */
				1883	offset=algNamesOffset;
				1884	count=ds->readUInt32(((const uint32_t )(inBytes+offset)));
				1885	offset+=4;
				1886
				1887	for(i=0; i<count; ++i) {
				1888	inRange=(const AlgorithmicRange *)(inBytes+offset);
				1889	offset+=ds->readUInt16(inRange->size);
				1890	}
				1891	} else {
				1892	/* swap data */
				1893	const uint16_t *p;
				1894	uint16_t q, temp;
				1895
				1896	int16_t tokens[512];
				1897	uint16_t tokenCount;
				1898
				1899	uint8_t map[256], trailMap[256];
				1900
				1901	/* copy the data for inaccessible bytes */
				1902	if(inBytes!=outBytes) {
				1903	uprv_memcpy(outBytes, inBytes, length);
				1904	}
				1905
				1906	/* the initial 4 offsets first */
				1907	tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
				1908	groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
				1909	groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
				1910	ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
				1911
				1912	/*
				1913	* now the tokens table
				1914	* it needs to be permutated along with the compressed name strings
				1915	*/
				1916	p=(const uint16_t *)(inBytes+16);
				1917	q=(uint16_t *)(outBytes+16);
				1918
				1919	/* read and swap the tokenCount */
				1920	tokenCount=ds->readUInt16(*p);
				1921	ds->swapArray16(ds, p, 2, q, pErrorCode);
				1922	++p;
				1923	++q;
				1924
				1925	/* read the first 512 tokens and make the token maps */
				1926	if(tokenCount<=512) {
				1927	count=tokenCount;
				1928	} else {
				1929	count=512;
				1930	}
				1931	for(i=0; i<count; ++i) {
				1932	tokens[i]=udata_readInt16(ds, p[i]);
				1933	}
				1934	for(; i<512; ++i) {
				1935	tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
				1936	}
				1937	makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
				1938	makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
				1939	if(U_FAILURE(*pErrorCode)) {
				1940	return 0;
				1941	}
				1942
				1943	/*
				1944	* swap and permutate the tokens
				1945	* go through a temporary array to support in-place swapping
				1946	*/
				1947	temp=(uint16_t )uprv_malloc(tokenCount2);
				1948	if(temp==NULL) {
				1949	udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
				1950	tokenCount);
				1951	*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
				1952	return 0;
				1953	}
				1954
				1955	/* swap and permutate single-/lead-byte tokens */
				1956	for(i=0; i<tokenCount && i<256; ++i) {
				1957	ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
				1958	}
				1959
				1960	/* swap and permutate trail-byte tokens */
				1961	for(; i<tokenCount; ++i) {
				1962	ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
				1963	}
				1964
				1965	/* copy the result into the output and free the temporary array */
				1966	uprv_memcpy(q, temp, tokenCount*2);
				1967	uprv_free(temp);
				1968
				1969	/*
				1970	* swap the token strings but not a possible padding byte after
				1971	* the terminating NUL of the last string
				1972	*/
				1973	udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
				1974	outBytes+tokenStringOffset, pErrorCode);
				1975	if(U_FAILURE(*pErrorCode)) {
				1976	udata_printError(ds, "uchar_swapNames(token strings) failed\n");
				1977	return 0;
				1978	}
				1979
				1980	/* swap the group table */
				1981	count=ds->readUInt16(((const uint16_t )(inBytes+groupsOffset)));
				1982	ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count3)2),
				1983	outBytes+groupsOffset, pErrorCode);
				1984
				1985	/*
				1986	* swap the group strings
				1987	* swap the string bytes but not the nibble-encoded string lengths
				1988	*/
				1989	if(ds->inCharset!=ds->outCharset) {
				1990	uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
				1991
				1992	const uint8_t inStrings, nextInStrings;
				1993	uint8_t *outStrings;
				1994
				1995	uint8_t c;
				1996
				1997	inStrings=inBytes+groupStringOffset;
				1998	outStrings=outBytes+groupStringOffset;
				1999
				2000	stringsCount=algNamesOffset-groupStringOffset;
				2001
				2002	/* iterate through string groups until only a few padding bytes are left */
				2003	while(stringsCount>32) {
				2004	nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
				2005
				2006	/* move past the length bytes */
				2007	stringsCount-=(uint32_t)(nextInStrings-inStrings);
				2008	outStrings+=nextInStrings-inStrings;
				2009	inStrings=nextInStrings;
				2010
				2011	count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
				2012	stringsCount-=count;
				2013
				2014	/* swap the string bytes using map[] and trailMap[] */
				2015	while(count>0) {
				2016	c=*inStrings++;
				2017	*outStrings++=map[c];
				2018	if(tokens[c]!=-2) {
				2019	--count;
				2020	} else {
				2021	/* token lead byte: swap the trail byte, too */
				2022	outStrings++=trailMap[inStrings++];
				2023	count-=2;
				2024	}
				2025	}
				2026	}
				2027	}
				2028
				2029	/* swap the algorithmic ranges */
				2030	offset=algNamesOffset;
				2031	count=ds->readUInt32(((const uint32_t )(inBytes+offset)));
				2032	ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
				2033	offset+=4;
				2034
				2035	for(i=0; i<count; ++i) {
				2036	if(offset>(uint32_t)length) {
				2037	udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
				2038	length, i);
				2039	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
				2040	return 0;
				2041	}
				2042
				2043	inRange=(const AlgorithmicRange *)(inBytes+offset);
				2044	outRange=(AlgorithmicRange *)(outBytes+offset);
				2045	offset+=ds->readUInt16(inRange->size);
				2046
				2047	ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
				2048	ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
				2049	switch(inRange->type) {
				2050	case 0:
				2051	/* swap prefix string */
				2052	ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
				2053	outRange+1, pErrorCode);
				2054	if(U_FAILURE(*pErrorCode)) {
				2055	udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
				2056	i);
				2057	return 0;
				2058	}
				2059	break;
				2060	case 1:
				2061	{
				2062	/* swap factors and the prefix and factor strings */
				2063	uint32_t factorsCount;
				2064
				2065	factorsCount=inRange->variant;
				2066	p=(const uint16_t *)(inRange+1);
				2067	q=(uint16_t *)(outRange+1);
				2068	ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
				2069
				2070	/* swap the strings, up to the last terminating NUL */
				2071	p+=factorsCount;
				2072	q+=factorsCount;
				2073	stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
				2074	while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
				2075	--stringsCount;
				2076	}
				2077	ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
				2078	}
				2079	break;
				2080	default:
				2081	udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
				2082	inRange->type, i);
				2083	*pErrorCode=U_UNSUPPORTED_ERROR;
				2084	return 0;
				2085	}
				2086	}
				2087	}
				2088
				2089	return headerSize+(int32_t)offset;
				2090	}
				2091
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2092	/*
				2093	* Hey, Emacs, please set the following:
				2094	*
				2095	* Local Variables:
				2096	* indent-tabs-mode: nil
				2097	* End:
				2098	*
				2099	*/