Blame - source/common/unames.cpp - chromium.googlesource.com/chromium/deps/icu

blob: b0ac991e1baeac7f73c0e075c1bbe4e902cbe4cf [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	3	/*
				4	******************************************************************************
				5	*
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	6	* Copyright (C) 1999-2014, International Business Machines
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	7	* Corporation and others. All Rights Reserved.
				8	*
				9	******************************************************************************
				10	* file name: unames.cpp
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	11	* encoding: UTF-8
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	12	* tab size: 8 (not used)
				13	* indentation:4
				14	*
				15	* created on: 1999oct04
				16	* created by: Markus W. Scherer
				17	*/
				18
				19	#include "unicode/utypes.h"
				20	#include "unicode/putil.h"
				21	#include "unicode/uchar.h"
				22	#include "unicode/udata.h"
				23	#include "unicode/utf.h"
				24	#include "unicode/utf16.h"
				25	#include "uassert.h"
				26	#include "ustr_imp.h"
				27	#include "umutex.h"
				28	#include "cmemory.h"
				29	#include "cstring.h"
				30	#include "ucln_cmn.h"
				31	#include "udataswp.h"
				32	#include "uprops.h"
				33
				34	U_NAMESPACE_BEGIN
				35
				36	/* prototypes ------------------------------------------------------------- */
				37
/* Name and type of the data item; both are handed to udata_openChoice(). */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";

/*
 * A code point is split into a group number (code>>GROUP_SHIFT) and a
 * line number within that group (code&GROUP_MASK).
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1L<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)
				44
				45	/*
				46	* This struct was replaced by explicitly accessing equivalent
				47	* fields from triples of uint16_t.
				48	* The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
				49	* which broke the assumption that sizeof(Group)==6 and that the ++ operator
				50	* would advance by 6 bytes (3 uint16_t).
				51	*
				52	* We can't just change the data structure because it's loaded from a data file,
				53	* and we don't want to make it less compact, so we changed the access code.
				54	*
				55	* For details see ICU tickets 6331 and 6008.
				56	typedef struct {
				57	uint16_t groupMSB,
				58	offsetHigh, offsetLow; / * avoid padding * /
				59	} Group;
				60	*/
				61	enum {
				62	GROUP_MSB,
				63	GROUP_OFFSET_HIGH,
				64	GROUP_OFFSET_LOW,
				65	GROUP_LENGTH
				66	};
				67
				68	/*
				69	* Get the 32-bit group offset.
				70	* @param group (const uint16_t *) pointer to a Group triple of uint16_t
				71	* @return group offset (int32_t)
				72	*/
				73	#define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16\|(group)[GROUP_OFFSET_LOW])
				74
				75	#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
				76	#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
				77
				78	typedef struct {
				79	uint32_t start, end;
				80	uint8_t type, variant;
				81	uint16_t size;
				82	} AlgorithmicRange;
				83
				84	typedef struct {
				85	uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
				86	} UCharNames;
				87
				88	/*
				89	* Get the groups table from a UCharNames struct.
				90	* The groups table consists of one uint16_t groupCount followed by
				91	* groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
				92	* and the comment for the old struct Group above.
				93	*
				94	* @param names (const UCharNames *) pointer to the UCharNames indexes
				95	* @return (const uint16_t *) pointer to the groups table
				96	*/
				97	#define GET_GROUPS(names) (const uint16_t )((const char )names+names->groupsOffset)
				98
				99	typedef struct {
				100	const char *otherName;
				101	UChar32 code;
				102	} FindName;
				103
				104	#define DO_FIND_NAME NULL
				105
				106	static UDataMemory *uCharNamesData=NULL;
				107	static UCharNames *uCharNames=NULL;
Frank Tang	1c67b4e	2022-05-18 10:13:51 -0700	[diff] [blame]	108	static icu::UInitOnce gCharNamesInitOnce {};
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	109
				110	/*
				111	* Maximum length of character names (regular & 1.0).
				112	*/
				113	static int32_t gMaxNameLength=0;
				114
				115	/*
				116	* Set of chars used in character names (regular & 1.0).
				117	* Chars are platform-dependent (can be EBCDIC).
				118	*/
				119	static uint32_t gNameSet[8]={ 0 };
				120
				121	#define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT
				122	#define U_LEAD_SURROGATE U_CHAR_CATEGORY_COUNT + 1
				123	#define U_TRAIL_SURROGATE U_CHAR_CATEGORY_COUNT + 2
				124
				125	#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
				126
				127	static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
				128	"unassigned",
				129	"uppercase letter",
				130	"lowercase letter",
				131	"titlecase letter",
				132	"modifier letter",
				133	"other letter",
				134	"non spacing mark",
				135	"enclosing mark",
				136	"combining spacing mark",
				137	"decimal digit number",
				138	"letter number",
				139	"other number",
				140	"space separator",
				141	"line separator",
				142	"paragraph separator",
				143	"control",
				144	"format",
				145	"private use area",
				146	"surrogate",
				147	"dash punctuation",
				148	"start punctuation",
				149	"end punctuation",
				150	"connector punctuation",
				151	"other punctuation",
				152	"math symbol",
				153	"currency symbol",
				154	"modifier symbol",
				155	"other symbol",
				156	"initial punctuation",
				157	"final punctuation",
				158	"noncharacter",
				159	"lead surrogate",
				160	"trail surrogate"
				161	};
				162
				163	/* implementation ----------------------------------------------------------- */
				164
				165	static UBool U_CALLCONV unames_cleanup(void)
				166	{
				167	if(uCharNamesData) {
				168	udata_close(uCharNamesData);
				169	uCharNamesData = NULL;
				170	}
				171	if(uCharNames) {
				172	uCharNames = NULL;
				173	}
				174	gCharNamesInitOnce.reset();
				175	gMaxNameLength=0;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	176	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	177	}
				178
				179	static UBool U_CALLCONV
				180	isAcceptable(void * /context/,
				181	const char * /type/, const char * /name/,
				182	const UDataInfo *pInfo) {
				183	return (UBool)(
				184	pInfo->size>=20 &&
				185	pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
				186	pInfo->charsetFamily==U_CHARSET_FAMILY &&
				187	pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
				188	pInfo->dataFormat[1]==0x6e &&
				189	pInfo->dataFormat[2]==0x61 &&
				190	pInfo->dataFormat[3]==0x6d &&
				191	pInfo->formatVersion[0]==1);
				192	}
				193
				194	static void U_CALLCONV
				195	loadCharNames(UErrorCode &status) {
				196	U_ASSERT(uCharNamesData == NULL);
				197	U_ASSERT(uCharNames == NULL);
				198
				199	uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status);
				200	if(U_FAILURE(status)) {
				201	uCharNamesData = NULL;
				202	} else {
				203	uCharNames = (UCharNames *)udata_getMemory(uCharNamesData);
				204	}
				205	ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
				206	}
				207
				208
				209	static UBool
				210	isDataLoaded(UErrorCode *pErrorCode) {
				211	umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode);
				212	return U_SUCCESS(*pErrorCode);
				213	}
				214
/*
 * Append one char to a bounded output buffer.
 * The char is stored only while bufferLength>0, but bufferPos is always
 * incremented, so bufferPos ends up as the full (untruncated) length.
 * buffer, bufferLength and bufferPos must be modifiable lvalues; they are
 * changed in place. c is evaluated at most once.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) UPRV_BLOCK_MACRO_BEGIN { \
    if((bufferLength)>0) { \
        *(buffer)++=(c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
} UPRV_BLOCK_MACRO_END

/* Internal name choice value, one past the public UCharNameChoice values. */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
				224
				225	/*
				226	* Important: expandName() and compareName() are almost the same -
				227	* apply fixes to both.
				228	*
				229	* UnicodeData.txt uses ';' as a field separator, so no
				230	* field can contain ';' as part of its contents.
				231	* In unames.dat, it is marked as token[';']==-1 only if the
				232	* semicolon is used in the data file - which is iff we
				233	* have Unicode 1.0 names or ISO comments or aliases.
				234	* So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
				235	* although we know that it will never be part of a name.
				236	*/
				237	static uint16_t
				238	expandName(UCharNames *names,
				239	const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
				240	char *buffer, uint16_t bufferLength) {
				241	uint16_t tokens=(uint16_t )names+8;
				242	uint16_t token, tokenCount=*tokens++, bufferPos=0;
				243	uint8_t tokenStrings=(uint8_t )names+names->tokenStringOffset;
				244	uint8_t c;
				245
				246	if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
				247	/*
				248	* skip the modern name if it is not requested _and_
				249	* if the semicolon byte value is a character, not a token number
				250	*/
				251	if((uint8_t)';'>=tokenCount \|\| tokens[(uint8_t)';']==(uint16_t)(-1)) {
				252	int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
				253	do {
				254	while(nameLength>0) {
				255	--nameLength;
				256	if(*name++==';') {
				257	break;
				258	}
				259	}
				260	} while(--fieldIndex>0);
				261	} else {
				262	/*
				263	* the semicolon byte value is a token number, therefore
				264	* only modern names are stored in unames.dat and there is no
				265	* such requested alternate name here
				266	*/
				267	nameLength=0;
				268	}
				269	}
				270
				271	/* write each letter directly, and write a token word per token */
				272	while(nameLength>0) {
				273	--nameLength;
				274	c=*name++;
				275
				276	if(c>=tokenCount) {
				277	if(c!=';') {
				278	/* implicit letter */
				279	WRITE_CHAR(buffer, bufferLength, bufferPos, c);
				280	} else {
				281	/* finished */
				282	break;
				283	}
				284	} else {
				285	token=tokens[c];
				286	if(token==(uint16_t)(-2)) {
				287	/* this is a lead byte for a double-byte token */
				288	token=tokens[c<<8\|*name++];
				289	--nameLength;
				290	}
				291	if(token==(uint16_t)(-1)) {
				292	if(c!=';') {
				293	/* explicit letter */
				294	WRITE_CHAR(buffer, bufferLength, bufferPos, c);
				295	} else {
				296	/* stop, but skip the semicolon if we are seeking
				297	extended names and there was no 2.0 name but there
				298	is a 1.0 name. */
				299	if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
				300	if ((uint8_t)';'>=tokenCount \|\| tokens[(uint8_t)';']==(uint16_t)(-1)) {
				301	continue;
				302	}
				303	}
				304	/* finished */
				305	break;
				306	}
				307	} else {
				308	/* write token word */
				309	uint8_t *tokenString=tokenStrings+token;
				310	while((c=*tokenString++)!=0) {
				311	WRITE_CHAR(buffer, bufferLength, bufferPos, c);
				312	}
				313	}
				314	}
				315	}
				316
				317	/* zero-terminate */
				318	if(bufferLength>0) {
				319	*buffer=0;
				320	}
				321
				322	return bufferPos;
				323	}
				324
				325	/*
				326	* compareName() is almost the same as expandName() except that it compares
				327	* the currently expanded name to an input name.
				328	* It returns the match/no match result as soon as possible.
				329	*/
				330	static UBool
				331	compareName(UCharNames *names,
				332	const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
				333	const char *otherName) {
				334	uint16_t tokens=(uint16_t )names+8;
				335	uint16_t token, tokenCount=*tokens++;
				336	uint8_t tokenStrings=(uint8_t )names+names->tokenStringOffset;
				337	uint8_t c;
				338	const char *origOtherName = otherName;
				339
				340	if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
				341	/*
				342	* skip the modern name if it is not requested _and_
				343	* if the semicolon byte value is a character, not a token number
				344	*/
				345	if((uint8_t)';'>=tokenCount \|\| tokens[(uint8_t)';']==(uint16_t)(-1)) {
				346	int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
				347	do {
				348	while(nameLength>0) {
				349	--nameLength;
				350	if(*name++==';') {
				351	break;
				352	}
				353	}
				354	} while(--fieldIndex>0);
				355	} else {
				356	/*
				357	* the semicolon byte value is a token number, therefore
				358	* only modern names are stored in unames.dat and there is no
				359	* such requested alternate name here
				360	*/
				361	nameLength=0;
				362	}
				363	}
				364
				365	/* compare each letter directly, and compare a token word per token */
				366	while(nameLength>0) {
				367	--nameLength;
				368	c=*name++;
				369
				370	if(c>=tokenCount) {
				371	if(c!=';') {
				372	/* implicit letter */
				373	if((char)c!=*otherName++) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	374	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	375	}
				376	} else {
				377	/* finished */
				378	break;
				379	}
				380	} else {
				381	token=tokens[c];
				382	if(token==(uint16_t)(-2)) {
				383	/* this is a lead byte for a double-byte token */
				384	token=tokens[c<<8\|*name++];
				385	--nameLength;
				386	}
				387	if(token==(uint16_t)(-1)) {
				388	if(c!=';') {
				389	/* explicit letter */
				390	if((char)c!=*otherName++) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	391	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	392	}
				393	} else {
				394	/* stop, but skip the semicolon if we are seeking
				395	extended names and there was no 2.0 name but there
				396	is a 1.0 name. */
				397	if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
				398	if ((uint8_t)';'>=tokenCount \|\| tokens[(uint8_t)';']==(uint16_t)(-1)) {
				399	continue;
				400	}
				401	}
				402	/* finished */
				403	break;
				404	}
				405	} else {
				406	/* write token word */
				407	uint8_t *tokenString=tokenStrings+token;
				408	while((c=*tokenString++)!=0) {
				409	if((char)c!=*otherName++) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	410	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	411	}
				412	}
				413	}
				414	}
				415	}
				416
				417	/* complete match? */
				418	return (UBool)(*otherName==0);
				419	}
				420
				421	static uint8_t getCharCat(UChar32 cp) {
				422	uint8_t cat;
				423
				424	if (U_IS_UNICODE_NONCHAR(cp)) {
				425	return U_NONCHARACTER_CODE_POINT;
				426	}
				427
				428	if ((cat = u_charType(cp)) == U_SURROGATE) {
				429	cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
				430	}
				431
				432	return cat;
				433	}
				434
				435	static const char *getCharCatName(UChar32 cp) {
				436	uint8_t cat = getCharCat(cp);
				437
				438	/* Return unknown if the table of names above is not up to
				439	date. */
				440
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	441	if (cat >= UPRV_LENGTHOF(charCatNames)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	442	return "unknown";
				443	} else {
				444	return charCatNames[cat];
				445	}
				446	}
				447
				448	static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
				449	const char *catname = getCharCatName(code);
				450	uint16_t length = 0;
				451
				452	UChar32 cp;
				453	int ndigits, i;
				454
				455	WRITE_CHAR(buffer, bufferLength, length, '<');
				456	while (catname[length - 1]) {
				457	WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
				458	}
				459	WRITE_CHAR(buffer, bufferLength, length, '-');
				460	for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
				461	;
				462	if (ndigits < 4)
				463	ndigits = 4;
				464	for (cp = code, i = ndigits; (cp \|\| i > 0) && bufferLength; cp >>= 4, bufferLength--) {
				465	uint8_t v = (uint8_t)(cp & 0xf);
				466	buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
				467	}
				468	buffer += ndigits;
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	469	length += static_cast<uint16_t>(ndigits);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	470	WRITE_CHAR(buffer, bufferLength, length, '>');
				471
				472	return length;
				473	}
				474
				475	/*
				476	* getGroup() does a binary search for the group that contains the
				477	* Unicode code point "code".
				478	* The return value is always a valid Group* that may contain "code"
				479	* or else is the highest group before "code".
				480	* If the lowest group is after "code", then that one is returned.
				481	*/
				482	static const uint16_t *
				483	getGroup(UCharNames *names, uint32_t code) {
				484	const uint16_t *groups=GET_GROUPS(names);
				485	uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
				486	start=0,
				487	limit=*groups++,
				488	number;
				489
				490	/* binary search for the group of names that contains the one for code */
				491	while(start<limit-1) {
				492	number=(uint16_t)((start+limit)/2);
				493	if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
				494	limit=number;
				495	} else {
				496	start=number;
				497	}
				498	}
				499
				500	/* return this regardless of whether it is an exact match */
				501	return groups+start*GROUP_LENGTH;
				502	}
				503
				504	/*
				505	* expandGroupLengths() reads a block of compressed lengths of 32 strings and
				506	* expands them into offsets and lengths for each string.
				507	* Lengths are stored with a variable-width encoding in consecutive nibbles:
				508	* If a nibble<0xc, then it is the length itself (0=empty string).
				509	* If a nibble>=0xc, then it forms a length value with the following nibble.
				510	* Calculation see below.
				511	* The offsets and lengths arrays must be at least 33 (one more) long because
				512	* there is no check here at the end if the last nibble is still used.
				513	*/
				514	static const uint8_t *
				515	expandGroupLengths(const uint8_t *s,
				516	uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
				517	/* read the lengths of the 32 strings in this group and get each string's offset */
				518	uint16_t i=0, offset=0, length=0;
				519	uint8_t lengthByte;
				520
				521	/* all 32 lengths must be read to get the offset of the first group string */
				522	while(i<LINES_PER_GROUP) {
				523	lengthByte=*s++;
				524
				525	/* read even nibble - MSBs of lengthByte */
				526	if(length>=12) {
				527	/* double-nibble length spread across two bytes */
				528	length=(uint16_t)(((length&0x3)<<4\|lengthByte>>4)+12);
				529	lengthByte&=0xf;
				530	} else if((lengthByte /* &0xf0 */)>=0xc0) {
				531	/* double-nibble length spread across this one byte */
				532	length=(uint16_t)((lengthByte&0x3f)+12);
				533	} else {
				534	/* single-nibble length in MSBs */
				535	length=(uint16_t)(lengthByte>>4);
				536	lengthByte&=0xf;
				537	}
				538
				539	*offsets++=offset;
				540	*lengths++=length;
				541
				542	offset+=length;
				543	++i;
				544
				545	/* read odd nibble - LSBs of lengthByte */
				546	if((lengthByte&0xf0)==0) {
				547	/* this nibble was not consumed for a double-nibble length above */
				548	length=lengthByte;
				549	if(length<12) {
				550	/* single-nibble length in LSBs */
				551	*offsets++=offset;
				552	*lengths++=length;
				553
				554	offset+=length;
				555	++i;
				556	}
				557	} else {
				558	length=0; /* prevent double-nibble detection in the next iteration */
				559	}
				560	}
				561
				562	/* now, s is at the first group string */
				563	return s;
				564	}
				565
				566	static uint16_t
				567	expandGroupName(UCharNames names, const uint16_t group,
				568	uint16_t lineNumber, UCharNameChoice nameChoice,
				569	char *buffer, uint16_t bufferLength) {
				570	uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
				571	const uint8_t s=(uint8_t )names+names->groupStringOffset+GET_GROUP_OFFSET(group);
				572	s=expandGroupLengths(s, offsets, lengths);
				573	return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
				574	buffer, bufferLength);
				575	}
				576
				577	static uint16_t
				578	getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
				579	char *buffer, uint16_t bufferLength) {
				580	const uint16_t *group=getGroup(names, code);
				581	if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
				582	return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
				583	buffer, bufferLength);
				584	} else {
				585	/* group not found */
				586	/* zero-terminate */
				587	if(bufferLength>0) {
				588	*buffer=0;
				589	}
				590	return 0;
				591	}
				592	}
				593
				594	/*
				595	* enumGroupNames() enumerates all the names in a 32-group
				596	* and either calls the enumerator function or finds a given input name.
				597	*/
				598	static UBool
				599	enumGroupNames(UCharNames names, const uint16_t group,
				600	UChar32 start, UChar32 end,
				601	UEnumCharNamesFn fn, void context,
				602	UCharNameChoice nameChoice) {
				603	uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
				604	const uint8_t s=(uint8_t )names+names->groupStringOffset+GET_GROUP_OFFSET(group);
				605
				606	s=expandGroupLengths(s, offsets, lengths);
				607	if(fn!=DO_FIND_NAME) {
				608	char buffer[200];
				609	uint16_t length;
				610
				611	while(start<=end) {
				612	length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
				613	if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
				614	buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
				615	}
				616	/* here, we assume that the buffer is large enough */
				617	if(length>0) {
				618	if(!fn(context, start, nameChoice, buffer, length)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	619	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	620	}
				621	}
				622	++start;
				623	}
				624	} else {
				625	const char otherName=((FindName )context)->otherName;
				626	while(start<=end) {
				627	if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
				628	((FindName *)context)->code=start;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	629	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	630	}
				631	++start;
				632	}
				633	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	634	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	635	}
				636
				637	/*
				638	* enumExtNames enumerate extended names.
				639	* It only needs to do it if it is called with a real function and not
				640	* with the dummy DO_FIND_NAME, because u_charFromName() does a check
				641	* for extended names by itself.
				642	*/
				643	static UBool
				644	enumExtNames(UChar32 start, UChar32 end,
				645	UEnumCharNamesFn fn, void context)
				646	{
				647	if(fn!=DO_FIND_NAME) {
				648	char buffer[200];
				649	uint16_t length;
				650
				651	while(start<=end) {
				652	buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
				653	/* here, we assume that the buffer is large enough */
				654	if(length>0) {
				655	if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	656	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	657	}
				658	}
				659	++start;
				660	}
				661	}
				662
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	663	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	664	}
				665
				666	static UBool
				667	enumNames(UCharNames *names,
				668	UChar32 start, UChar32 limit,
				669	UEnumCharNamesFn fn, void context,
				670	UCharNameChoice nameChoice) {
				671	uint16_t startGroupMSB, endGroupMSB, groupCount;
				672	const uint16_t group, groupLimit;
				673
				674	startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
				675	endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);
				676
				677	/* find the group that contains start, or the highest before it */
				678	group=getGroup(names, start);
				679
				680	if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
				681	/* enumerate synthetic names between start and the group start */
				682	UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT);
				683	if(extLimit>limit) {
				684	extLimit=limit;
				685	}
				686	if(!enumExtNames(start, extLimit-1, fn, context)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	687	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	688	}
				689	start=extLimit;
				690	}
				691
				692	if(startGroupMSB==endGroupMSB) {
				693	if(startGroupMSB==group[GROUP_MSB]) {
				694	/* if start and limit-1 are in the same group, then enumerate only in that one */
				695	return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
				696	}
				697	} else {
				698	const uint16_t *groups=GET_GROUPS(names);
				699	groupCount=*groups++;
				700	groupLimit=groups+groupCount*GROUP_LENGTH;
				701
				702	if(startGroupMSB==group[GROUP_MSB]) {
				703	/* enumerate characters in the partial start group */
				704	if((start&GROUP_MASK)!=0) {
				705	if(!enumGroupNames(names, group,
				706	start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
				707	fn, context, nameChoice)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	708	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	709	}
				710	group=NEXT_GROUP(group); /* continue with the next group */
				711	}
				712	} else if(startGroupMSB>group[GROUP_MSB]) {
				713	/* make sure that we start enumerating with the first group after start */
				714	const uint16_t *nextGroup=NEXT_GROUP(group);
				715	if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
				716	UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
				717	if (end > limit) {
				718	end = limit;
				719	}
				720	if (!enumExtNames(start, end - 1, fn, context)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	721	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	722	}
				723	}
				724	group=nextGroup;
				725	}
				726
				727	/* enumerate entire groups between the start- and end-groups */
				728	while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
				729	const uint16_t *nextGroup;
				730	start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
				731	if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	732	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	733	}
				734	nextGroup=NEXT_GROUP(group);
				735	if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
				736	UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
				737	if (end > limit) {
				738	end = limit;
				739	}
				740	if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	741	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	742	}
				743	}
				744	group=nextGroup;
				745	}
				746
				747	/* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
				748	if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
				749	return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
				750	} else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
				751	UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
				752	if (next > start) {
				753	start = next;
				754	}
				755	} else {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	756	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	757	}
				758	}
				759
				760	/* we have not found a group, which means everything is made of
				761	extended names. */
				762	if (nameChoice == U_EXTENDED_CHAR_NAME) {
				763	if (limit > UCHAR_MAX_VALUE + 1) {
				764	limit = UCHAR_MAX_VALUE + 1;
				765	}
				766	return enumExtNames(start, limit - 1, fn, context);
				767	}
				768
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	769	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	770	}
				771
				772	static uint16_t
				773	writeFactorSuffix(const uint16_t *factors, uint16_t count,
				774	const char s, / suffix elements */
				775	uint32_t code,
				776	uint16_t indexes[8], /* output fields from here */
				777	const char elementBases[8], const char elements[8],
				778	char *buffer, uint16_t bufferLength) {
				779	uint16_t i, factor, bufferPos=0;
				780	char c;
				781
				782	/* write elements according to the factors */
				783
				784	/*
				785	* the factorized elements are determined by modulo arithmetic
				786	* with the factors of this algorithm
				787	*
				788	* note that for fewer operations, count is decremented here
				789	*/
				790	--count;
				791	for(i=count; i>0; --i) {
				792	factor=factors[i];
				793	indexes[i]=(uint16_t)(code%factor);
				794	code/=factor;
				795	}
				796	/*
				797	* we don't need to calculate the last modulus because start<=code<=end
				798	* guarantees here that code<=factors[0]
				799	*/
				800	indexes[0]=(uint16_t)code;
				801
				802	/* write each element */
				803	for(;;) {
				804	if(elementBases!=NULL) {
				805	*elementBases++=s;
				806	}
				807
				808	/* skip indexes[i] strings */
				809	factor=indexes[i];
				810	while(factor>0) {
				811	while(*s++!=0) {}
				812	--factor;
				813	}
				814	if(elements!=NULL) {
				815	*elements++=s;
				816	}
				817
				818	/* write element */
				819	while((c=*s++)!=0) {
				820	WRITE_CHAR(buffer, bufferLength, bufferPos, c);
				821	}
				822
				823	/* we do not need to perform the rest of this loop for i==count - break here */
				824	if(i>=count) {
				825	break;
				826	}
				827
				828	/* skip the rest of the strings for this factors[i] */
				829	factor=(uint16_t)(factors[i]-indexes[i]-1);
				830	while(factor>0) {
				831	while(*s++!=0) {}
				832	--factor;
				833	}
				834
				835	++i;
				836	}
				837
				838	/* zero-terminate */
				839	if(bufferLength>0) {
				840	*buffer=0;
				841	}
				842
				843	return bufferPos;
				844	}
				845
				846	/*
				847	* Important:
				848	* Parts of findAlgName() are almost the same as some of getAlgName().
				849	* Fixes must be applied to both.
				850	*/
				851	static uint16_t
				852	getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
				853	char *buffer, uint16_t bufferLength) {
				854	uint16_t bufferPos=0;
				855
				856	/* Only the normative character name can be algorithmic. */
				857	if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
				858	/* zero-terminate */
				859	if(bufferLength>0) {
				860	*buffer=0;
				861	}
				862	return 0;
				863	}
				864
				865	switch(range->type) {
				866	case 0: {
				867	/* name = prefix hex-digits */
				868	const char s=(const char )(range+1);
				869	char c;
				870
				871	uint16_t i, count;
				872
				873	/* copy prefix */
				874	while((c=*s++)!=0) {
				875	WRITE_CHAR(buffer, bufferLength, bufferPos, c);
				876	}
				877
				878	/* write hexadecimal code point value */
				879	count=range->variant;
				880
				881	/* zero-terminate */
				882	if(count<bufferLength) {
				883	buffer[count]=0;
				884	}
				885
				886	for(i=count; i>0;) {
				887	if(--i<bufferLength) {
				888	c=(char)(code&0xf);
				889	if(c<10) {
				890	c+='0';
				891	} else {
				892	c+='A'-10;
				893	}
				894	buffer[i]=c;
				895	}
				896	code>>=4;
				897	}
				898
				899	bufferPos+=count;
				900	break;
				901	}
				902	case 1: {
				903	/* name = prefix factorized-elements */
				904	uint16_t indexes[8];
				905	const uint16_t factors=(const uint16_t )(range+1);
				906	uint16_t count=range->variant;
				907	const char s=(const char )(factors+count);
				908	char c;
				909
				910	/* copy prefix */
				911	while((c=*s++)!=0) {
				912	WRITE_CHAR(buffer, bufferLength, bufferPos, c);
				913	}
				914
				915	bufferPos+=writeFactorSuffix(factors, count,
				916	s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
				917	break;
				918	}
				919	default:
				920	/* undefined type */
				921	/* zero-terminate */
				922	if(bufferLength>0) {
				923	*buffer=0;
				924	}
				925	break;
				926	}
				927
				928	return bufferPos;
				929	}
				930
				931	/*
				932	* Important: enumAlgNames() and findAlgName() are almost the same.
				933	* Any fix must be applied to both.
				934	*/
				935	static UBool
				936	enumAlgNames(AlgorithmicRange *range,
				937	UChar32 start, UChar32 limit,
				938	UEnumCharNamesFn fn, void context,
				939	UCharNameChoice nameChoice) {
				940	char buffer[200];
				941	uint16_t length;
				942
				943	if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	944	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	945	}
				946
				947	switch(range->type) {
				948	case 0: {
				949	char s, end;
				950	char c;
				951
				952	/* get the full name of the start character */
				953	length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
				954	if(length<=0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	955	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	956	}
				957
				958	/* call the enumerator function with this first character */
				959	if(!fn(context, start, nameChoice, buffer, length)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	960	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	961	}
				962
				963	/* go to the end of the name; all these names have the same length */
				964	end=buffer;
				965	while(*end!=0) {
				966	++end;
				967	}
				968
				969	/* enumerate the rest of the names */
				970	while(++start<limit) {
				971	/* increment the hexadecimal number on a character-basis */
				972	s=end;
				973	for (;;) {
				974	c=*--s;
				975	if(('0'<=c && c<'9') \|\| ('A'<=c && c<'F')) {
				976	*s=(char)(c+1);
				977	break;
				978	} else if(c=='9') {
				979	*s='A';
				980	break;
				981	} else if(c=='F') {
				982	*s='0';
				983	}
				984	}
				985
				986	if(!fn(context, start, nameChoice, buffer, length)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	987	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	988	}
				989	}
				990	break;
				991	}
				992	case 1: {
				993	uint16_t indexes[8];
				994	const char elementBases[8], elements[8];
				995	const uint16_t factors=(const uint16_t )(range+1);
				996	uint16_t count=range->variant;
				997	const char s=(const char )(factors+count);
				998	char suffix, t;
				999	uint16_t prefixLength, i, idx;
				1000
				1001	char c;
				1002
				1003	/* name = prefix factorized-elements */
				1004
				1005	/* copy prefix */
				1006	suffix=buffer;
				1007	prefixLength=0;
				1008	while((c=*s++)!=0) {
				1009	*suffix++=c;
				1010	++prefixLength;
				1011	}
				1012
				1013	/* append the suffix of the start character */
				1014	length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
				1015	s, (uint32_t)start-range->start,
				1016	indexes, elementBases, elements,
				1017	suffix, (uint16_t)(sizeof(buffer)-prefixLength)));
				1018
				1019	/* call the enumerator function with this first character */
				1020	if(!fn(context, start, nameChoice, buffer, length)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1021	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1022	}
				1023
				1024	/* enumerate the rest of the names */
				1025	while(++start<limit) {
				1026	/* increment the indexes in lexical order bound by the factors */
				1027	i=count;
				1028	for (;;) {
				1029	idx=(uint16_t)(indexes[--i]+1);
				1030	if(idx<factors[i]) {
				1031	/* skip one index and its element string */
				1032	indexes[i]=idx;
				1033	s=elements[i];
				1034	while(*s++!=0) {
				1035	}
				1036	elements[i]=s;
				1037	break;
				1038	} else {
				1039	/* reset this index to 0 and its element string to the first one */
				1040	indexes[i]=0;
				1041	elements[i]=elementBases[i];
				1042	}
				1043	}
				1044
				1045	/* to make matters a little easier, just append all elements to the suffix */
				1046	t=suffix;
				1047	length=prefixLength;
				1048	for(i=0; i<count; ++i) {
				1049	s=elements[i];
				1050	while((c=*s++)!=0) {
				1051	*t++=c;
				1052	++length;
				1053	}
				1054	}
				1055	/* zero-terminate */
				1056	*t=0;
				1057
				1058	if(!fn(context, start, nameChoice, buffer, length)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1059	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1060	}
				1061	}
				1062	break;
				1063	}
				1064	default:
				1065	/* undefined type */
				1066	break;
				1067	}
				1068
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1069	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1070	}
				1071
				1072	/*
				1073	* findAlgName() is almost the same as enumAlgNames() except that it
				1074	* returns the code point for a name if it fits into the range.
				1075	* It returns 0xffff otherwise.
				1076	*/
				1077	static UChar32
				1078	findAlgName(AlgorithmicRange range, UCharNameChoice nameChoice, const char otherName) {
				1079	UChar32 code;
				1080
				1081	if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
				1082	return 0xffff;
				1083	}
				1084
				1085	switch(range->type) {
				1086	case 0: {
				1087	/* name = prefix hex-digits */
				1088	const char s=(const char )(range+1);
				1089	char c;
				1090
				1091	uint16_t i, count;
				1092
				1093	/* compare prefix */
				1094	while((c=*s++)!=0) {
				1095	if((char)c!=*otherName++) {
				1096	return 0xffff;
				1097	}
				1098	}
				1099
				1100	/* read hexadecimal code point value */
				1101	count=range->variant;
				1102	code=0;
				1103	for(i=0; i<count; ++i) {
				1104	c=*otherName++;
				1105	if('0'<=c && c<='9') {
				1106	code=(code<<4)\|(c-'0');
				1107	} else if('A'<=c && c<='F') {
				1108	code=(code<<4)\|(c-'A'+10);
				1109	} else {
				1110	return 0xffff;
				1111	}
				1112	}
				1113
				1114	/* does it fit into the range? */
				1115	if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
				1116	return code;
				1117	}
				1118	break;
				1119	}
				1120	case 1: {
				1121	char buffer[64];
				1122	uint16_t indexes[8];
				1123	const char elementBases[8], elements[8];
				1124	const uint16_t factors=(const uint16_t )(range+1);
				1125	uint16_t count=range->variant;
				1126	const char s=(const char )(factors+count), *t;
				1127	UChar32 start, limit;
				1128	uint16_t i, idx;
				1129
				1130	char c;
				1131
				1132	/* name = prefix factorized-elements */
				1133
				1134	/* compare prefix */
				1135	while((c=*s++)!=0) {
				1136	if((char)c!=*otherName++) {
				1137	return 0xffff;
				1138	}
				1139	}
				1140
				1141	start=(UChar32)range->start;
				1142	limit=(UChar32)(range->end+1);
				1143
				1144	/* initialize the suffix elements for enumeration; indexes should all be set to 0 */
				1145	writeFactorSuffix(factors, count, s, 0,
				1146	indexes, elementBases, elements, buffer, sizeof(buffer));
				1147
				1148	/* compare the first suffix */
				1149	if(0==uprv_strcmp(otherName, buffer)) {
				1150	return start;
				1151	}
				1152
				1153	/* enumerate and compare the rest of the suffixes */
				1154	while(++start<limit) {
				1155	/* increment the indexes in lexical order bound by the factors */
				1156	i=count;
				1157	for (;;) {
				1158	idx=(uint16_t)(indexes[--i]+1);
				1159	if(idx<factors[i]) {
				1160	/* skip one index and its element string */
				1161	indexes[i]=idx;
				1162	s=elements[i];
				1163	while(*s++!=0) {}
				1164	elements[i]=s;
				1165	break;
				1166	} else {
				1167	/* reset this index to 0 and its element string to the first one */
				1168	indexes[i]=0;
				1169	elements[i]=elementBases[i];
				1170	}
				1171	}
				1172
				1173	/* to make matters a little easier, just compare all elements of the suffix */
				1174	t=otherName;
				1175	for(i=0; i<count; ++i) {
				1176	s=elements[i];
				1177	while((c=*s++)!=0) {
				1178	if(c!=*t++) {
				1179	s=""; /* does not match */
				1180	i=99;
				1181	}
				1182	}
				1183	}
				1184	if(i<99 && *t==0) {
				1185	return start;
				1186	}
				1187	}
				1188	break;
				1189	}
				1190	default:
				1191	/* undefined type */
				1192	break;
				1193	}
				1194
				1195	return 0xffff;
				1196	}
				1197
				1198	/* sets of name characters, maximum name lengths ---------------------------- */
				1199
				1200	#define SET_ADD(set, c) ((set)[(uint8_t)c>>5]\|=((uint32_t)1<<((uint8_t)c&0x1f)))
				1201	#define SET_CONTAINS(set, c) (((set)[(uint8_t)c>>5]&((uint32_t)1<<((uint8_t)c&0x1f)))!=0)
				1202
				1203	static int32_t
				1204	calcStringSetLength(uint32_t set[8], const char *s) {
				1205	int32_t length=0;
				1206	char c;
				1207
				1208	while((c=*s++)!=0) {
				1209	SET_ADD(set, c);
				1210	++length;
				1211	}
				1212	return length;
				1213	}
				1214
				1215	static int32_t
				1216	calcAlgNameSetsLengths(int32_t maxNameLength) {
				1217	AlgorithmicRange *range;
				1218	uint32_t *p;
				1219	uint32_t rangeCount;
				1220	int32_t length;
				1221
				1222	/* enumerate algorithmic ranges */
				1223	p=(uint32_t )((uint8_t )uCharNames+uCharNames->algNamesOffset);
				1224	rangeCount=*p;
				1225	range=(AlgorithmicRange *)(p+1);
				1226	while(rangeCount>0) {
				1227	switch(range->type) {
				1228	case 0:
				1229	/* name = prefix + (range->variant times) hex-digits */
				1230	/* prefix */
				1231	length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
				1232	if(length>maxNameLength) {
				1233	maxNameLength=length;
				1234	}
				1235	break;
				1236	case 1: {
				1237	/* name = prefix factorized-elements */
				1238	const uint16_t factors=(const uint16_t )(range+1);
				1239	const char *s;
				1240	int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
				1241
				1242	/* prefix length */
				1243	s=(const char *)(factors+count);
				1244	length=calcStringSetLength(gNameSet, s);
				1245	s+=length+1; /* start of factor suffixes */
				1246
				1247	/* get the set and maximum factor suffix length for each factor */
				1248	for(i=0; i<count; ++i) {
				1249	maxFactorLength=0;
				1250	for(factor=factors[i]; factor>0; --factor) {
				1251	factorLength=calcStringSetLength(gNameSet, s);
				1252	s+=factorLength+1;
				1253	if(factorLength>maxFactorLength) {
				1254	maxFactorLength=factorLength;
				1255	}
				1256	}
				1257	length+=maxFactorLength;
				1258	}
				1259
				1260	if(length>maxNameLength) {
				1261	maxNameLength=length;
				1262	}
				1263	break;
				1264	}
				1265	default:
				1266	/* unknown type */
				1267	break;
				1268	}
				1269
				1270	range=(AlgorithmicRange )((uint8_t )range+range->size);
				1271	--rangeCount;
				1272	}
				1273	return maxNameLength;
				1274	}
				1275
				1276	static int32_t
				1277	calcExtNameSetsLengths(int32_t maxNameLength) {
				1278	int32_t i, length;
				1279
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	1280	for(i=0; i<UPRV_LENGTHOF(charCatNames); ++i) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1281	/*
				1282	* for each category, count the length of the category name
				1283	* plus 9=
				1284	* 2 for <>
				1285	* 1 for -
				1286	* 6 for most hex digits per code point
				1287	*/
				1288	length=9+calcStringSetLength(gNameSet, charCatNames[i]);
				1289	if(length>maxNameLength) {
				1290	maxNameLength=length;
				1291	}
				1292	}
				1293	return maxNameLength;
				1294	}
				1295
				1296	static int32_t
				1297	calcNameSetLength(const uint16_t tokens, uint16_t tokenCount, const uint8_t tokenStrings, int8_t *tokenLengths,
				1298	uint32_t set[8],
				1299	const uint8_t *pLine, const uint8_t lineLimit) {
				1300	const uint8_t line=pLine;
				1301	int32_t length=0, tokenLength;
				1302	uint16_t c, token;
				1303
				1304	while(line!=lineLimit && (c=*line++)!=(uint8_t)';') {
				1305	if(c>=tokenCount) {
				1306	/* implicit letter */
				1307	SET_ADD(set, c);
				1308	++length;
				1309	} else {
				1310	token=tokens[c];
				1311	if(token==(uint16_t)(-2)) {
				1312	/* this is a lead byte for a double-byte token */
				1313	c=c<<8\|*line++;
				1314	token=tokens[c];
				1315	}
				1316	if(token==(uint16_t)(-1)) {
				1317	/* explicit letter */
				1318	SET_ADD(set, c);
				1319	++length;
				1320	} else {
				1321	/* count token word */
				1322	if(tokenLengths!=NULL) {
				1323	/* use cached token length */
				1324	tokenLength=tokenLengths[c];
				1325	if(tokenLength==0) {
				1326	tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
				1327	tokenLengths[c]=(int8_t)tokenLength;
				1328	}
				1329	} else {
				1330	tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
				1331	}
				1332	length+=tokenLength;
				1333	}
				1334	}
				1335	}
				1336
				1337	*pLine=line;
				1338	return length;
				1339	}
				1340
				1341	static void
				1342	calcGroupNameSetsLengths(int32_t maxNameLength) {
				1343	uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
				1344
				1345	uint16_t tokens=(uint16_t )uCharNames+8;
				1346	uint16_t tokenCount=*tokens++;
				1347	uint8_t tokenStrings=(uint8_t )uCharNames+uCharNames->tokenStringOffset;
				1348
				1349	int8_t *tokenLengths;
				1350
				1351	const uint16_t *group;
				1352	const uint8_t s, line, *lineLimit;
				1353
				1354	int32_t groupCount, lineNumber, length;
				1355
				1356	tokenLengths=(int8_t *)uprv_malloc(tokenCount);
				1357	if(tokenLengths!=NULL) {
				1358	uprv_memset(tokenLengths, 0, tokenCount);
				1359	}
				1360
				1361	group=GET_GROUPS(uCharNames);
				1362	groupCount=*group++;
				1363
				1364	/* enumerate all groups */
				1365	while(groupCount>0) {
				1366	s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
				1367	s=expandGroupLengths(s, offsets, lengths);
				1368
				1369	/* enumerate all lines in each group */
				1370	for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
				1371	line=s+offsets[lineNumber];
				1372	length=lengths[lineNumber];
				1373	if(length==0) {
				1374	continue;
				1375	}
				1376
				1377	lineLimit=line+length;
				1378
				1379	/* read regular name */
				1380	length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
				1381	if(length>maxNameLength) {
				1382	maxNameLength=length;
				1383	}
				1384	if(line==lineLimit) {
				1385	continue;
				1386	}
				1387
				1388	/* read Unicode 1.0 name */
				1389	length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
				1390	if(length>maxNameLength) {
				1391	maxNameLength=length;
				1392	}
				1393	if(line==lineLimit) {
				1394	continue;
				1395	}
				1396
				1397	/* read ISO comment */
				1398	/length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);/
				1399	}
				1400
				1401	group=NEXT_GROUP(group);
				1402	--groupCount;
				1403	}
				1404
				1405	if(tokenLengths!=NULL) {
				1406	uprv_free(tokenLengths);
				1407	}
				1408
				1409	/* set gMax... - name length last for threading */
				1410	gMaxNameLength=maxNameLength;
				1411	}
				1412
				1413	static UBool
				1414	calcNameSetsLengths(UErrorCode *pErrorCode) {
				1415	static const char extChars[]="0123456789ABCDEF<>-";
				1416	int32_t i, maxNameLength;
				1417
				1418	if(gMaxNameLength!=0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1419	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1420	}
				1421
				1422	if(!isDataLoaded(pErrorCode)) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1423	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1424	}
				1425
				1426	/* set hex digits, used in various names, and <>-, used in extended names */
				1427	for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) {
				1428	SET_ADD(gNameSet, extChars[i]);
				1429	}
				1430
				1431	/* set sets and lengths from algorithmic names */
				1432	maxNameLength=calcAlgNameSetsLengths(0);
				1433
				1434	/* set sets and lengths from extended names */
				1435	maxNameLength=calcExtNameSetsLengths(maxNameLength);
				1436
				1437	/* set sets and lengths from group names, set global maximum values */
				1438	calcGroupNameSetsLengths(maxNameLength);
				1439
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1440	return true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1441	}
				1442
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	1443	U_NAMESPACE_END
				1444
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1445	/* public API --------------------------------------------------------------- */
				1446
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	1447	U_NAMESPACE_USE
				1448
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1449	U_CAPI int32_t U_EXPORT2
				1450	u_charName(UChar32 code, UCharNameChoice nameChoice,
				1451	char *buffer, int32_t bufferLength,
				1452	UErrorCode *pErrorCode) {
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	1453	AlgorithmicRange *algRange;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1454	uint32_t *p;
				1455	uint32_t i;
				1456	int32_t length;
				1457
				1458	/* check the argument values */
				1459	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
				1460	return 0;
				1461	} else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT \|\|
				1462	bufferLength<0 \|\| (bufferLength>0 && buffer==NULL)
				1463	) {
				1464	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
				1465	return 0;
				1466	}
				1467
				1468	if((uint32_t)code>UCHAR_MAX_VALUE \|\| !isDataLoaded(pErrorCode)) {
				1469	return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
				1470	}
				1471
				1472	length=0;
				1473
				1474	/* try algorithmic names first */
				1475	p=(uint32_t )((uint8_t )uCharNames+uCharNames->algNamesOffset);
				1476	i=*p;
				1477	algRange=(AlgorithmicRange *)(p+1);
				1478	while(i>0) {
				1479	if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
				1480	length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
				1481	break;
				1482	}
				1483	algRange=(AlgorithmicRange )((uint8_t )algRange+algRange->size);
				1484	--i;
				1485	}
				1486
				1487	if(i==0) {
				1488	if (nameChoice == U_EXTENDED_CHAR_NAME) {
				1489	length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
				1490	if (!length) {
				1491	/* extended character name */
				1492	length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
				1493	}
				1494	} else {
				1495	/* normal character name */
				1496	length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
				1497	}
				1498	}
				1499
				1500	return u_terminateChars(buffer, bufferLength, length, pErrorCode);
				1501	}
				1502
				1503	U_CAPI int32_t U_EXPORT2
				1504	u_getISOComment(UChar32 /c/,
				1505	char *dest, int32_t destCapacity,
				1506	UErrorCode *pErrorCode) {
				1507	/* check the argument values */
				1508	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
				1509	return 0;
				1510	} else if(destCapacity<0 \|\| (destCapacity>0 && dest==NULL)) {
				1511	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
				1512	return 0;
				1513	}
				1514
				1515	return u_terminateChars(dest, destCapacity, 0, pErrorCode);
				1516	}
				1517
				1518	U_CAPI UChar32 U_EXPORT2
				1519	u_charFromName(UCharNameChoice nameChoice,
				1520	const char *name,
				1521	UErrorCode *pErrorCode) {
Frank Tang	f222396	2020-04-27 18:25:29 -0700	[diff] [blame]	1522	char upper[120] = {0};
				1523	char lower[120] = {0};
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1524	FindName findName;
				1525	AlgorithmicRange *algRange;
				1526	uint32_t *p;
				1527	uint32_t i;
				1528	UChar32 cp = 0;
				1529	char c0;
Frank Tang	69c72a6	2019-04-03 21:41:21 -0700	[diff] [blame]	1530	static constexpr UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1531
				1532	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
				1533	return error;
				1534	}
				1535
				1536	if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT \|\| name==NULL \|\| *name==0) {
				1537	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
				1538	return error;
				1539	}
				1540
				1541	if(!isDataLoaded(pErrorCode)) {
				1542	return error;
				1543	}
				1544
				1545	/* construct the uppercase and lowercase of the name first */
				1546	for(i=0; i<sizeof(upper); ++i) {
				1547	if((c0=*name++)!=0) {
				1548	upper[i]=uprv_toupper(c0);
				1549	lower[i]=uprv_tolower(c0);
				1550	} else {
				1551	upper[i]=lower[i]=0;
				1552	break;
				1553	}
				1554	}
				1555	if(i==sizeof(upper)) {
				1556	/* name too long, there is no such character */
				1557	*pErrorCode = U_ILLEGAL_CHAR_FOUND;
				1558	return error;
				1559	}
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	1560	// i==strlen(name)==strlen(lower)==strlen(upper)
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1561
				1562	/* try extended names first */
				1563	if (lower[0] == '<') {
Frank Tang	69c72a6	2019-04-03 21:41:21 -0700	[diff] [blame]	1564	if (nameChoice == U_EXTENDED_CHAR_NAME && lower[--i] == '>') {
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	1565	// Parse a string like "<category-HHHH>" where HHHH is a hex code point.
Frank Tang	69c72a6	2019-04-03 21:41:21 -0700	[diff] [blame]	1566	uint32_t limit = i;
				1567	while (i >= 3 && lower[--i] != '-') {}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1568
Frank Tang	69c72a6	2019-04-03 21:41:21 -0700	[diff] [blame]	1569	// There should be 1 to 8 hex digits.
				1570	int32_t hexLength = limit - (i + 1);
				1571	if (i >= 2 && lower[i] == '-' && 1 <= hexLength && hexLength <= 8) {
				1572	uint32_t cIdx;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1573
Frank Tang	69c72a6	2019-04-03 21:41:21 -0700	[diff] [blame]	1574	lower[i] = 0;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1575
Frank Tang	69c72a6	2019-04-03 21:41:21 -0700	[diff] [blame]	1576	for (++i; i < limit; ++i) {
				1577	if (lower[i] >= '0' && lower[i] <= '9') {
				1578	cp = (cp << 4) + lower[i] - '0';
				1579	} else if (lower[i] >= 'a' && lower[i] <= 'f') {
				1580	cp = (cp << 4) + lower[i] - 'a' + 10;
				1581	} else {
				1582	*pErrorCode = U_ILLEGAL_CHAR_FOUND;
				1583	return error;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1584	}
Frank Tang	69c72a6	2019-04-03 21:41:21 -0700	[diff] [blame]	1585	// Prevent signed-integer overflow and out-of-range code points.
				1586	if (cp > UCHAR_MAX_VALUE) {
				1587	*pErrorCode = U_ILLEGAL_CHAR_FOUND;
				1588	return error;
				1589	}
				1590	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1591
Frank Tang	69c72a6	2019-04-03 21:41:21 -0700	[diff] [blame]	1592	/* Now validate the category name.
				1593	We could use a binary search, or a trie, if
				1594	we really wanted to. */
				1595	uint8_t cat = getCharCat(cp);
				1596	for (lower[i] = 0, cIdx = 0; cIdx < UPRV_LENGTHOF(charCatNames); ++cIdx) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1597
Frank Tang	69c72a6	2019-04-03 21:41:21 -0700	[diff] [blame]	1598	if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
				1599	if (cat == cIdx) {
				1600	return cp;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1601	}
Frank Tang	69c72a6	2019-04-03 21:41:21 -0700	[diff] [blame]	1602	break;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1603	}
				1604	}
				1605	}
				1606	}
				1607
				1608	*pErrorCode = U_ILLEGAL_CHAR_FOUND;
				1609	return error;
				1610	}
				1611
				1612	/* try algorithmic names now */
				1613	p=(uint32_t )((uint8_t )uCharNames+uCharNames->algNamesOffset);
				1614	i=*p;
				1615	algRange=(AlgorithmicRange *)(p+1);
				1616	while(i>0) {
				1617	if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
				1618	return cp;
				1619	}
				1620	algRange=(AlgorithmicRange )((uint8_t )algRange+algRange->size);
				1621	--i;
				1622	}
				1623
				1624	/* normal character name */
				1625	findName.otherName=upper;
				1626	findName.code=error;
				1627	enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
				1628	if (findName.code == error) {
				1629	*pErrorCode = U_ILLEGAL_CHAR_FOUND;
				1630	}
				1631	return findName.code;
				1632	}
				1633
				1634	U_CAPI void U_EXPORT2
				1635	u_enumCharNames(UChar32 start, UChar32 limit,
				1636	UEnumCharNamesFn *fn,
				1637	void *context,
				1638	UCharNameChoice nameChoice,
				1639	UErrorCode *pErrorCode) {
				1640	AlgorithmicRange *algRange;
				1641	uint32_t *p;
				1642	uint32_t i;
				1643
				1644	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
				1645	return;
				1646	}
				1647
				1648	if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT \|\| fn==NULL) {
				1649	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
				1650	return;
				1651	}
				1652
				1653	if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
				1654	limit = UCHAR_MAX_VALUE + 1;
				1655	}
				1656	if((uint32_t)start>=(uint32_t)limit) {
				1657	return;
				1658	}
				1659
				1660	if(!isDataLoaded(pErrorCode)) {
				1661	return;
				1662	}
				1663
				1664	/* interleave the data-driven ones with the algorithmic ones */
				1665	/* iterate over all algorithmic ranges; assume that they are in ascending order */
				1666	p=(uint32_t )((uint8_t )uCharNames+uCharNames->algNamesOffset);
				1667	i=*p;
				1668	algRange=(AlgorithmicRange *)(p+1);
				1669	while(i>0) {
				1670	/* enumerate the character names before the current algorithmic range */
				1671	/* here: start<limit */
				1672	if((uint32_t)start<algRange->start) {
				1673	if((uint32_t)limit<=algRange->start) {
				1674	enumNames(uCharNames, start, limit, fn, context, nameChoice);
				1675	return;
				1676	}
				1677	if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
				1678	return;
				1679	}
				1680	start=(UChar32)algRange->start;
				1681	}
				1682	/* enumerate the character names in the current algorithmic range */
				1683	/* here: algRange->start<=start<limit */
				1684	if((uint32_t)start<=algRange->end) {
				1685	if((uint32_t)limit<=(algRange->end+1)) {
				1686	enumAlgNames(algRange, start, limit, fn, context, nameChoice);
				1687	return;
				1688	}
				1689	if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
				1690	return;
				1691	}
				1692	start=(UChar32)algRange->end+1;
				1693	}
				1694	/* continue to the next algorithmic range (here: start<limit) */
				1695	algRange=(AlgorithmicRange )((uint8_t )algRange+algRange->size);
				1696	--i;
				1697	}
				1698	/* enumerate the character names after the last algorithmic range */
				1699	enumNames(uCharNames, start, limit, fn, context, nameChoice);
				1700	}
				1701
				1702	U_CAPI int32_t U_EXPORT2
				1703	uprv_getMaxCharNameLength() {
				1704	UErrorCode errorCode=U_ZERO_ERROR;
				1705	if(calcNameSetsLengths(&errorCode)) {
				1706	return gMaxNameLength;
				1707	} else {
				1708	return 0;
				1709	}
				1710	}
				1711
				1712	/**
				1713	* Converts the char set cset into a Unicode set uset.
				1714	* @param cset Set of 256 bit flags corresponding to a set of chars.
				1715	* @param uset USet to receive characters. Existing contents are deleted.
				1716	*/
				1717	static void
				1718	charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
				1719	UChar us[256];
				1720	char cs[256];
				1721
				1722	int32_t i, length;
				1723	UErrorCode errorCode;
				1724
				1725	errorCode=U_ZERO_ERROR;
				1726
				1727	if(!calcNameSetsLengths(&errorCode)) {
				1728	return;
				1729	}
				1730
				1731	/* build a char string with all chars that are used in character names */
				1732	length=0;
				1733	for(i=0; i<256; ++i) {
				1734	if(SET_CONTAINS(cset, i)) {
				1735	cs[length++]=(char)i;
				1736	}
				1737	}
				1738
				1739	/* convert the char string to a UChar string */
				1740	u_charsToUChars(cs, us, length);
				1741
				1742	/* add each UChar to the USet */
				1743	for(i=0; i<length; ++i) {
				1744	if(us[i]!=0 \|\| cs[i]==0) { /* non-invariant chars become (UChar)0 */
				1745	sa->add(sa->set, us[i]);
				1746	}
				1747	}
				1748	}
				1749
				1750	/**
				1751	* Fills set with characters that are used in Unicode character names.
				1752	* @param set USet to receive characters.
				1753	*/
				1754	U_CAPI void U_EXPORT2
				1755	uprv_getCharNameCharacters(const USetAdder *sa) {
				1756	charSetToUSet(gNameSet, sa);
				1757	}
				1758
				1759	/* data swapping ------------------------------------------------------------ */
				1760
				1761	/*
				1762	* The token table contains non-negative entries for token bytes,
				1763	* and -1 for bytes that represent themselves in the data file's charset.
				1764	* -2 entries are used for lead bytes.
				1765	*
				1766	* Direct bytes (-1 entries) must be translated from the input charset family
				1767	* to the output charset family.
				1768	* makeTokenMap() writes a permutation mapping for this.
				1769	* Use it once for single-/lead-byte tokens and once more for all trail byte
				1770	* tokens. (';' is an unused trail byte marked with -1.)
				1771	*/
				1772	static void
				1773	makeTokenMap(const UDataSwapper *ds,
				1774	int16_t tokens[], uint16_t tokenCount,
				1775	uint8_t map[256],
				1776	UErrorCode *pErrorCode) {
				1777	UBool usedOutChar[256];
				1778	uint16_t i, j;
				1779	uint8_t c1, c2;
				1780
				1781	if(U_FAILURE(*pErrorCode)) {
				1782	return;
				1783	}
				1784
				1785	if(ds->inCharset==ds->outCharset) {
				1786	/* Same charset family: identity permutation */
				1787	for(i=0; i<256; ++i) {
				1788	map[i]=(uint8_t)i;
				1789	}
				1790	} else {
				1791	uprv_memset(map, 0, 256);
				1792	uprv_memset(usedOutChar, 0, 256);
				1793
				1794	if(tokenCount>256) {
				1795	tokenCount=256;
				1796	}
				1797
				1798	/* set the direct bytes (byte 0 always maps to itself) */
				1799	for(i=1; i<tokenCount; ++i) {
				1800	if(tokens[i]==-1) {
				1801	/* convert the direct byte character */
				1802	c1=(uint8_t)i;
				1803	ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
				1804	if(U_FAILURE(*pErrorCode)) {
				1805	udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
				1806	i, ds->inCharset);
				1807	return;
				1808	}
				1809
				1810	/* enter the converted character into the map and mark it used */
				1811	map[c1]=c2;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1812	usedOutChar[c2]=true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1813	}
				1814	}
				1815
				1816	/* set the mappings for the rest of the permutation */
				1817	for(i=j=1; i<tokenCount; ++i) {
				1818	/* set mappings that were not set for direct bytes */
				1819	if(map[i]==0) {
				1820	/* set an output byte value that was not used as an output byte above */
				1821	while(usedOutChar[j]) {
				1822	++j;
				1823	}
				1824	map[i]=(uint8_t)j++;
				1825	}
				1826	}
				1827
				1828	/*
				1829	* leave mappings at tokenCount and above unset if tokenCount<256
				1830	* because they won't be used
				1831	*/
				1832	}
				1833	}
				1834
				1835	U_CAPI int32_t U_EXPORT2
				1836	uchar_swapNames(const UDataSwapper *ds,
				1837	const void inData, int32_t length, void outData,
				1838	UErrorCode *pErrorCode) {
				1839	const UDataInfo *pInfo;
				1840	int32_t headerSize;
				1841
				1842	const uint8_t *inBytes;
				1843	uint8_t *outBytes;
				1844
				1845	uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
				1846	offset, i, count, stringsCount;
				1847
				1848	const AlgorithmicRange *inRange;
				1849	AlgorithmicRange *outRange;
				1850
				1851	/* udata_swapDataHeader checks the arguments */
				1852	headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
				1853	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
				1854	return 0;
				1855	}
				1856
				1857	/* check data format and format version */
				1858	pInfo=(const UDataInfo )((const char )inData+4);
				1859	if(!(
				1860	pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
				1861	pInfo->dataFormat[1]==0x6e &&
				1862	pInfo->dataFormat[2]==0x61 &&
				1863	pInfo->dataFormat[3]==0x6d &&
				1864	pInfo->formatVersion[0]==1
				1865	)) {
				1866	udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
				1867	pInfo->dataFormat[0], pInfo->dataFormat[1],
				1868	pInfo->dataFormat[2], pInfo->dataFormat[3],
				1869	pInfo->formatVersion[0]);
				1870	*pErrorCode=U_UNSUPPORTED_ERROR;
				1871	return 0;
				1872	}
				1873
				1874	inBytes=(const uint8_t *)inData+headerSize;
				1875	outBytes=(uint8_t *)outData+headerSize;
				1876	if(length<0) {
				1877	algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
				1878	} else {
				1879	length-=headerSize;
				1880	if( length<20 \|\|
				1881	(uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
				1882	) {
				1883	udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
				1884	length);
				1885	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
				1886	return 0;
				1887	}
				1888	}
				1889
				1890	if(length<0) {
				1891	/* preflighting: iterate through algorithmic ranges */
				1892	offset=algNamesOffset;
				1893	count=ds->readUInt32(((const uint32_t )(inBytes+offset)));
				1894	offset+=4;
				1895
				1896	for(i=0; i<count; ++i) {
				1897	inRange=(const AlgorithmicRange *)(inBytes+offset);
				1898	offset+=ds->readUInt16(inRange->size);
				1899	}
				1900	} else {
				1901	/* swap data */
				1902	const uint16_t *p;
				1903	uint16_t q, temp;
				1904
				1905	int16_t tokens[512];
				1906	uint16_t tokenCount;
				1907
				1908	uint8_t map[256], trailMap[256];
				1909
				1910	/* copy the data for inaccessible bytes */
				1911	if(inBytes!=outBytes) {
				1912	uprv_memcpy(outBytes, inBytes, length);
				1913	}
				1914
				1915	/* the initial 4 offsets first */
				1916	tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
				1917	groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
				1918	groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
				1919	ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
				1920
				1921	/*
				1922	* now the tokens table
				1923	* it needs to be permutated along with the compressed name strings
				1924	*/
				1925	p=(const uint16_t *)(inBytes+16);
				1926	q=(uint16_t *)(outBytes+16);
				1927
				1928	/* read and swap the tokenCount */
				1929	tokenCount=ds->readUInt16(*p);
				1930	ds->swapArray16(ds, p, 2, q, pErrorCode);
				1931	++p;
				1932	++q;
				1933
				1934	/* read the first 512 tokens and make the token maps */
				1935	if(tokenCount<=512) {
				1936	count=tokenCount;
				1937	} else {
				1938	count=512;
				1939	}
				1940	for(i=0; i<count; ++i) {
				1941	tokens[i]=udata_readInt16(ds, p[i]);
				1942	}
				1943	for(; i<512; ++i) {
				1944	tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
				1945	}
				1946	makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
				1947	makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
				1948	if(U_FAILURE(*pErrorCode)) {
				1949	return 0;
				1950	}
				1951
				1952	/*
				1953	* swap and permutate the tokens
				1954	* go through a temporary array to support in-place swapping
				1955	*/
				1956	temp=(uint16_t )uprv_malloc(tokenCount2);
				1957	if(temp==NULL) {
				1958	udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
				1959	tokenCount);
				1960	*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
				1961	return 0;
				1962	}
				1963
				1964	/* swap and permutate single-/lead-byte tokens */
				1965	for(i=0; i<tokenCount && i<256; ++i) {
				1966	ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
				1967	}
				1968
				1969	/* swap and permutate trail-byte tokens */
				1970	for(; i<tokenCount; ++i) {
				1971	ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
				1972	}
				1973
				1974	/* copy the result into the output and free the temporary array */
				1975	uprv_memcpy(q, temp, tokenCount*2);
				1976	uprv_free(temp);
				1977
				1978	/*
				1979	* swap the token strings but not a possible padding byte after
				1980	* the terminating NUL of the last string
				1981	*/
				1982	udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
				1983	outBytes+tokenStringOffset, pErrorCode);
				1984	if(U_FAILURE(*pErrorCode)) {
				1985	udata_printError(ds, "uchar_swapNames(token strings) failed\n");
				1986	return 0;
				1987	}
				1988
				1989	/* swap the group table */
				1990	count=ds->readUInt16(((const uint16_t )(inBytes+groupsOffset)));
				1991	ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count3)2),
				1992	outBytes+groupsOffset, pErrorCode);
				1993
				1994	/*
				1995	* swap the group strings
				1996	* swap the string bytes but not the nibble-encoded string lengths
				1997	*/
				1998	if(ds->inCharset!=ds->outCharset) {
				1999	uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
				2000
				2001	const uint8_t inStrings, nextInStrings;
				2002	uint8_t *outStrings;
				2003
				2004	uint8_t c;
				2005
				2006	inStrings=inBytes+groupStringOffset;
				2007	outStrings=outBytes+groupStringOffset;
				2008
				2009	stringsCount=algNamesOffset-groupStringOffset;
				2010
				2011	/* iterate through string groups until only a few padding bytes are left */
				2012	while(stringsCount>32) {
				2013	nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
				2014
				2015	/* move past the length bytes */
				2016	stringsCount-=(uint32_t)(nextInStrings-inStrings);
				2017	outStrings+=nextInStrings-inStrings;
				2018	inStrings=nextInStrings;
				2019
				2020	count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
				2021	stringsCount-=count;
				2022
				2023	/* swap the string bytes using map[] and trailMap[] */
				2024	while(count>0) {
				2025	c=*inStrings++;
				2026	*outStrings++=map[c];
				2027	if(tokens[c]!=-2) {
				2028	--count;
				2029	} else {
				2030	/* token lead byte: swap the trail byte, too */
				2031	outStrings++=trailMap[inStrings++];
				2032	count-=2;
				2033	}
				2034	}
				2035	}
				2036	}
				2037
				2038	/* swap the algorithmic ranges */
				2039	offset=algNamesOffset;
				2040	count=ds->readUInt32(((const uint32_t )(inBytes+offset)));
				2041	ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
				2042	offset+=4;
				2043
				2044	for(i=0; i<count; ++i) {
				2045	if(offset>(uint32_t)length) {
				2046	udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
				2047	length, i);
				2048	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
				2049	return 0;
				2050	}
				2051
				2052	inRange=(const AlgorithmicRange *)(inBytes+offset);
				2053	outRange=(AlgorithmicRange *)(outBytes+offset);
				2054	offset+=ds->readUInt16(inRange->size);
				2055
				2056	ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
				2057	ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
				2058	switch(inRange->type) {
				2059	case 0:
				2060	/* swap prefix string */
				2061	ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
				2062	outRange+1, pErrorCode);
				2063	if(U_FAILURE(*pErrorCode)) {
				2064	udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
				2065	i);
				2066	return 0;
				2067	}
				2068	break;
				2069	case 1:
				2070	{
				2071	/* swap factors and the prefix and factor strings */
				2072	uint32_t factorsCount;
				2073
				2074	factorsCount=inRange->variant;
				2075	p=(const uint16_t *)(inRange+1);
				2076	q=(uint16_t *)(outRange+1);
				2077	ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
				2078
				2079	/* swap the strings, up to the last terminating NUL */
				2080	p+=factorsCount;
				2081	q+=factorsCount;
				2082	stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
				2083	while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
				2084	--stringsCount;
				2085	}
				2086	ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
				2087	}
				2088	break;
				2089	default:
				2090	udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
				2091	inRange->type, i);
				2092	*pErrorCode=U_UNSUPPORTED_ERROR;
				2093	return 0;
				2094	}
				2095	}
				2096	}
				2097
				2098	return headerSize+(int32_t)offset;
				2099	}
				2100
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	2101	/*
				2102	* Hey, Emacs, please set the following:
				2103	*
				2104	* Local Variables:
				2105	* indent-tabs-mode: nil
				2106	* End:
				2107	*
				2108	*/