Blame - source/common/ucnv_u16.cpp - chromium.googlesource.com/chromium/deps/icu

blob: bebdede4c440ca6b25e369459f5bae8051963353 [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	3	/*
				4	**********************************************************************
Jungshik Shin	a05f412	2015-06-09 15:33:54 -0700	[diff] [blame]	5	* Copyright (C) 2002-2015, International Business Machines
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	6	* Corporation and others. All Rights Reserved.
				7	**********************************************************************
				8	* file name: ucnv_u16.cpp
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	9	* encoding: UTF-8
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	10	* tab size: 8 (not used)
				11	* indentation:4
				12	*
				13	* created on: 2002jul01
				14	* created by: Markus W. Scherer
				15	*
				16	* UTF-16 converter implementation. Used to be in ucnv_utf.c.
				17	*/
				18
				19	#include "unicode/utypes.h"
				20
				21	#if !UCONFIG_NO_CONVERSION
				22
				23	#include "unicode/ucnv.h"
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	24	#include "unicode/uversion.h"
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	25	#include "ucnv_bld.h"
				26	#include "ucnv_cnv.h"
				27	#include "cmemory.h"
				28
				29	enum {
				30	UCNV_NEED_TO_WRITE_BOM=1
				31	};
				32
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	33	U_CDECL_BEGIN
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	34	/*
				35	* The UTF-16 toUnicode implementation is also used for the Java-specific
				36	* "with BOM" variants of UTF-16BE and UTF-16LE.
				37	*/
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	38	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	39	_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
				40	UErrorCode *pErrorCode);
				41
				42	/* UTF-16BE ----------------------------------------------------------------- */
				43
				44	#if U_IS_BIG_ENDIAN
				45	# define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
				46	#else
				47	# define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
				48	#endif
				49
				50
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	51	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	52	_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
				53	UErrorCode *pErrorCode) {
				54	UConverter *cnv;
				55	const UChar *source;
				56	char *target;
				57	int32_t *offsets;
				58
				59	uint32_t targetCapacity, length, sourceIndex;
				60	UChar c, trail;
				61	char overflow[4];
				62
				63	source=pArgs->source;
				64	length=(int32_t)(pArgs->sourceLimit-source);
				65	if(length<=0) {
				66	/* no input, nothing to do */
				67	return;
				68	}
				69
				70	cnv=pArgs->converter;
				71
				72	/* write the BOM if necessary */
				73	if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	74	static const char bom[]={ (char)0xfeu, (char)0xffu };
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	75	ucnv_fromUWriteBytes(cnv,
				76	bom, 2,
				77	&pArgs->target, pArgs->targetLimit,
				78	&pArgs->offsets, -1,
				79	pErrorCode);
				80	cnv->fromUnicodeStatus=0;
				81	}
				82
				83	target=pArgs->target;
				84	if(target >= pArgs->targetLimit) {
				85	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				86	return;
				87	}
				88
				89	targetCapacity=(uint32_t)(pArgs->targetLimit-target);
				90	offsets=pArgs->offsets;
				91	sourceIndex=0;
				92
				93	/* c!=0 indicates in several places outside the main loops that a surrogate was found */
				94
				95	if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
				96	/* the last buffer ended with a lead surrogate, output the surrogate pair */
				97	++source;
				98	--length;
				99	target[0]=(uint8_t)(c>>8);
				100	target[1]=(uint8_t)c;
				101	target[2]=(uint8_t)(trail>>8);
				102	target[3]=(uint8_t)trail;
				103	target+=4;
				104	targetCapacity-=4;
				105	if(offsets!=NULL) {
				106	*offsets++=-1;
				107	*offsets++=-1;
				108	*offsets++=-1;
				109	*offsets++=-1;
				110	}
				111	sourceIndex=1;
				112	cnv->fromUChar32=c=0;
				113	}
				114
				115	if(c==0) {
				116	/* copy an even number of bytes for complete UChars */
				117	uint32_t count=2*length;
				118	if(count>targetCapacity) {
				119	count=targetCapacity&~1;
				120	}
				121	/* count is even */
				122	targetCapacity-=count;
				123	count>>=1;
				124	length-=count;
				125
				126	if(offsets==NULL) {
				127	while(count>0) {
				128	c=*source++;
				129	if(U16_IS_SINGLE(c)) {
				130	target[0]=(uint8_t)(c>>8);
				131	target[1]=(uint8_t)c;
				132	target+=2;
				133	} else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
				134	++source;
				135	--count;
				136	target[0]=(uint8_t)(c>>8);
				137	target[1]=(uint8_t)c;
				138	target[2]=(uint8_t)(trail>>8);
				139	target[3]=(uint8_t)trail;
				140	target+=4;
				141	} else {
				142	break;
				143	}
				144	--count;
				145	}
				146	} else {
				147	while(count>0) {
				148	c=*source++;
				149	if(U16_IS_SINGLE(c)) {
				150	target[0]=(uint8_t)(c>>8);
				151	target[1]=(uint8_t)c;
				152	target+=2;
				153	*offsets++=sourceIndex;
				154	*offsets++=sourceIndex++;
				155	} else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
				156	++source;
				157	--count;
				158	target[0]=(uint8_t)(c>>8);
				159	target[1]=(uint8_t)c;
				160	target[2]=(uint8_t)(trail>>8);
				161	target[3]=(uint8_t)trail;
				162	target+=4;
				163	*offsets++=sourceIndex;
				164	*offsets++=sourceIndex;
				165	*offsets++=sourceIndex;
				166	*offsets++=sourceIndex;
				167	sourceIndex+=2;
				168	} else {
				169	break;
				170	}
				171	--count;
				172	}
				173	}
				174
				175	if(count==0) {
				176	/* done with the loop for complete UChars */
				177	if(length>0 && targetCapacity>0) {
				178	/*
				179	* there is more input and some target capacity -
				180	* it must be targetCapacity==1 because otherwise
				181	* the above would have copied more;
				182	* prepare for overflow output
				183	*/
				184	if(U16_IS_SINGLE(c=*source++)) {
				185	overflow[0]=(char)(c>>8);
				186	overflow[1]=(char)c;
				187	length=2; /* 2 bytes to output */
				188	c=0;
				189	/* } else { keep c for surrogate handling, length will be set there */
				190	}
				191	} else {
				192	length=0;
				193	c=0;
				194	}
				195	} else {
				196	/* keep c for surrogate handling, length will be set there */
				197	targetCapacity+=2*count;
				198	}
				199	} else {
				200	length=0; /* from here on, length counts the bytes in overflow[] */
				201	}
				202
				203	if(c!=0) {
				204	/*
				205	* c is a surrogate, and
				206	* - source or target too short
				207	* - or the surrogate is unmatched
				208	*/
				209	length=0;
				210	if(U16_IS_SURROGATE_LEAD(c)) {
				211	if(source<pArgs->sourceLimit) {
				212	if(U16_IS_TRAIL(trail=*source)) {
				213	/* output the surrogate pair, will overflow (see conditions comment above) */
				214	++source;
				215	overflow[0]=(char)(c>>8);
				216	overflow[1]=(char)c;
				217	overflow[2]=(char)(trail>>8);
				218	overflow[3]=(char)trail;
				219	length=4; /* 4 bytes to output */
				220	c=0;
				221	} else {
				222	/* unmatched lead surrogate */
				223	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				224	}
				225	} else {
				226	/* see if the trail surrogate is in the next buffer */
				227	}
				228	} else {
				229	/* unmatched trail surrogate */
				230	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				231	}
				232	cnv->fromUChar32=c;
				233	}
				234
				235	if(length>0) {
				236	/* output length bytes with overflow (length>targetCapacity>0) */
				237	ucnv_fromUWriteBytes(cnv,
				238	overflow, length,
				239	(char **)&target, pArgs->targetLimit,
				240	&offsets, sourceIndex,
				241	pErrorCode);
				242	targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
				243	}
				244
				245	if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
				246	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				247	}
				248
				249	/* write back the updated pointers */
				250	pArgs->source=source;
				251	pArgs->target=(char *)target;
				252	pArgs->offsets=offsets;
				253	}
				254
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	255	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	256	_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
				257	UErrorCode *pErrorCode) {
				258	UConverter *cnv;
				259	const uint8_t *source;
				260	UChar *target;
				261	int32_t *offsets;
				262
				263	uint32_t targetCapacity, length, count, sourceIndex;
				264	UChar c, trail;
				265
				266	if(pArgs->converter->mode<8) {
				267	_UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
				268	return;
				269	}
				270
				271	cnv=pArgs->converter;
				272	source=(const uint8_t *)pArgs->source;
				273	length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
				274	if(length<=0 && cnv->toUnicodeStatus==0) {
				275	/* no input, nothing to do */
				276	return;
				277	}
				278
				279	target=pArgs->target;
				280	if(target >= pArgs->targetLimit) {
				281	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				282	return;
				283	}
				284
				285	targetCapacity=(uint32_t)(pArgs->targetLimit-target);
				286	offsets=pArgs->offsets;
				287	sourceIndex=0;
				288	c=0;
				289
				290	/* complete a partial UChar or pair from the last call */
				291	if(cnv->toUnicodeStatus!=0) {
				292	/*
				293	* special case: single byte from a previous buffer,
				294	* where the byte turned out not to belong to a trail surrogate
				295	* and the preceding, unmatched lead surrogate was put into toUBytes[]
				296	* for error handling
				297	*/
				298	cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
				299	cnv->toULength=1;
				300	cnv->toUnicodeStatus=0;
				301	}
				302	if((count=cnv->toULength)!=0) {
				303	uint8_t *p=cnv->toUBytes;
				304	do {
				305	p[count++]=*source++;
				306	++sourceIndex;
				307	--length;
				308	if(count==2) {
				309	c=((UChar)p[0]<<8)\|p[1];
				310	if(U16_IS_SINGLE(c)) {
				311	/* output the BMP code point */
				312	*target++=c;
				313	if(offsets!=NULL) {
				314	*offsets++=-1;
				315	}
				316	--targetCapacity;
				317	count=0;
				318	c=0;
				319	break;
				320	} else if(U16_IS_SURROGATE_LEAD(c)) {
				321	/* continue collecting bytes for the trail surrogate */
				322	c=0; /* avoid unnecessary surrogate handling below */
				323	} else {
				324	/* fall through to error handling for an unmatched trail surrogate */
				325	break;
				326	}
				327	} else if(count==4) {
				328	c=((UChar)p[0]<<8)\|p[1];
				329	trail=((UChar)p[2]<<8)\|p[3];
				330	if(U16_IS_TRAIL(trail)) {
				331	/* output the surrogate pair */
				332	*target++=c;
				333	if(targetCapacity>=2) {
				334	*target++=trail;
				335	if(offsets!=NULL) {
				336	*offsets++=-1;
				337	*offsets++=-1;
				338	}
				339	targetCapacity-=2;
				340	} else /* targetCapacity==1 */ {
				341	targetCapacity=0;
				342	cnv->UCharErrorBuffer[0]=trail;
				343	cnv->UCharErrorBufferLength=1;
				344	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				345	}
				346	count=0;
				347	c=0;
				348	break;
				349	} else {
				350	/* unmatched lead surrogate, handle here for consistent toUBytes[] */
				351	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				352
				353	/* back out reading the code unit after it */
				354	if(((const uint8_t *)pArgs->source-source)>=2) {
				355	source-=2;
				356	} else {
				357	/*
				358	* if the trail unit's first byte was in a previous buffer, then
				359	* we need to put it into a special place because toUBytes[] will be
				360	* used for the lead unit's bytes
				361	*/
				362	cnv->toUnicodeStatus=0x100\|p[2];
				363	--source;
				364	}
				365	cnv->toULength=2;
				366
				367	/* write back the updated pointers */
				368	pArgs->source=(const char *)source;
				369	pArgs->target=target;
				370	pArgs->offsets=offsets;
				371	return;
				372	}
				373	}
				374	} while(length>0);
				375	cnv->toULength=(int8_t)count;
				376	}
				377
				378	/* copy an even number of bytes for complete UChars */
				379	count=2*targetCapacity;
				380	if(count>length) {
				381	count=length&~1;
				382	}
				383	if(c==0 && count>0) {
				384	length-=count;
				385	count>>=1;
				386	targetCapacity-=count;
				387	if(offsets==NULL) {
				388	do {
				389	c=((UChar)source[0]<<8)\|source[1];
				390	source+=2;
				391	if(U16_IS_SINGLE(c)) {
				392	*target++=c;
				393	} else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
				394	U16_IS_TRAIL(trail=((UChar)source[0]<<8)\|source[1])
				395	) {
				396	source+=2;
				397	--count;
				398	*target++=c;
				399	*target++=trail;
				400	} else {
				401	break;
				402	}
				403	} while(--count>0);
				404	} else {
				405	do {
				406	c=((UChar)source[0]<<8)\|source[1];
				407	source+=2;
				408	if(U16_IS_SINGLE(c)) {
				409	*target++=c;
				410	*offsets++=sourceIndex;
				411	sourceIndex+=2;
				412	} else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
				413	U16_IS_TRAIL(trail=((UChar)source[0]<<8)\|source[1])
				414	) {
				415	source+=2;
				416	--count;
				417	*target++=c;
				418	*target++=trail;
				419	*offsets++=sourceIndex;
				420	*offsets++=sourceIndex;
				421	sourceIndex+=4;
				422	} else {
				423	break;
				424	}
				425	} while(--count>0);
				426	}
				427
				428	if(count==0) {
				429	/* done with the loop for complete UChars */
				430	c=0;
				431	} else {
				432	/* keep c for surrogate handling, trail will be set there */
				433	length+=2(count-1); / one more byte pair was consumed than count decremented */
				434	targetCapacity+=count;
				435	}
				436	}
				437
				438	if(c!=0) {
				439	/*
				440	* c is a surrogate, and
				441	* - source or target too short
				442	* - or the surrogate is unmatched
				443	*/
				444	cnv->toUBytes[0]=(uint8_t)(c>>8);
				445	cnv->toUBytes[1]=(uint8_t)c;
				446	cnv->toULength=2;
				447
				448	if(U16_IS_SURROGATE_LEAD(c)) {
				449	if(length>=2) {
				450	if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)\|source[1])) {
				451	/* output the surrogate pair, will overflow (see conditions comment above) */
				452	source+=2;
				453	length-=2;
				454	*target++=c;
				455	if(offsets!=NULL) {
				456	*offsets++=sourceIndex;
				457	}
				458	cnv->UCharErrorBuffer[0]=trail;
				459	cnv->UCharErrorBufferLength=1;
				460	cnv->toULength=0;
				461	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				462	} else {
				463	/* unmatched lead surrogate */
				464	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				465	}
				466	} else {
				467	/* see if the trail surrogate is in the next buffer */
				468	}
				469	} else {
				470	/* unmatched trail surrogate */
				471	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				472	}
				473	}
				474
				475	if(U_SUCCESS(*pErrorCode)) {
				476	/* check for a remaining source byte */
				477	if(length>0) {
				478	if(targetCapacity==0) {
				479	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				480	} else {
				481	/* it must be length==1 because otherwise the above would have copied more */
				482	cnv->toUBytes[cnv->toULength++]=*source++;
				483	}
				484	}
				485	}
				486
				487	/* write back the updated pointers */
				488	pArgs->source=(const char *)source;
				489	pArgs->target=target;
				490	pArgs->offsets=offsets;
				491	}
				492
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	493	static UChar32 U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	494	_UTF16BEGetNextUChar(UConverterToUnicodeArgs pArgs, UErrorCode err) {
				495	const uint8_t s, sourceLimit;
				496	UChar32 c;
				497
				498	if(pArgs->converter->mode<8) {
				499	return UCNV_GET_NEXT_UCHAR_USE_TO_U;
				500	}
				501
				502	s=(const uint8_t *)pArgs->source;
				503	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
				504
				505	if(s>=sourceLimit) {
				506	/* no input */
				507	*err=U_INDEX_OUTOFBOUNDS_ERROR;
				508	return 0xffff;
				509	}
				510
				511	if(s+2>sourceLimit) {
				512	/* only one byte: truncated UChar */
				513	pArgs->converter->toUBytes[0]=*s++;
				514	pArgs->converter->toULength=1;
				515	pArgs->source=(const char *)s;
				516	*err = U_TRUNCATED_CHAR_FOUND;
				517	return 0xffff;
				518	}
				519
				520	/* get one UChar */
				521	c=((UChar32)*s<<8)\|s[1];
				522	s+=2;
				523
				524	/* check for a surrogate pair */
				525	if(U_IS_SURROGATE(c)) {
				526	if(U16_IS_SURROGATE_LEAD(c)) {
				527	if(s+2<=sourceLimit) {
				528	UChar trail;
				529
				530	/* get a second UChar and see if it is a trail surrogate */
				531	trail=((UChar)*s<<8)\|s[1];
				532	if(U16_IS_TRAIL(trail)) {
				533	c=U16_GET_SUPPLEMENTARY(c, trail);
				534	s+=2;
				535	} else {
				536	/* unmatched lead surrogate */
				537	c=-2;
				538	}
				539	} else {
				540	/* too few (2 or 3) bytes for a surrogate pair: truncated code point */
				541	uint8_t *bytes=pArgs->converter->toUBytes;
				542	s-=2;
				543	pArgs->converter->toULength=(int8_t)(sourceLimit-s);
				544	do {
				545	bytes++=s++;
				546	} while(s<sourceLimit);
				547
				548	c=0xffff;
				549	*err=U_TRUNCATED_CHAR_FOUND;
				550	}
				551	} else {
				552	/* unmatched trail surrogate */
				553	c=-2;
				554	}
				555
				556	if(c<0) {
				557	/* write the unmatched surrogate */
				558	uint8_t *bytes=pArgs->converter->toUBytes;
				559	pArgs->converter->toULength=2;
				560	bytes=(s-2);
				561	bytes[1]=*(s-1);
				562
				563	c=0xffff;
				564	*err=U_ILLEGAL_CHAR_FOUND;
				565	}
				566	}
				567
				568	pArgs->source=(const char *)s;
				569	return c;
				570	}
				571
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	572	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	573	_UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
				574	if(choice<=UCNV_RESET_TO_UNICODE) {
				575	/* reset toUnicode state */
				576	if(UCNV_GET_VERSION(cnv)==0) {
				577	cnv->mode=8; /* no BOM handling */
				578	} else {
				579	cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
				580	}
				581	}
				582	if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
				583	/* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
				584	cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
				585	}
				586	}
				587
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	588	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	589	_UTF16BEOpen(UConverter *cnv,
				590	UConverterLoadArgs *pArgs,
				591	UErrorCode *pErrorCode) {
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	592	(void)pArgs;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	593	if(UCNV_GET_VERSION(cnv)<=1) {
				594	_UTF16BEReset(cnv, UCNV_RESET_BOTH);
				595	} else {
				596	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
				597	}
				598	}
				599
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	600	static const char * U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	601	_UTF16BEGetName(const UConverter *cnv) {
				602	if(UCNV_GET_VERSION(cnv)==0) {
				603	return "UTF-16BE";
				604	} else {
				605	return "UTF-16BE,version=1";
				606	}
				607	}
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	608	U_CDECL_END
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	609
				610	static const UConverterImpl _UTF16BEImpl={
				611	UCNV_UTF16_BigEndian,
				612
				613	NULL,
				614	NULL,
				615
				616	_UTF16BEOpen,
				617	NULL,
				618	_UTF16BEReset,
				619
				620	_UTF16BEToUnicodeWithOffsets,
				621	_UTF16BEToUnicodeWithOffsets,
				622	_UTF16BEFromUnicodeWithOffsets,
				623	_UTF16BEFromUnicodeWithOffsets,
				624	_UTF16BEGetNextUChar,
				625
				626	NULL,
				627	_UTF16BEGetName,
				628	NULL,
				629	NULL,
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	630	ucnv_getNonSurrogateUnicodeSet,
				631
				632	NULL,
				633	NULL
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	634	};
				635
				636	static const UConverterStaticData _UTF16BEStaticData={
				637	sizeof(UConverterStaticData),
				638	"UTF-16BE",
				639	1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	640	{ 0xff, 0xfd, 0, 0 },2,false,false,
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	641	0,
				642	0,
				643	{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
				644	};
				645
				646
Jungshik Shin	a05f412	2015-06-09 15:33:54 -0700	[diff] [blame]	647	const UConverterSharedData _UTF16BEData=
				648	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData, &_UTF16BEImpl);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	649
				650	/* UTF-16LE ----------------------------------------------------------------- */
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	651	U_CDECL_BEGIN
				652	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	653	_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
				654	UErrorCode *pErrorCode) {
				655	UConverter *cnv;
				656	const UChar *source;
				657	char *target;
				658	int32_t *offsets;
				659
				660	uint32_t targetCapacity, length, sourceIndex;
				661	UChar c, trail;
				662	char overflow[4];
				663
				664	source=pArgs->source;
				665	length=(int32_t)(pArgs->sourceLimit-source);
				666	if(length<=0) {
				667	/* no input, nothing to do */
				668	return;
				669	}
				670
				671	cnv=pArgs->converter;
				672
				673	/* write the BOM if necessary */
				674	if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
Jungshik Shin	42d5027	2018-10-24 01:22:09 -0700	[diff] [blame]	675	static const char bom[]={ (char)0xffu, (char)0xfeu };
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	676	ucnv_fromUWriteBytes(cnv,
				677	bom, 2,
				678	&pArgs->target, pArgs->targetLimit,
				679	&pArgs->offsets, -1,
				680	pErrorCode);
				681	cnv->fromUnicodeStatus=0;
				682	}
				683
				684	target=pArgs->target;
				685	if(target >= pArgs->targetLimit) {
				686	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				687	return;
				688	}
				689
				690	targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
				691	offsets=pArgs->offsets;
				692	sourceIndex=0;
				693
				694	/* c!=0 indicates in several places outside the main loops that a surrogate was found */
				695
				696	if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
				697	/* the last buffer ended with a lead surrogate, output the surrogate pair */
				698	++source;
				699	--length;
				700	target[0]=(uint8_t)c;
				701	target[1]=(uint8_t)(c>>8);
				702	target[2]=(uint8_t)trail;
				703	target[3]=(uint8_t)(trail>>8);
				704	target+=4;
				705	targetCapacity-=4;
				706	if(offsets!=NULL) {
				707	*offsets++=-1;
				708	*offsets++=-1;
				709	*offsets++=-1;
				710	*offsets++=-1;
				711	}
				712	sourceIndex=1;
				713	cnv->fromUChar32=c=0;
				714	}
				715
				716	if(c==0) {
				717	/* copy an even number of bytes for complete UChars */
				718	uint32_t count=2*length;
				719	if(count>targetCapacity) {
				720	count=targetCapacity&~1;
				721	}
				722	/* count is even */
				723	targetCapacity-=count;
				724	count>>=1;
				725	length-=count;
				726
				727	if(offsets==NULL) {
				728	while(count>0) {
				729	c=*source++;
				730	if(U16_IS_SINGLE(c)) {
				731	target[0]=(uint8_t)c;
				732	target[1]=(uint8_t)(c>>8);
				733	target+=2;
				734	} else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
				735	++source;
				736	--count;
				737	target[0]=(uint8_t)c;
				738	target[1]=(uint8_t)(c>>8);
				739	target[2]=(uint8_t)trail;
				740	target[3]=(uint8_t)(trail>>8);
				741	target+=4;
				742	} else {
				743	break;
				744	}
				745	--count;
				746	}
				747	} else {
				748	while(count>0) {
				749	c=*source++;
				750	if(U16_IS_SINGLE(c)) {
				751	target[0]=(uint8_t)c;
				752	target[1]=(uint8_t)(c>>8);
				753	target+=2;
				754	*offsets++=sourceIndex;
				755	*offsets++=sourceIndex++;
				756	} else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
				757	++source;
				758	--count;
				759	target[0]=(uint8_t)c;
				760	target[1]=(uint8_t)(c>>8);
				761	target[2]=(uint8_t)trail;
				762	target[3]=(uint8_t)(trail>>8);
				763	target+=4;
				764	*offsets++=sourceIndex;
				765	*offsets++=sourceIndex;
				766	*offsets++=sourceIndex;
				767	*offsets++=sourceIndex;
				768	sourceIndex+=2;
				769	} else {
				770	break;
				771	}
				772	--count;
				773	}
				774	}
				775
				776	if(count==0) {
				777	/* done with the loop for complete UChars */
				778	if(length>0 && targetCapacity>0) {
				779	/*
				780	* there is more input and some target capacity -
				781	* it must be targetCapacity==1 because otherwise
				782	* the above would have copied more;
				783	* prepare for overflow output
				784	*/
				785	if(U16_IS_SINGLE(c=*source++)) {
				786	overflow[0]=(char)c;
				787	overflow[1]=(char)(c>>8);
				788	length=2; /* 2 bytes to output */
				789	c=0;
				790	/* } else { keep c for surrogate handling, length will be set there */
				791	}
				792	} else {
				793	length=0;
				794	c=0;
				795	}
				796	} else {
				797	/* keep c for surrogate handling, length will be set there */
				798	targetCapacity+=2*count;
				799	}
				800	} else {
				801	length=0; /* from here on, length counts the bytes in overflow[] */
				802	}
				803
				804	if(c!=0) {
				805	/*
				806	* c is a surrogate, and
				807	* - source or target too short
				808	* - or the surrogate is unmatched
				809	*/
				810	length=0;
				811	if(U16_IS_SURROGATE_LEAD(c)) {
				812	if(source<pArgs->sourceLimit) {
				813	if(U16_IS_TRAIL(trail=*source)) {
				814	/* output the surrogate pair, will overflow (see conditions comment above) */
				815	++source;
				816	overflow[0]=(char)c;
				817	overflow[1]=(char)(c>>8);
				818	overflow[2]=(char)trail;
				819	overflow[3]=(char)(trail>>8);
				820	length=4; /* 4 bytes to output */
				821	c=0;
				822	} else {
				823	/* unmatched lead surrogate */
				824	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				825	}
				826	} else {
				827	/* see if the trail surrogate is in the next buffer */
				828	}
				829	} else {
				830	/* unmatched trail surrogate */
				831	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				832	}
				833	cnv->fromUChar32=c;
				834	}
				835
				836	if(length>0) {
				837	/* output length bytes with overflow (length>targetCapacity>0) */
				838	ucnv_fromUWriteBytes(cnv,
				839	overflow, length,
				840	&target, pArgs->targetLimit,
				841	&offsets, sourceIndex,
				842	pErrorCode);
				843	targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
				844	}
				845
				846	if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
				847	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				848	}
				849
				850	/* write back the updated pointers */
				851	pArgs->source=source;
				852	pArgs->target=target;
				853	pArgs->offsets=offsets;
				854	}
				855
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	856	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	857	_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
				858	UErrorCode *pErrorCode) {
				859	UConverter *cnv;
				860	const uint8_t *source;
				861	UChar *target;
				862	int32_t *offsets;
				863
				864	uint32_t targetCapacity, length, count, sourceIndex;
				865	UChar c, trail;
				866
				867	if(pArgs->converter->mode<8) {
				868	_UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
				869	return;
				870	}
				871
				872	cnv=pArgs->converter;
				873	source=(const uint8_t *)pArgs->source;
				874	length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
				875	if(length<=0 && cnv->toUnicodeStatus==0) {
				876	/* no input, nothing to do */
				877	return;
				878	}
				879
				880	target=pArgs->target;
				881	if(target >= pArgs->targetLimit) {
				882	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				883	return;
				884	}
				885
				886	targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
				887	offsets=pArgs->offsets;
				888	sourceIndex=0;
				889	c=0;
				890
				891	/* complete a partial UChar or pair from the last call */
				892	if(cnv->toUnicodeStatus!=0) {
				893	/*
				894	* special case: single byte from a previous buffer,
				895	* where the byte turned out not to belong to a trail surrogate
				896	* and the preceding, unmatched lead surrogate was put into toUBytes[]
				897	* for error handling
				898	*/
				899	cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
				900	cnv->toULength=1;
				901	cnv->toUnicodeStatus=0;
				902	}
				903	if((count=cnv->toULength)!=0) {
				904	uint8_t *p=cnv->toUBytes;
				905	do {
				906	p[count++]=*source++;
				907	++sourceIndex;
				908	--length;
				909	if(count==2) {
				910	c=((UChar)p[1]<<8)\|p[0];
				911	if(U16_IS_SINGLE(c)) {
				912	/* output the BMP code point */
				913	*target++=c;
				914	if(offsets!=NULL) {
				915	*offsets++=-1;
				916	}
				917	--targetCapacity;
				918	count=0;
				919	c=0;
				920	break;
				921	} else if(U16_IS_SURROGATE_LEAD(c)) {
				922	/* continue collecting bytes for the trail surrogate */
				923	c=0; /* avoid unnecessary surrogate handling below */
				924	} else {
				925	/* fall through to error handling for an unmatched trail surrogate */
				926	break;
				927	}
				928	} else if(count==4) {
				929	c=((UChar)p[1]<<8)\|p[0];
				930	trail=((UChar)p[3]<<8)\|p[2];
				931	if(U16_IS_TRAIL(trail)) {
				932	/* output the surrogate pair */
				933	*target++=c;
				934	if(targetCapacity>=2) {
				935	*target++=trail;
				936	if(offsets!=NULL) {
				937	*offsets++=-1;
				938	*offsets++=-1;
				939	}
				940	targetCapacity-=2;
				941	} else /* targetCapacity==1 */ {
				942	targetCapacity=0;
				943	cnv->UCharErrorBuffer[0]=trail;
				944	cnv->UCharErrorBufferLength=1;
				945	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				946	}
				947	count=0;
				948	c=0;
				949	break;
				950	} else {
				951	/* unmatched lead surrogate, handle here for consistent toUBytes[] */
				952	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				953
				954	/* back out reading the code unit after it */
				955	if(((const uint8_t *)pArgs->source-source)>=2) {
				956	source-=2;
				957	} else {
				958	/*
				959	* if the trail unit's first byte was in a previous buffer, then
				960	* we need to put it into a special place because toUBytes[] will be
				961	* used for the lead unit's bytes
				962	*/
				963	cnv->toUnicodeStatus=0x100\|p[2];
				964	--source;
				965	}
				966	cnv->toULength=2;
				967
				968	/* write back the updated pointers */
				969	pArgs->source=(const char *)source;
				970	pArgs->target=target;
				971	pArgs->offsets=offsets;
				972	return;
				973	}
				974	}
				975	} while(length>0);
				976	cnv->toULength=(int8_t)count;
				977	}
				978
				979	/* copy an even number of bytes for complete UChars */
				980	count=2*targetCapacity;
				981	if(count>length) {
				982	count=length&~1;
				983	}
				984	if(c==0 && count>0) {
				985	length-=count;
				986	count>>=1;
				987	targetCapacity-=count;
				988	if(offsets==NULL) {
				989	do {
				990	c=((UChar)source[1]<<8)\|source[0];
				991	source+=2;
				992	if(U16_IS_SINGLE(c)) {
				993	*target++=c;
				994	} else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
				995	U16_IS_TRAIL(trail=((UChar)source[1]<<8)\|source[0])
				996	) {
				997	source+=2;
				998	--count;
				999	*target++=c;
				1000	*target++=trail;
				1001	} else {
				1002	break;
				1003	}
				1004	} while(--count>0);
				1005	} else {
				1006	do {
				1007	c=((UChar)source[1]<<8)\|source[0];
				1008	source+=2;
				1009	if(U16_IS_SINGLE(c)) {
				1010	*target++=c;
				1011	*offsets++=sourceIndex;
				1012	sourceIndex+=2;
				1013	} else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
				1014	U16_IS_TRAIL(trail=((UChar)source[1]<<8)\|source[0])
				1015	) {
				1016	source+=2;
				1017	--count;
				1018	*target++=c;
				1019	*target++=trail;
				1020	*offsets++=sourceIndex;
				1021	*offsets++=sourceIndex;
				1022	sourceIndex+=4;
				1023	} else {
				1024	break;
				1025	}
				1026	} while(--count>0);
				1027	}
				1028
				1029	if(count==0) {
				1030	/* done with the loop for complete UChars */
				1031	c=0;
				1032	} else {
				1033	/* keep c for surrogate handling, trail will be set there */
				1034	length+=2*(count-1); /* one more byte pair was consumed than count decremented */
				1035	targetCapacity+=count;
				1036	}
				1037	}
				1038
				1039	if(c!=0) {
				1040	/*
				1041	* c is a surrogate, and
				1042	* - source or target too short
				1043	* - or the surrogate is unmatched
				1044	*/
				1045	cnv->toUBytes[0]=(uint8_t)c;
				1046	cnv->toUBytes[1]=(uint8_t)(c>>8);
				1047	cnv->toULength=2;
				1048
				1049	if(U16_IS_SURROGATE_LEAD(c)) {
				1050	if(length>=2) {
				1051	if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)\|source[0])) {
				1052	/* output the surrogate pair, will overflow (see conditions comment above) */
				1053	source+=2;
				1054	length-=2;
				1055	*target++=c;
				1056	if(offsets!=NULL) {
				1057	*offsets++=sourceIndex;
				1058	}
				1059	cnv->UCharErrorBuffer[0]=trail;
				1060	cnv->UCharErrorBufferLength=1;
				1061	cnv->toULength=0;
				1062	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1063	} else {
				1064	/* unmatched lead surrogate */
				1065	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1066	}
				1067	} else {
				1068	/* see if the trail surrogate is in the next buffer */
				1069	}
				1070	} else {
				1071	/* unmatched trail surrogate */
				1072	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1073	}
				1074	}
				1075
				1076	if(U_SUCCESS(*pErrorCode)) {
				1077	/* check for a remaining source byte */
				1078	if(length>0) {
				1079	if(targetCapacity==0) {
				1080	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1081	} else {
				1082	/* it must be length==1 because otherwise the above would have copied more */
				1083	cnv->toUBytes[cnv->toULength++]=*source++;
				1084	}
				1085	}
				1086	}
				1087
				1088	/* write back the updated pointers */
				1089	pArgs->source=(const char *)source;
				1090	pArgs->target=target;
				1091	pArgs->offsets=offsets;
				1092	}
				1093
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1094	static UChar32 U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1095	_UTF16LEGetNextUChar(UConverterToUnicodeArgs pArgs, UErrorCode err) {
				1096	const uint8_t s, sourceLimit;
				1097	UChar32 c;
				1098
				1099	if(pArgs->converter->mode<8) {
				1100	return UCNV_GET_NEXT_UCHAR_USE_TO_U;
				1101	}
				1102
				1103	s=(const uint8_t *)pArgs->source;
				1104	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
				1105
				1106	if(s>=sourceLimit) {
				1107	/* no input */
				1108	*err=U_INDEX_OUTOFBOUNDS_ERROR;
				1109	return 0xffff;
				1110	}
				1111
				1112	if(s+2>sourceLimit) {
				1113	/* only one byte: truncated UChar */
				1114	pArgs->converter->toUBytes[0]=*s++;
				1115	pArgs->converter->toULength=1;
				1116	pArgs->source=(const char *)s;
				1117	*err = U_TRUNCATED_CHAR_FOUND;
				1118	return 0xffff;
				1119	}
				1120
				1121	/* get one UChar */
				1122	c=((UChar32)s[1]<<8)\|*s;
				1123	s+=2;
				1124
				1125	/* check for a surrogate pair */
				1126	if(U_IS_SURROGATE(c)) {
				1127	if(U16_IS_SURROGATE_LEAD(c)) {
				1128	if(s+2<=sourceLimit) {
				1129	UChar trail;
				1130
				1131	/* get a second UChar and see if it is a trail surrogate */
				1132	trail=((UChar)s[1]<<8)\|*s;
				1133	if(U16_IS_TRAIL(trail)) {
				1134	c=U16_GET_SUPPLEMENTARY(c, trail);
				1135	s+=2;
				1136	} else {
				1137	/* unmatched lead surrogate */
				1138	c=-2;
				1139	}
				1140	} else {
				1141	/* too few (2 or 3) bytes for a surrogate pair: truncated code point */
				1142	uint8_t *bytes=pArgs->converter->toUBytes;
				1143	s-=2;
				1144	pArgs->converter->toULength=(int8_t)(sourceLimit-s);
				1145	do {
				1146	bytes++=s++;
				1147	} while(s<sourceLimit);
				1148
				1149	c=0xffff;
				1150	*err=U_TRUNCATED_CHAR_FOUND;
				1151	}
				1152	} else {
				1153	/* unmatched trail surrogate */
				1154	c=-2;
				1155	}
				1156
				1157	if(c<0) {
				1158	/* write the unmatched surrogate */
				1159	uint8_t *bytes=pArgs->converter->toUBytes;
				1160	pArgs->converter->toULength=2;
				1161	bytes=(s-2);
				1162	bytes[1]=*(s-1);
				1163
				1164	c=0xffff;
				1165	*err=U_ILLEGAL_CHAR_FOUND;
				1166	}
				1167	}
				1168
				1169	pArgs->source=(const char *)s;
				1170	return c;
				1171	}
				1172
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1173	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1174	_UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
				1175	if(choice<=UCNV_RESET_TO_UNICODE) {
				1176	/* reset toUnicode state */
				1177	if(UCNV_GET_VERSION(cnv)==0) {
				1178	cnv->mode=8; /* no BOM handling */
				1179	} else {
				1180	cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
				1181	}
				1182	}
				1183	if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
				1184	/* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
				1185	cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
				1186	}
				1187	}
				1188
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1189	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1190	_UTF16LEOpen(UConverter *cnv,
				1191	UConverterLoadArgs *pArgs,
				1192	UErrorCode *pErrorCode) {
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1193	(void)pArgs;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1194	if(UCNV_GET_VERSION(cnv)<=1) {
				1195	_UTF16LEReset(cnv, UCNV_RESET_BOTH);
				1196	} else {
				1197	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
				1198	}
				1199	}
				1200
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1201	static const char * U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1202	_UTF16LEGetName(const UConverter *cnv) {
				1203	if(UCNV_GET_VERSION(cnv)==0) {
				1204	return "UTF-16LE";
				1205	} else {
				1206	return "UTF-16LE,version=1";
				1207	}
				1208	}
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1209	U_CDECL_END
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1210
				1211	static const UConverterImpl _UTF16LEImpl={
				1212	UCNV_UTF16_LittleEndian,
				1213
				1214	NULL,
				1215	NULL,
				1216
				1217	_UTF16LEOpen,
				1218	NULL,
				1219	_UTF16LEReset,
				1220
				1221	_UTF16LEToUnicodeWithOffsets,
				1222	_UTF16LEToUnicodeWithOffsets,
				1223	_UTF16LEFromUnicodeWithOffsets,
				1224	_UTF16LEFromUnicodeWithOffsets,
				1225	_UTF16LEGetNextUChar,
				1226
				1227	NULL,
				1228	_UTF16LEGetName,
				1229	NULL,
				1230	NULL,
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	1231	ucnv_getNonSurrogateUnicodeSet,
				1232
				1233	NULL,
				1234	NULL
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1235	};
				1236
				1237
				1238	static const UConverterStaticData _UTF16LEStaticData={
				1239	sizeof(UConverterStaticData),
				1240	"UTF-16LE",
				1241	1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1242	{ 0xfd, 0xff, 0, 0 },2,false,false,
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1243	0,
				1244	0,
				1245	{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
				1246	};
				1247
				1248
Jungshik Shin	a05f412	2015-06-09 15:33:54 -0700	[diff] [blame]	1249	const UConverterSharedData _UTF16LEData=
				1250	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData, &_UTF16LEImpl);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1251
				1252	/* UTF-16 (Detect BOM) ------------------------------------------------------ */
				1253
				1254	/*
				1255	* Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
				1256	* accordingly.
				1257	* This is a simpler version of the UTF-32 converter, with
				1258	* fewer states for shorter BOMs.
				1259	*
				1260	* State values:
				1261	* 0 initial state
				1262	* 1 saw first byte
				1263	* 2..5 -
				1264	* 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
				1265	* 8 UTF-16BE mode
				1266	* 9 UTF-16LE mode
				1267	*
				1268	* During detection: state==number of initial bytes seen so far.
				1269	*
				1270	* On output, emit U+FEFF as the first code point.
				1271	*
				1272	* Variants:
				1273	* - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
				1274	* - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
				1275	* UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
				1276	*/
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1277	U_CDECL_BEGIN
				1278	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1279	_UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
				1280	if(choice<=UCNV_RESET_TO_UNICODE) {
				1281	/* reset toUnicode: state=0 */
				1282	cnv->mode=0;
				1283	}
				1284	if(choice!=UCNV_RESET_TO_UNICODE) {
				1285	/* reset fromUnicode: prepare to output the UTF-16PE BOM */
				1286	cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
				1287	}
				1288	}
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1289	U_CDECL_END
				1290	extern const UConverterSharedData _UTF16v2Data;
				1291	U_CDECL_BEGIN
				1292	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1293	_UTF16Open(UConverter *cnv,
				1294	UConverterLoadArgs *pArgs,
				1295	UErrorCode *pErrorCode) {
				1296	if(UCNV_GET_VERSION(cnv)<=2) {
				1297	if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
				1298	/*
				1299	* Switch implementation, and switch the staticData that's different
				1300	* and was copied into the UConverter.
				1301	* (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
				1302	* UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
				1303	*/
				1304	cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
				1305	uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
				1306	}
				1307	_UTF16Reset(cnv, UCNV_RESET_BOTH);
				1308	} else {
				1309	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
				1310	}
				1311	}
				1312
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1313	static const char * U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1314	_UTF16GetName(const UConverter *cnv) {
				1315	if(UCNV_GET_VERSION(cnv)==0) {
				1316	return "UTF-16";
				1317	} else if(UCNV_GET_VERSION(cnv)==1) {
				1318	return "UTF-16,version=1";
				1319	} else {
				1320	return "UTF-16,version=2";
				1321	}
				1322	}
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1323	U_CDECL_END
				1324	extern const UConverterSharedData _UTF16Data;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1325
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	1326	static inline bool IS_UTF16BE(const UConverter *cnv) {
				1327	return ((cnv)->sharedData == &_UTF16BEData);
				1328	}
				1329
				1330	static inline bool IS_UTF16LE(const UConverter *cnv) {
				1331	return ((cnv)->sharedData == &_UTF16LEData);
				1332	}
				1333
				1334	static inline bool IS_UTF16(const UConverter *cnv) {
				1335	return ((cnv)->sharedData==&_UTF16Data) \|\| ((cnv)->sharedData == &_UTF16v2Data);
				1336	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1337
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1338	U_CDECL_BEGIN
				1339	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1340	_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
				1341	UErrorCode *pErrorCode) {
				1342	UConverter *cnv=pArgs->converter;
				1343	const char *source=pArgs->source;
				1344	const char *sourceLimit=pArgs->sourceLimit;
				1345	int32_t *offsets=pArgs->offsets;
				1346
				1347	int32_t state, offsetDelta;
				1348	uint8_t b;
				1349
				1350	state=cnv->mode;
				1351
				1352	/*
				1353	* If we detect a BOM in this buffer, then we must add the BOM size to the
				1354	* offsets because the actual converter function will not see and count the BOM.
				1355	* offsetDelta will have the number of the BOM bytes that are in the current buffer.
				1356	*/
				1357	offsetDelta=0;
				1358
				1359	while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
				1360	switch(state) {
				1361	case 0:
				1362	cnv->toUBytes[0]=(uint8_t)*source++;
				1363	cnv->toULength=1;
				1364	state=1;
				1365	break;
				1366	case 1:
				1367	/*
				1368	* Only inside this switch case can the state variable
				1369	* temporarily take two additional values:
				1370	* 6: BOM error, continue with BE
				1371	* 7: BOM error, continue with LE
				1372	*/
				1373	b=*source;
				1374	if(cnv->toUBytes[0]==0xfe && b==0xff) {
				1375	if(IS_UTF16LE(cnv)) {
				1376	state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
				1377	} else {
				1378	state=8; /* detect UTF-16BE */
				1379	}
				1380	} else if(cnv->toUBytes[0]==0xff && b==0xfe) {
				1381	if(IS_UTF16BE(cnv)) {
				1382	state=6; /* illegal reverse BOM for Java "UnicodeBig" */
				1383	} else {
				1384	state=9; /* detect UTF-16LE */
				1385	}
				1386	} else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
				1387	state=6; /* illegal missing BOM for Java "Unicode" */
				1388	}
				1389	if(state>=8) {
				1390	/* BOM detected, consume it */
				1391	++source;
				1392	cnv->toULength=0;
				1393	offsetDelta=(int32_t)(source-pArgs->source);
				1394	} else if(state<6) {
				1395	/* ok: no BOM, and not a reverse BOM */
				1396	if(source!=pArgs->source) {
				1397	/* reset the source for a correct first offset */
				1398	source=pArgs->source;
				1399	cnv->toULength=0;
				1400	}
				1401	if(IS_UTF16LE(cnv)) {
				1402	/* Make Java "UnicodeLittle" default to LE. */
				1403	state=9;
				1404	} else {
				1405	/* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
				1406	state=8;
				1407	}
				1408	} else {
				1409	/*
				1410	* error: missing BOM, or reverse BOM
				1411	* UTF-16,version=1: Java-specific "Unicode" requires a BOM.
				1412	* UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
				1413	* UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
				1414	*/
				1415	/* report the non-BOM or reverse BOM as an illegal sequence */
				1416	cnv->toUBytes[1]=b;
				1417	cnv->toULength=2;
				1418	pArgs->source=source+1;
				1419	/* continue with conversion if the callback resets the error */
				1420	/*
				1421	* Make Java "Unicode" default to BE like standard UTF-16.
				1422	* Make Java "UnicodeBig" and "UnicodeLittle" default
				1423	* to their normal endiannesses.
				1424	*/
				1425	cnv->mode=state+2;
				1426	*pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
				1427	return;
				1428	}
				1429	/* convert the rest of the stream */
				1430	cnv->mode=state;
				1431	continue;
				1432	case 8:
				1433	/* call UTF-16BE */
				1434	pArgs->source=source;
				1435	_UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
				1436	source=pArgs->source;
				1437	break;
				1438	case 9:
				1439	/* call UTF-16LE */
				1440	pArgs->source=source;
				1441	_UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
				1442	source=pArgs->source;
				1443	break;
				1444	default:
				1445	break; /* does not occur */
				1446	}
				1447	}
				1448
				1449	/* add BOM size to offsets - see comment at offsetDelta declaration */
				1450	if(offsets!=NULL && offsetDelta!=0) {
				1451	int32_t *offsetsLimit=pArgs->offsets;
				1452	while(offsets<offsetsLimit) {
				1453	*offsets++ += offsetDelta;
				1454	}
				1455	}
				1456
				1457	pArgs->source=source;
				1458
				1459	if(source==sourceLimit && pArgs->flush) {
				1460	/* handle truncated input */
				1461	switch(state) {
				1462	case 0:
				1463	break; /* no input at all, nothing to do */
				1464	case 8:
				1465	_UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
				1466	break;
				1467	case 9:
				1468	_UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
				1469	break;
				1470	default:
				1471	/* 0<state<8: framework will report truncation, nothing to do here */
				1472	break;
				1473	}
				1474	}
				1475
				1476	cnv->mode=state;
				1477	}
				1478
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1479	static UChar32 U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1480	_UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
				1481	UErrorCode *pErrorCode) {
				1482	switch(pArgs->converter->mode) {
				1483	case 8:
				1484	return _UTF16BEGetNextUChar(pArgs, pErrorCode);
				1485	case 9:
				1486	return _UTF16LEGetNextUChar(pArgs, pErrorCode);
				1487	default:
				1488	return UCNV_GET_NEXT_UCHAR_USE_TO_U;
				1489	}
				1490	}
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1491	U_CDECL_END
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1492
				1493	static const UConverterImpl _UTF16Impl = {
				1494	UCNV_UTF16,
				1495
				1496	NULL,
				1497	NULL,
				1498
				1499	_UTF16Open,
				1500	NULL,
				1501	_UTF16Reset,
				1502
				1503	_UTF16ToUnicodeWithOffsets,
				1504	_UTF16ToUnicodeWithOffsets,
				1505	_UTF16PEFromUnicodeWithOffsets,
				1506	_UTF16PEFromUnicodeWithOffsets,
				1507	_UTF16GetNextUChar,
				1508
				1509	NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
				1510	_UTF16GetName,
				1511	NULL,
				1512	NULL,
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	1513	ucnv_getNonSurrogateUnicodeSet,
				1514
				1515	NULL,
				1516	NULL
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1517	};
				1518
				1519	static const UConverterStaticData _UTF16StaticData = {
				1520	sizeof(UConverterStaticData),
				1521	"UTF-16",
				1522	1204, /* CCSID for BOM sensitive UTF-16 */
				1523	UCNV_IBM, UCNV_UTF16, 2, 2,
				1524	#if U_IS_BIG_ENDIAN
				1525	{ 0xff, 0xfd, 0, 0 }, 2,
				1526	#else
				1527	{ 0xfd, 0xff, 0, 0 }, 2,
				1528	#endif
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1529	false, false,
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1530	0,
				1531	0,
				1532	{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
				1533	};
				1534
Jungshik Shin	a05f412	2015-06-09 15:33:54 -0700	[diff] [blame]	1535	const UConverterSharedData _UTF16Data =
				1536	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData, &_UTF16Impl);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1537
				1538	static const UConverterImpl _UTF16v2Impl = {
				1539	UCNV_UTF16,
				1540
				1541	NULL,
				1542	NULL,
				1543
				1544	_UTF16Open,
				1545	NULL,
				1546	_UTF16Reset,
				1547
				1548	_UTF16ToUnicodeWithOffsets,
				1549	_UTF16ToUnicodeWithOffsets,
				1550	_UTF16BEFromUnicodeWithOffsets,
				1551	_UTF16BEFromUnicodeWithOffsets,
				1552	_UTF16GetNextUChar,
				1553
				1554	NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
				1555	_UTF16GetName,
				1556	NULL,
				1557	NULL,
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	1558	ucnv_getNonSurrogateUnicodeSet,
				1559
				1560	NULL,
				1561	NULL
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1562	};
				1563
				1564	static const UConverterStaticData _UTF16v2StaticData = {
				1565	sizeof(UConverterStaticData),
				1566	"UTF-16,version=2",
				1567	1204, /* CCSID for BOM sensitive UTF-16 */
				1568	UCNV_IBM, UCNV_UTF16, 2, 2,
				1569	{ 0xff, 0xfd, 0, 0 }, 2,
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1570	false, false,
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1571	0,
				1572	0,
				1573	{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
				1574	};
				1575
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1576	const UConverterSharedData _UTF16v2Data =
Jungshik Shin	a05f412	2015-06-09 15:33:54 -0700	[diff] [blame]	1577	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData, &_UTF16v2Impl);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1578
				1579	#endif