Blame - source/common/ucnv_u8.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 951988ed9ca3d1665146aee054b43fd493cfb283 [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	3	/*
				4	**********************************************************************
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	5	* Copyright (C) 2002-2016, International Business Machines
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	6	* Corporation and others. All Rights Reserved.
				7	**********************************************************************
				8	* file name: ucnv_u8.c
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	9	* encoding: UTF-8
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	10	* tab size: 8 (not used)
				11	* indentation:4
				12	*
				13	* created on: 2002jul01
				14	* created by: Markus W. Scherer
				15	*
				16	* UTF-8 converter implementation. Used to be in ucnv_utf.c.
				17	*
				18	* Also, CESU-8 implementation, see UTR 26.
				19	* The CESU-8 converter uses all the same functions as the
				20	* UTF-8 converter, with a branch for converting supplementary code points.
				21	*/
				22
				23	#include "unicode/utypes.h"
				24
				25	#if !UCONFIG_NO_CONVERSION
				26
				27	#include "unicode/ucnv.h"
				28	#include "unicode/utf.h"
				29	#include "unicode/utf8.h"
				30	#include "unicode/utf16.h"
				31	#include "ucnv_bld.h"
				32	#include "ucnv_cnv.h"
				33	#include "cmemory.h"
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	34	#include "ustr_imp.h"
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	35
				36	/* Prototypes --------------------------------------------------------------- */
				37
				38	/* Keep these here to make finicky compilers happy */
				39
				40	U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
				41	UErrorCode *err);
				42	U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
				43	UErrorCode *err);
				44
				45
				46	/* UTF-8 -------------------------------------------------------------------- */
				47
/* Largest code point that fits in a single 16-bit code unit. */
#define MAXIMUM_UCS2 0x0000FFFF

/*
 * Indexed by the total byte length of a UTF-8 sequence (1..4; slot 0 unused).
 * The decoders accumulate ch = (ch << 6) + nextByte over the raw bytes, so the
 * marker bits of the lead byte (0xC0/0xE0/0xF0) and of every trail byte (0x80)
 * end up in the sum; subtracting the entry for that length removes them:
 *   2 bytes: (0xC0 << 6)  + 0x80
 *   3 bytes: (0xE0 << 12) + (0x80 << 6)  + 0x80
 *   4 bytes: (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80
 */
static const uint32_t offsetsFromUTF8[5] = {
    0,                      /* [0] unused */
    (uint32_t) 0x00000000,  /* [1] single byte: nothing to remove */
    (uint32_t) 0x00003080,  /* [2] */
    (uint32_t) 0x000E2080,  /* [3] */
    (uint32_t) 0x03C82080   /* [4] */
};
				54
Jungshik Shin (jungshik at google)	afd723b	2015-01-21 13:24:04 -0800	[diff] [blame]	55	static UBool hasCESU8Data(const UConverter *cnv)
				56	{
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	57	#if UCONFIG_ONLY_HTML_CONVERSION
Jungshik Shin (jungshik at google)	afd723b	2015-01-21 13:24:04 -0800	[diff] [blame]	58	return FALSE;
				59	#else
				60	return (UBool)(cnv->sharedData == &_CESU8Data);
				61	#endif
				62	}
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	63	U_CDECL_BEGIN
				64	static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	65	UErrorCode * err)
				66	{
				67	UConverter *cnv = args->converter;
				68	const unsigned char mySource = (unsigned char ) args->source;
				69	UChar *myTarget = args->target;
				70	const unsigned char sourceLimit = (unsigned char ) args->sourceLimit;
				71	const UChar *targetLimit = args->targetLimit;
				72	unsigned char *toUBytes = cnv->toUBytes;
Jungshik Shin (jungshik at google)	afd723b	2015-01-21 13:24:04 -0800	[diff] [blame]	73	UBool isCESU8 = hasCESU8Data(cnv);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	74	uint32_t ch, ch2 = 0;
				75	int32_t i, inBytes;
Jungshik Shin (jungshik at google)	afd723b	2015-01-21 13:24:04 -0800	[diff] [blame]	76
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	77	/* Restore size of current sequence */
				78	if (cnv->toUnicodeStatus && myTarget < targetLimit)
				79	{
				80	inBytes = cnv->mode; /* restore # of bytes to consume */
				81	i = cnv->toULength; /* restore # of bytes consumed */
				82	cnv->toULength = 0;
				83
				84	ch = cnv->toUnicodeStatus;/Stores the previously calculated ch from a previous call/
				85	cnv->toUnicodeStatus = 0;
				86	goto morebytes;
				87	}
				88
				89
				90	while (mySource < sourceLimit && myTarget < targetLimit)
				91	{
				92	ch = *(mySource++);
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	93	if (U8_IS_SINGLE(ch)) /* Simple case */
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	94	{
				95	*(myTarget++) = (UChar) ch;
				96	}
				97	else
				98	{
				99	/* store the first char */
				100	toUBytes[0] = (char)ch;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	101	inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	102	i = 1;
				103
				104	morebytes:
				105	while (i < inBytes)
				106	{
				107	if (mySource < sourceLimit)
				108	{
				109	toUBytes[i] = (char) (ch2 = *mySource);
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	110	if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
				111	!(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	112	{
				113	break; /* i < inBytes */
				114	}
				115	ch = (ch << 6) + ch2;
				116	++mySource;
				117	i++;
				118	}
				119	else
				120	{
				121	/* stores a partially calculated target*/
				122	cnv->toUnicodeStatus = ch;
				123	cnv->mode = inBytes;
				124	cnv->toULength = (int8_t) i;
				125	goto donefornow;
				126	}
				127	}
				128
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	129	// In CESU-8, only surrogates, not supplementary code points, are encoded directly.
				130	if (i == inBytes && (!isCESU8 \|\| i <= 3))
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	131	{
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	132	/* Remove the accumulated high bits */
				133	ch -= offsetsFromUTF8[inBytes];
				134
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	135	/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
				136	if (ch <= MAXIMUM_UCS2)
				137	{
				138	/* fits in 16 bits */
				139	*(myTarget++) = (UChar) ch;
				140	}
				141	else
				142	{
				143	/* write out the surrogates */
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	144	*(myTarget++) = U16_LEAD(ch);
				145	ch = U16_TRAIL(ch);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	146	if (myTarget < targetLimit)
				147	{
				148	*(myTarget++) = (UChar)ch;
				149	}
				150	else
				151	{
				152	/* Put in overflow buffer (not handled here) */
				153	cnv->UCharErrorBuffer[0] = (UChar) ch;
				154	cnv->UCharErrorBufferLength = 1;
				155	*err = U_BUFFER_OVERFLOW_ERROR;
				156	break;
				157	}
				158	}
				159	}
				160	else
				161	{
				162	cnv->toULength = (int8_t)i;
				163	*err = U_ILLEGAL_CHAR_FOUND;
				164	break;
				165	}
				166	}
				167	}
				168
				169	donefornow:
				170	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
				171	{
				172	/* End of target buffer */
				173	*err = U_BUFFER_OVERFLOW_ERROR;
				174	}
				175
				176	args->target = myTarget;
				177	args->source = (const char *) mySource;
				178	}
				179
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	180	static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	181	UErrorCode * err)
				182	{
				183	UConverter *cnv = args->converter;
				184	const unsigned char mySource = (unsigned char ) args->source;
				185	UChar *myTarget = args->target;
				186	int32_t *myOffsets = args->offsets;
				187	int32_t offsetNum = 0;
				188	const unsigned char sourceLimit = (unsigned char ) args->sourceLimit;
				189	const UChar *targetLimit = args->targetLimit;
				190	unsigned char *toUBytes = cnv->toUBytes;
Jungshik Shin (jungshik at google)	afd723b	2015-01-21 13:24:04 -0800	[diff] [blame]	191	UBool isCESU8 = hasCESU8Data(cnv);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	192	uint32_t ch, ch2 = 0;
				193	int32_t i, inBytes;
				194
				195	/* Restore size of current sequence */
				196	if (cnv->toUnicodeStatus && myTarget < targetLimit)
				197	{
				198	inBytes = cnv->mode; /* restore # of bytes to consume */
				199	i = cnv->toULength; /* restore # of bytes consumed */
				200	cnv->toULength = 0;
				201
				202	ch = cnv->toUnicodeStatus;/Stores the previously calculated ch from a previous call/
				203	cnv->toUnicodeStatus = 0;
				204	goto morebytes;
				205	}
				206
				207	while (mySource < sourceLimit && myTarget < targetLimit)
				208	{
				209	ch = *(mySource++);
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	210	if (U8_IS_SINGLE(ch)) /* Simple case */
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	211	{
				212	*(myTarget++) = (UChar) ch;
				213	*(myOffsets++) = offsetNum++;
				214	}
				215	else
				216	{
				217	toUBytes[0] = (char)ch;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	218	inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	219	i = 1;
				220
				221	morebytes:
				222	while (i < inBytes)
				223	{
				224	if (mySource < sourceLimit)
				225	{
				226	toUBytes[i] = (char) (ch2 = *mySource);
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	227	if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
				228	!(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	229	{
				230	break; /* i < inBytes */
				231	}
				232	ch = (ch << 6) + ch2;
				233	++mySource;
				234	i++;
				235	}
				236	else
				237	{
				238	cnv->toUnicodeStatus = ch;
				239	cnv->mode = inBytes;
				240	cnv->toULength = (int8_t)i;
				241	goto donefornow;
				242	}
				243	}
				244
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	245	// In CESU-8, only surrogates, not supplementary code points, are encoded directly.
				246	if (i == inBytes && (!isCESU8 \|\| i <= 3))
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	247	{
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	248	/* Remove the accumulated high bits */
				249	ch -= offsetsFromUTF8[inBytes];
				250
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	251	/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
				252	if (ch <= MAXIMUM_UCS2)
				253	{
				254	/* fits in 16 bits */
				255	*(myTarget++) = (UChar) ch;
				256	*(myOffsets++) = offsetNum;
				257	}
				258	else
				259	{
				260	/* write out the surrogates */
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	261	*(myTarget++) = U16_LEAD(ch);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	262	*(myOffsets++) = offsetNum;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	263	ch = U16_TRAIL(ch);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	264	if (myTarget < targetLimit)
				265	{
				266	*(myTarget++) = (UChar)ch;
				267	*(myOffsets++) = offsetNum;
				268	}
				269	else
				270	{
				271	cnv->UCharErrorBuffer[0] = (UChar) ch;
				272	cnv->UCharErrorBufferLength = 1;
				273	*err = U_BUFFER_OVERFLOW_ERROR;
				274	}
				275	}
				276	offsetNum += i;
				277	}
				278	else
				279	{
				280	cnv->toULength = (int8_t)i;
				281	*err = U_ILLEGAL_CHAR_FOUND;
				282	break;
				283	}
				284	}
				285	}
				286
				287	donefornow:
				288	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
				289	{ /* End of target buffer */
				290	*err = U_BUFFER_OVERFLOW_ERROR;
				291	}
				292
				293	args->target = myTarget;
				294	args->source = (const char *) mySource;
				295	args->offsets = myOffsets;
				296	}
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	297	U_CDECL_END
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	298
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	299	U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	300	UErrorCode * err)
				301	{
				302	UConverter *cnv = args->converter;
				303	const UChar *mySource = args->source;
				304	const UChar *sourceLimit = args->sourceLimit;
				305	uint8_t myTarget = (uint8_t ) args->target;
				306	const uint8_t targetLimit = (uint8_t ) args->targetLimit;
				307	uint8_t *tempPtr;
				308	UChar32 ch;
				309	uint8_t tempBuf[4];
				310	int32_t indexToWrite;
Jungshik Shin (jungshik at google)	afd723b	2015-01-21 13:24:04 -0800	[diff] [blame]	311	UBool isNotCESU8 = !hasCESU8Data(cnv);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	312
				313	if (cnv->fromUChar32 && myTarget < targetLimit)
				314	{
				315	ch = cnv->fromUChar32;
				316	cnv->fromUChar32 = 0;
				317	goto lowsurrogate;
				318	}
				319
				320	while (mySource < sourceLimit && myTarget < targetLimit)
				321	{
				322	ch = *(mySource++);
				323
				324	if (ch < 0x80) /* Single byte */
				325	{
				326	*(myTarget++) = (uint8_t) ch;
				327	}
				328	else if (ch < 0x800) /* Double byte */
				329	{
				330	*(myTarget++) = (uint8_t) ((ch >> 6) \| 0xc0);
				331	if (myTarget < targetLimit)
				332	{
				333	*(myTarget++) = (uint8_t) ((ch & 0x3f) \| 0x80);
				334	}
				335	else
				336	{
				337	cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) \| 0x80);
				338	cnv->charErrorBufferLength = 1;
				339	*err = U_BUFFER_OVERFLOW_ERROR;
				340	}
				341	}
				342	else {
				343	/* Check for surrogates */
				344	if(U16_IS_SURROGATE(ch) && isNotCESU8) {
				345	lowsurrogate:
				346	if (mySource < sourceLimit) {
				347	/* test both code units */
				348	if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
				349	/* convert and consume this supplementary code point */
				350	ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
				351	++mySource;
				352	/* exit this condition tree */
				353	}
				354	else {
				355	/* this is an unpaired trail or lead code unit */
				356	/* callback(illegal) */
				357	cnv->fromUChar32 = ch;
				358	*err = U_ILLEGAL_CHAR_FOUND;
				359	break;
				360	}
				361	}
				362	else {
				363	/* no more input */
				364	cnv->fromUChar32 = ch;
				365	break;
				366	}
				367	}
				368
				369	/* Do we write the buffer directly for speed,
				370	or do we have to be careful about target buffer space? */
				371	tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
				372
				373	if (ch <= MAXIMUM_UCS2) {
				374	indexToWrite = 2;
				375	tempPtr[0] = (uint8_t) ((ch >> 12) \| 0xe0);
				376	}
				377	else {
				378	indexToWrite = 3;
				379	tempPtr[0] = (uint8_t) ((ch >> 18) \| 0xf0);
				380	tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) \| 0x80);
				381	}
				382	tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) \| 0x80);
				383	tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) \| 0x80);
				384
				385	if (tempPtr == myTarget) {
				386	/* There was enough space to write the codepoint directly. */
				387	myTarget += (indexToWrite + 1);
				388	}
				389	else {
				390	/* We might run out of room soon. Write it slowly. */
				391	for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
				392	if (myTarget < targetLimit) {
				393	(myTarget++) = tempPtr;
				394	}
				395	else {
				396	cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
				397	*err = U_BUFFER_OVERFLOW_ERROR;
				398	}
				399	}
				400	}
				401	}
				402	}
				403
				404	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
				405	{
				406	*err = U_BUFFER_OVERFLOW_ERROR;
				407	}
				408
				409	args->target = (char *) myTarget;
				410	args->source = mySource;
				411	}
				412
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	413	U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	414	UErrorCode * err)
				415	{
				416	UConverter *cnv = args->converter;
				417	const UChar *mySource = args->source;
				418	int32_t *myOffsets = args->offsets;
				419	const UChar *sourceLimit = args->sourceLimit;
				420	uint8_t myTarget = (uint8_t ) args->target;
				421	const uint8_t targetLimit = (uint8_t ) args->targetLimit;
				422	uint8_t *tempPtr;
				423	UChar32 ch;
				424	int32_t offsetNum, nextSourceIndex;
				425	int32_t indexToWrite;
				426	uint8_t tempBuf[4];
Jungshik Shin (jungshik at google)	afd723b	2015-01-21 13:24:04 -0800	[diff] [blame]	427	UBool isNotCESU8 = !hasCESU8Data(cnv);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	428
				429	if (cnv->fromUChar32 && myTarget < targetLimit)
				430	{
				431	ch = cnv->fromUChar32;
				432	cnv->fromUChar32 = 0;
				433	offsetNum = -1;
				434	nextSourceIndex = 0;
				435	goto lowsurrogate;
				436	} else {
				437	offsetNum = 0;
				438	}
				439
				440	while (mySource < sourceLimit && myTarget < targetLimit)
				441	{
				442	ch = *(mySource++);
				443
				444	if (ch < 0x80) /* Single byte */
				445	{
				446	*(myOffsets++) = offsetNum++;
				447	*(myTarget++) = (char) ch;
				448	}
				449	else if (ch < 0x800) /* Double byte */
				450	{
				451	*(myOffsets++) = offsetNum;
				452	*(myTarget++) = (uint8_t) ((ch >> 6) \| 0xc0);
				453	if (myTarget < targetLimit)
				454	{
				455	*(myOffsets++) = offsetNum++;
				456	*(myTarget++) = (uint8_t) ((ch & 0x3f) \| 0x80);
				457	}
				458	else
				459	{
				460	cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) \| 0x80);
				461	cnv->charErrorBufferLength = 1;
				462	*err = U_BUFFER_OVERFLOW_ERROR;
				463	}
				464	}
				465	else
				466	/* Check for surrogates */
				467	{
				468	nextSourceIndex = offsetNum + 1;
				469
				470	if(U16_IS_SURROGATE(ch) && isNotCESU8) {
				471	lowsurrogate:
				472	if (mySource < sourceLimit) {
				473	/* test both code units */
				474	if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
				475	/* convert and consume this supplementary code point */
				476	ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
				477	++mySource;
				478	++nextSourceIndex;
				479	/* exit this condition tree */
				480	}
				481	else {
				482	/* this is an unpaired trail or lead code unit */
				483	/* callback(illegal) */
				484	cnv->fromUChar32 = ch;
				485	*err = U_ILLEGAL_CHAR_FOUND;
				486	break;
				487	}
				488	}
				489	else {
				490	/* no more input */
				491	cnv->fromUChar32 = ch;
				492	break;
				493	}
				494	}
				495
				496	/* Do we write the buffer directly for speed,
				497	or do we have to be careful about target buffer space? */
				498	tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
				499
				500	if (ch <= MAXIMUM_UCS2) {
				501	indexToWrite = 2;
				502	tempPtr[0] = (uint8_t) ((ch >> 12) \| 0xe0);
				503	}
				504	else {
				505	indexToWrite = 3;
				506	tempPtr[0] = (uint8_t) ((ch >> 18) \| 0xf0);
				507	tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) \| 0x80);
				508	}
				509	tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) \| 0x80);
				510	tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) \| 0x80);
				511
				512	if (tempPtr == myTarget) {
				513	/* There was enough space to write the codepoint directly. */
				514	myTarget += (indexToWrite + 1);
				515	myOffsets[0] = offsetNum;
				516	myOffsets[1] = offsetNum;
				517	myOffsets[2] = offsetNum;
				518	if (indexToWrite >= 3) {
				519	myOffsets[3] = offsetNum;
				520	}
				521	myOffsets += (indexToWrite + 1);
				522	}
				523	else {
				524	/* We might run out of room soon. Write it slowly. */
				525	for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
				526	if (myTarget < targetLimit)
				527	{
				528	*(myOffsets++) = offsetNum;
				529	(myTarget++) = tempPtr;
				530	}
				531	else
				532	{
				533	cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
				534	*err = U_BUFFER_OVERFLOW_ERROR;
				535	}
				536	}
				537	}
				538	offsetNum = nextSourceIndex;
				539	}
				540	}
				541
				542	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
				543	{
				544	*err = U_BUFFER_OVERFLOW_ERROR;
				545	}
				546
				547	args->target = (char *) myTarget;
				548	args->source = mySource;
				549	args->offsets = myOffsets;
				550	}
				551
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	552	U_CDECL_BEGIN
				553	static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	554	UErrorCode *err) {
				555	UConverter *cnv;
				556	const uint8_t *sourceInitial;
				557	const uint8_t *source;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	558	uint8_t myByte;
				559	UChar32 ch;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	560	int8_t i;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	561
				562	/* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
				563
				564	cnv = args->converter;
				565	sourceInitial = source = (const uint8_t *)args->source;
				566	if (source >= (const uint8_t *)args->sourceLimit)
				567	{
				568	/* no input */
				569	*err = U_INDEX_OUTOFBOUNDS_ERROR;
				570	return 0xffff;
				571	}
				572
				573	myByte = (uint8_t)*(source++);
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	574	if (U8_IS_SINGLE(myByte))
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	575	{
				576	args->source = (const char *)source;
				577	return (UChar32)myByte;
				578	}
				579
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	580	uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
				581	if (countTrailBytes == 0) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	582	cnv->toUBytes[0] = myByte;
				583	cnv->toULength = 1;
				584	*err = U_ILLEGAL_CHAR_FOUND;
				585	args->source = (const char *)source;
				586	return 0xffff;
				587	}
				588
				589	/The byte sequence is longer than the buffer area passed/
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	590	if (((const char *)source + countTrailBytes) > args->sourceLimit)
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	591	{
				592	/* check if all of the remaining bytes are trail bytes */
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	593	uint16_t extraBytesToWrite = countTrailBytes + 1;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	594	cnv->toUBytes[0] = myByte;
				595	i = 1;
				596	*err = U_TRUNCATED_CHAR_FOUND;
				597	while(source < (const uint8_t *)args->sourceLimit) {
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	598	uint8_t b = *source;
				599	if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
				600	cnv->toUBytes[i++] = b;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	601	++source;
				602	} else {
				603	/* error even before we run out of input */
				604	*err = U_ILLEGAL_CHAR_FOUND;
				605	break;
				606	}
				607	}
				608	cnv->toULength = i;
				609	args->source = (const char *)source;
				610	return 0xffff;
				611	}
				612
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	613	ch = myByte << 6;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	614	if(countTrailBytes == 2) {
				615	uint8_t t1 = *source, t2;
				616	if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
				617	args->source = (const char *)(source + 1);
				618	return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	619	}
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	620	} else if(countTrailBytes == 1) {
				621	uint8_t t1 = *source;
				622	if(U8_IS_TRAIL(t1)) {
				623	args->source = (const char *)(source + 1);
				624	return (ch + t1) - offsetsFromUTF8[2];
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	625	}
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	626	} else { // countTrailBytes == 3
				627	uint8_t t1 = *source, t2, t3;
				628	if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
				629	U8_IS_TRAIL(t3 = *++source)) {
				630	args->source = (const char *)(source + 1);
				631	return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	632	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	633	}
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	634	args->source = (const char *)source;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	635
				636	for(i = 0; sourceInitial < source; ++i) {
				637	cnv->toUBytes[i] = *sourceInitial++;
				638	}
				639	cnv->toULength = i;
				640	*err = U_ILLEGAL_CHAR_FOUND;
				641	return 0xffff;
				642	}
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	643	U_CDECL_END
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	644
				645	/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
				646
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	647	U_CDECL_BEGIN
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	648	/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	649	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	650	ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
				651	UConverterToUnicodeArgs *pToUArgs,
				652	UErrorCode *pErrorCode) {
				653	UConverter *utf8;
				654	const uint8_t source, sourceLimit;
				655	uint8_t *target;
				656	int32_t targetCapacity;
				657	int32_t count;
				658
				659	int8_t oldToULength, toULength, toULimit;
				660
				661	UChar32 c;
				662	uint8_t b, t1, t2;
				663
				664	/* set up the local pointers */
				665	utf8=pToUArgs->converter;
				666	source=(uint8_t *)pToUArgs->source;
				667	sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
				668	target=(uint8_t *)pFromUArgs->target;
				669	targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
				670
				671	/* get the converter state from the UTF-8 UConverter */
				672	c=(UChar32)utf8->toUnicodeStatus;
				673	if(c!=0) {
				674	toULength=oldToULength=utf8->toULength;
				675	toULimit=(int8_t)utf8->mode;
				676	} else {
				677	toULength=oldToULength=toULimit=0;
				678	}
				679
				680	count=(int32_t)(sourceLimit-source)+oldToULength;
				681	if(count<toULimit) {
				682	/*
				683	* Not enough input to complete the partial character.
				684	* Jump to moreBytes below - it will not output to target.
				685	*/
				686	} else if(targetCapacity<toULimit) {
				687	/*
				688	* Not enough target capacity to output the partial character.
				689	* Let the standard converter handle this.
				690	*/
				691	*pErrorCode=U_USING_DEFAULT_WARNING;
				692	return;
				693	} else {
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	694	// Use a single counter for source and target, counting the minimum of
				695	// the source length and the target capacity.
				696	// Let the standard converter handle edge cases.
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	697	if(count>targetCapacity) {
				698	count=targetCapacity;
				699	}
				700
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	701	// The conversion loop checks count>0 only once per 1/2/3-byte character.
				702	// If the buffer ends with a truncated 2- or 3-byte sequence,
				703	// then we reduce the count to stop before that,
				704	// and collect the remaining bytes after the conversion loop.
				705	{
				706	// Do not go back into the bytes that will be read for finishing a partial
				707	// sequence from the previous buffer.
				708	int32_t length=count-toULimit;
				709	if(length>0) {
				710	uint8_t b1=*(sourceLimit-1);
				711	if(U8_IS_SINGLE(b1)) {
				712	// common ASCII character
				713	} else if(U8_IS_TRAIL(b1) && length>=2) {
				714	uint8_t b2=*(sourceLimit-2);
				715	if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
				716	// truncated 3-byte sequence
				717	count-=2;
				718	}
				719	} else if(0xc2<=b1 && b1<0xf0) {
				720	// truncated 2- or 3-byte sequence
				721	--count;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	722	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	723	}
				724	}
				725	}
				726
				727	if(c!=0) {
				728	utf8->toUnicodeStatus=0;
				729	utf8->toULength=0;
				730	goto moreBytes;
				731	/* See note in ucnv_SBCSFromUTF8() about this goto. */
				732	}
				733
				734	/* conversion loop */
				735	while(count>0) {
				736	b=*source++;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	737	if(U8_IS_SINGLE(b)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	738	/* convert ASCII */
				739	*target++=b;
				740	--count;
				741	continue;
				742	} else {
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	743	if(b>=0xe0) {
				744	if( /* handle U+0800..U+FFFF inline */
				745	b<0xf0 &&
				746	U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
				747	U8_IS_TRAIL(t2=source[1])
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	748	) {
				749	source+=2;
				750	*target++=b;
				751	*target++=t1;
				752	*target++=t2;
				753	count-=3;
				754	continue;
				755	}
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	756	} else {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	757	if( /* handle U+0080..U+07FF inline */
				758	b>=0xc2 &&
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	759	U8_IS_TRAIL(t1=*source)
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	760	) {
				761	++source;
				762	*target++=b;
				763	*target++=t1;
				764	count-=2;
				765	continue;
				766	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	767	}
				768
				769	/* handle "complicated" and error cases, and continuing partial characters */
				770	oldToULength=0;
				771	toULength=1;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	772	toULimit=U8_COUNT_BYTES_NON_ASCII(b);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	773	c=b;
				774	moreBytes:
				775	while(toULength<toULimit) {
				776	if(source<sourceLimit) {
				777	b=*source;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	778	if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	779	++source;
				780	++toULength;
				781	c=(c<<6)+b;
				782	} else {
				783	break; /* sequence too short, stop with toULength<toULimit */
				784	}
				785	} else {
				786	/* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
				787	source-=(toULength-oldToULength);
				788	while(oldToULength<toULength) {
				789	utf8->toUBytes[oldToULength++]=*source++;
				790	}
				791	utf8->toUnicodeStatus=c;
				792	utf8->toULength=toULength;
				793	utf8->mode=toULimit;
				794	pToUArgs->source=(char *)source;
				795	pFromUArgs->target=(char *)target;
				796	return;
				797	}
				798	}
				799
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	800	if(toULength!=toULimit) {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	801	/* error handling: illegal UTF-8 byte sequence */
				802	source-=(toULength-oldToULength);
				803	while(oldToULength<toULength) {
				804	utf8->toUBytes[oldToULength++]=*source++;
				805	}
				806	utf8->toULength=toULength;
				807	pToUArgs->source=(char *)source;
				808	pFromUArgs->target=(char *)target;
				809	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				810	return;
				811	}
				812
				813	/* copy the legal byte sequence to the target */
				814	{
				815	int8_t i;
				816
				817	for(i=0; i<oldToULength; ++i) {
				818	*target++=utf8->toUBytes[i];
				819	}
				820	source-=(toULength-oldToULength);
				821	for(; i<toULength; ++i) {
				822	target++=source++;
				823	}
				824	count-=toULength;
				825	}
				826	}
				827	}
				828
				829	if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
				830	if(target==(const uint8_t *)pFromUArgs->targetLimit) {
				831	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				832	} else {
				833	b=*source;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame]	834	toULimit=U8_COUNT_BYTES(b);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	835	if(toULimit>(sourceLimit-source)) {
				836	/* collect a truncated byte sequence */
				837	toULength=0;
				838	c=b;
				839	for(;;) {
				840	utf8->toUBytes[toULength++]=b;
				841	if(++source==sourceLimit) {
				842	/* partial byte sequence at end of source */
				843	utf8->toUnicodeStatus=c;
				844	utf8->toULength=toULength;
				845	utf8->mode=toULimit;
				846	break;
				847	} else if(!U8_IS_TRAIL(b=*source)) {
				848	/* lead byte in trail byte position */
				849	utf8->toULength=toULength;
				850	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				851	break;
				852	}
				853	c=(c<<6)+b;
				854	}
				855	} else {
				856	/* partial-sequence target overflow: fall back to the pivoting implementation */
				857	*pErrorCode=U_USING_DEFAULT_WARNING;
				858	}
				859	}
				860	}
				861
				862	/* write back the updated pointers */
				863	pToUArgs->source=(char *)source;
				864	pFromUArgs->target=(char *)target;
				865	}
				866
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	867	U_CDECL_END
				868
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	869	/* UTF-8 converter data ----------------------------------------------------- */
				870
				871	static const UConverterImpl _UTF8Impl={ /* Function table for the UTF-8 converter. Entries are positional: their order must match the UConverterImpl declaration, which is not visible in this file section. */
				872	UCNV_UTF8, /* same constant that _UTF8StaticData carries */
				873
				874	NULL, /* NOTE(review): the NULL slots here and below supply no function; presumably optional hooks -- confirm against the UConverterImpl declaration */
				875	NULL,
				876
				877	NULL,
				878	NULL,
				879	NULL,
				880
				881	ucnv_toUnicode_UTF8, /* these four conversion functions are also installed in _CESU8Impl */
				882	ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
				883	ucnv_fromUnicode_UTF8,
				884	ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
				885	ucnv_getNextUChar_UTF8, /* _CESU8Impl leaves this slot NULL */
				886
				887	NULL,
				888	NULL,
				889	NULL,
				890	NULL,
				891	ucnv_getNonSurrogateUnicodeSet, /* _CESU8Impl uses ucnv_getCompleteUnicodeSet in this slot */
				892
				893	ucnv_UTF8FromUTF8, /* the same UTF-8-to-UTF-8 function fills both of the last two slots; _CESU8Impl leaves both NULL */
				894	ucnv_UTF8FromUTF8
				895	};
				896
				897	/* CCSID 1208 refers to UTF-8 for any version of Unicode */
				898	static const UConverterStaticData _UTF8StaticData={ /* Constant descriptor for the UTF-8 converter; positional initializer, order must match the UConverterStaticData declaration (not visible here). */
				899	sizeof(UConverterStaticData), /* size of the structure itself */
				900	"UTF-8", /* converter name string */
				901	1208, UCNV_IBM, UCNV_UTF8, /* CCSID 1208, then two constants; UCNV_UTF8 is the same constant that heads _UTF8Impl */
				902	1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
				903	{ 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* EF BF BD is the UTF-8 encoding of U+FFFD and the following 3 matches its byte length; presumably the substitution sequence -- confirm against the struct declaration */
				904	0,
				905	0,
				906	{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
				907	};
				908
				909
Jungshik Shin	a05f412	2015-06-09 15:33:54 -0700	[diff] [blame]	910	const UConverterSharedData _UTF8Data= /* UTF-8 shared-data object; declared without `static`, unlike the two tables it points to */
				911	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl); /* macro builds the initializer from the static descriptor and the function table */
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	912
				913	/* CESU-8 converter data ---------------------------------------------------- */
				914
				915	static const UConverterImpl _CESU8Impl={ /* Function table for the CESU-8 converter. Entries are positional: their order must match the UConverterImpl declaration, which is not visible in this file section. */
				916	UCNV_CESU8, /* same constant that _CESU8StaticData carries */
				917
				918	NULL, /* NOTE(review): the NULL slots here and below supply no function; presumably optional hooks -- confirm against the UConverterImpl declaration */
				919	NULL,
				920
				921	NULL,
				922	NULL,
				923	NULL,
				924
				925	ucnv_toUnicode_UTF8, /* CESU-8 reuses the four UTF-8 conversion functions installed in _UTF8Impl (see the file header: same functions, with a branch for supplementary code points) */
				926	ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
				927	ucnv_fromUnicode_UTF8,
				928	ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
				929	NULL, /* _UTF8Impl installs ucnv_getNextUChar_UTF8 in this slot; CESU-8 supplies nothing */
				930
				931	NULL,
				932	NULL,
				933	NULL,
				934	NULL,
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	935	ucnv_getCompleteUnicodeSet, /* _UTF8Impl uses ucnv_getNonSurrogateUnicodeSet in this slot */
				936
				937	NULL, /* last two slots are NULL here; _UTF8Impl installs ucnv_UTF8FromUTF8 in both */
				938	NULL
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	939	};
				940
				941	static const UConverterStaticData _CESU8StaticData={ /* Constant descriptor for the CESU-8 converter; positional initializer, order must match the UConverterStaticData declaration (not visible here). */
				942	sizeof(UConverterStaticData), /* size of the structure itself */
				943	"CESU-8", /* converter name string */
				944	9400, /* CCSID for CESU-8 */
				945	UCNV_UNKNOWN, UCNV_CESU8, 1, 3, /* _UTF8StaticData has UCNV_IBM where this has UCNV_UNKNOWN; the 1, 3 pair is the same as in _UTF8StaticData */
				946	{ 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* EF BF BD is the UTF-8 encoding of U+FFFD and the following 3 matches its byte length; presumably the substitution sequence -- confirm against the struct declaration */
				947	0,
				948	0,
				949	{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
				950	};
				951
				952
Jungshik Shin	a05f412	2015-06-09 15:33:54 -0700	[diff] [blame]	953	const UConverterSharedData _CESU8Data= /* CESU-8 shared-data object; declared without `static`, unlike the two tables it points to */
				954	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl); /* macro builds the initializer from the static descriptor and the function table */
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	955
				956	#endif