Blame - source/common/ucnv_u8.c - chromium.googlesource.com/chromium/deps/icu

blob: 15dfbd0c2d12e0a067803e9265b52c1f33993018 [file] [log] [blame]

jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1	/*
				2	**********************************************************************
				3	* Copyright (C) 2002-2012, International Business Machines
				4	* Corporation and others. All Rights Reserved.
				5	**********************************************************************
				6	* file name: ucnv_u8.c
				7	* encoding: US-ASCII
				8	* tab size: 8 (not used)
				9	* indentation:4
				10	*
				11	* created on: 2002jul01
				12	* created by: Markus W. Scherer
				13	*
				14	* UTF-8 converter implementation. Used to be in ucnv_utf.c.
				15	*
				16	* Also, CESU-8 implementation, see UTR 26.
				17	* The CESU-8 converter uses all the same functions as the
				18	* UTF-8 converter, with a branch for converting supplementary code points.
				19	*/
				20
				21	#include "unicode/utypes.h"
				22
				23	#if !UCONFIG_NO_CONVERSION
				24
				25	#include "unicode/ucnv.h"
				26	#include "unicode/utf.h"
				27	#include "unicode/utf8.h"
				28	#include "unicode/utf16.h"
				29	#include "ucnv_bld.h"
				30	#include "ucnv_cnv.h"
				31	#include "cmemory.h"
				32
				33	/* Prototypes --------------------------------------------------------------- */
				34
				35	/* Keep these here to make finicky compilers happy */
				36
				37	U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
				38	UErrorCode *err);
				39	U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
				40	UErrorCode *err);
				41
				42
				43	/* UTF-8 -------------------------------------------------------------------- */
				44
				45	/* UTF-8 Conversion DATA
				46	* for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
				47	*/
				48	/static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;/
				49	#define MAXIMUM_UCS2 0x0000FFFF
				50	#define MAXIMUM_UTF 0x0010FFFF
				51	#define MAXIMUM_UCS4 0x7FFFFFFF
				52	#define HALF_SHIFT 10
				53	#define HALF_BASE 0x0010000
				54	#define HALF_MASK 0x3FF
				55	#define SURROGATE_HIGH_START 0xD800
				56	#define SURROGATE_HIGH_END 0xDBFF
				57	#define SURROGATE_LOW_START 0xDC00
				58	#define SURROGATE_LOW_END 0xDFFF
				59
				60	/* -SURROGATE_LOW_START + HALF_BASE */
				61	#define SURROGATE_LOW_BASE 9216
				62
				63	static const uint32_t offsetsFromUTF8[7] = {0,
				64	(uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
				65	(uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
				66	};
				67
				68	/* END OF UTF-8 Conversion DATA */
				69
				70	static const int8_t bytesFromUTF8[256] = {
				71	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				72	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				73	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				74	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				75	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				76	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				77	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				78	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				79	};
				80
				81	/*
				82	* Starting with Unicode 3.0.1:
				83	* UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
				84	* byte sequences with more than 4 bytes are illegal in UTF-8,
				85	* which is tested with impossible values for them
				86	*/
				87	static const uint32_t
				88	utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
				89
Jungshik Shin (jungshik at google)	afd723b	2015-01-21 13:24:04 -0800	[diff] [blame]	90	static UBool hasCESU8Data(const UConverter *cnv)
				91	{
				92	#if UCONFIG_NO_NON_HTML5_CONVERSION
				93	return FALSE;
				94	#else
				95	return (UBool)(cnv->sharedData == &_CESU8Data);
				96	#endif
				97	}
				98
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	99	static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
				100	UErrorCode * err)
				101	{
				102	UConverter *cnv = args->converter;
				103	const unsigned char mySource = (unsigned char ) args->source;
				104	UChar *myTarget = args->target;
				105	const unsigned char sourceLimit = (unsigned char ) args->sourceLimit;
				106	const UChar *targetLimit = args->targetLimit;
				107	unsigned char *toUBytes = cnv->toUBytes;
Jungshik Shin (jungshik at google)	afd723b	2015-01-21 13:24:04 -0800	[diff] [blame]	108	UBool isCESU8 = hasCESU8Data(cnv);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	109	uint32_t ch, ch2 = 0;
				110	int32_t i, inBytes;
Jungshik Shin (jungshik at google)	afd723b	2015-01-21 13:24:04 -0800	[diff] [blame]	111
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	112	/* Restore size of current sequence */
				113	if (cnv->toUnicodeStatus && myTarget < targetLimit)
				114	{
				115	inBytes = cnv->mode; /* restore # of bytes to consume */
				116	i = cnv->toULength; /* restore # of bytes consumed */
				117	cnv->toULength = 0;
				118
				119	ch = cnv->toUnicodeStatus;/Stores the previously calculated ch from a previous call/
				120	cnv->toUnicodeStatus = 0;
				121	goto morebytes;
				122	}
				123
				124
				125	while (mySource < sourceLimit && myTarget < targetLimit)
				126	{
				127	ch = *(mySource++);
				128	if (ch < 0x80) /* Simple case */
				129	{
				130	*(myTarget++) = (UChar) ch;
				131	}
				132	else
				133	{
				134	/* store the first char */
				135	toUBytes[0] = (char)ch;
				136	inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
				137	i = 1;
				138
				139	morebytes:
				140	while (i < inBytes)
				141	{
				142	if (mySource < sourceLimit)
				143	{
				144	toUBytes[i] = (char) (ch2 = *mySource);
				145	if (!U8_IS_TRAIL(ch2))
				146	{
				147	break; /* i < inBytes */
				148	}
				149	ch = (ch << 6) + ch2;
				150	++mySource;
				151	i++;
				152	}
				153	else
				154	{
				155	/* stores a partially calculated target*/
				156	cnv->toUnicodeStatus = ch;
				157	cnv->mode = inBytes;
				158	cnv->toULength = (int8_t) i;
				159	goto donefornow;
				160	}
				161	}
				162
				163	/* Remove the accumulated high bits */
				164	ch -= offsetsFromUTF8[inBytes];
				165
				166	/*
				167	* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
				168	* - use only trail bytes after a lead byte (checked above)
				169	* - use the right number of trail bytes for a given lead byte
				170	* - encode a code point <= U+10ffff
				171	* - use the fewest possible number of bytes for their code points
				172	* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
				173	*
				174	* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
				175	* There are no irregular sequences any more.
				176	* In CESU-8, only surrogates, not supplementary code points, are encoded directly.
				177	*/
				178	if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
				179	(isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
				180	{
				181	/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
				182	if (ch <= MAXIMUM_UCS2)
				183	{
				184	/* fits in 16 bits */
				185	*(myTarget++) = (UChar) ch;
				186	}
				187	else
				188	{
				189	/* write out the surrogates */
				190	ch -= HALF_BASE;
				191	*(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
				192	ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
				193	if (myTarget < targetLimit)
				194	{
				195	*(myTarget++) = (UChar)ch;
				196	}
				197	else
				198	{
				199	/* Put in overflow buffer (not handled here) */
				200	cnv->UCharErrorBuffer[0] = (UChar) ch;
				201	cnv->UCharErrorBufferLength = 1;
				202	*err = U_BUFFER_OVERFLOW_ERROR;
				203	break;
				204	}
				205	}
				206	}
				207	else
				208	{
				209	cnv->toULength = (int8_t)i;
				210	*err = U_ILLEGAL_CHAR_FOUND;
				211	break;
				212	}
				213	}
				214	}
				215
				216	donefornow:
				217	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
				218	{
				219	/* End of target buffer */
				220	*err = U_BUFFER_OVERFLOW_ERROR;
				221	}
				222
				223	args->target = myTarget;
				224	args->source = (const char *) mySource;
				225	}
				226
				227	static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
				228	UErrorCode * err)
				229	{
				230	UConverter *cnv = args->converter;
				231	const unsigned char mySource = (unsigned char ) args->source;
				232	UChar *myTarget = args->target;
				233	int32_t *myOffsets = args->offsets;
				234	int32_t offsetNum = 0;
				235	const unsigned char sourceLimit = (unsigned char ) args->sourceLimit;
				236	const UChar *targetLimit = args->targetLimit;
				237	unsigned char *toUBytes = cnv->toUBytes;
Jungshik Shin (jungshik at google)	afd723b	2015-01-21 13:24:04 -0800	[diff] [blame]	238	UBool isCESU8 = hasCESU8Data(cnv);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	239	uint32_t ch, ch2 = 0;
				240	int32_t i, inBytes;
				241
				242	/* Restore size of current sequence */
				243	if (cnv->toUnicodeStatus && myTarget < targetLimit)
				244	{
				245	inBytes = cnv->mode; /* restore # of bytes to consume */
				246	i = cnv->toULength; /* restore # of bytes consumed */
				247	cnv->toULength = 0;
				248
				249	ch = cnv->toUnicodeStatus;/Stores the previously calculated ch from a previous call/
				250	cnv->toUnicodeStatus = 0;
				251	goto morebytes;
				252	}
				253
				254	while (mySource < sourceLimit && myTarget < targetLimit)
				255	{
				256	ch = *(mySource++);
				257	if (ch < 0x80) /* Simple case */
				258	{
				259	*(myTarget++) = (UChar) ch;
				260	*(myOffsets++) = offsetNum++;
				261	}
				262	else
				263	{
				264	toUBytes[0] = (char)ch;
				265	inBytes = bytesFromUTF8[ch];
				266	i = 1;
				267
				268	morebytes:
				269	while (i < inBytes)
				270	{
				271	if (mySource < sourceLimit)
				272	{
				273	toUBytes[i] = (char) (ch2 = *mySource);
				274	if (!U8_IS_TRAIL(ch2))
				275	{
				276	break; /* i < inBytes */
				277	}
				278	ch = (ch << 6) + ch2;
				279	++mySource;
				280	i++;
				281	}
				282	else
				283	{
				284	cnv->toUnicodeStatus = ch;
				285	cnv->mode = inBytes;
				286	cnv->toULength = (int8_t)i;
				287	goto donefornow;
				288	}
				289	}
				290
				291	/* Remove the accumulated high bits */
				292	ch -= offsetsFromUTF8[inBytes];
				293
				294	/*
				295	* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
				296	* - use only trail bytes after a lead byte (checked above)
				297	* - use the right number of trail bytes for a given lead byte
				298	* - encode a code point <= U+10ffff
				299	* - use the fewest possible number of bytes for their code points
				300	* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
				301	*
				302	* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
				303	* There are no irregular sequences any more.
				304	* In CESU-8, only surrogates, not supplementary code points, are encoded directly.
				305	*/
				306	if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
				307	(isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
				308	{
				309	/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
				310	if (ch <= MAXIMUM_UCS2)
				311	{
				312	/* fits in 16 bits */
				313	*(myTarget++) = (UChar) ch;
				314	*(myOffsets++) = offsetNum;
				315	}
				316	else
				317	{
				318	/* write out the surrogates */
				319	ch -= HALF_BASE;
				320	*(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
				321	*(myOffsets++) = offsetNum;
				322	ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
				323	if (myTarget < targetLimit)
				324	{
				325	*(myTarget++) = (UChar)ch;
				326	*(myOffsets++) = offsetNum;
				327	}
				328	else
				329	{
				330	cnv->UCharErrorBuffer[0] = (UChar) ch;
				331	cnv->UCharErrorBufferLength = 1;
				332	*err = U_BUFFER_OVERFLOW_ERROR;
				333	}
				334	}
				335	offsetNum += i;
				336	}
				337	else
				338	{
				339	cnv->toULength = (int8_t)i;
				340	*err = U_ILLEGAL_CHAR_FOUND;
				341	break;
				342	}
				343	}
				344	}
				345
				346	donefornow:
				347	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
				348	{ /* End of target buffer */
				349	*err = U_BUFFER_OVERFLOW_ERROR;
				350	}
				351
				352	args->target = myTarget;
				353	args->source = (const char *) mySource;
				354	args->offsets = myOffsets;
				355	}
				356
				357	U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
				358	UErrorCode * err)
				359	{
				360	UConverter *cnv = args->converter;
				361	const UChar *mySource = args->source;
				362	const UChar *sourceLimit = args->sourceLimit;
				363	uint8_t myTarget = (uint8_t ) args->target;
				364	const uint8_t targetLimit = (uint8_t ) args->targetLimit;
				365	uint8_t *tempPtr;
				366	UChar32 ch;
				367	uint8_t tempBuf[4];
				368	int32_t indexToWrite;
Jungshik Shin (jungshik at google)	afd723b	2015-01-21 13:24:04 -0800	[diff] [blame]	369	UBool isNotCESU8 = !hasCESU8Data(cnv);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	370
				371	if (cnv->fromUChar32 && myTarget < targetLimit)
				372	{
				373	ch = cnv->fromUChar32;
				374	cnv->fromUChar32 = 0;
				375	goto lowsurrogate;
				376	}
				377
				378	while (mySource < sourceLimit && myTarget < targetLimit)
				379	{
				380	ch = *(mySource++);
				381
				382	if (ch < 0x80) /* Single byte */
				383	{
				384	*(myTarget++) = (uint8_t) ch;
				385	}
				386	else if (ch < 0x800) /* Double byte */
				387	{
				388	*(myTarget++) = (uint8_t) ((ch >> 6) \| 0xc0);
				389	if (myTarget < targetLimit)
				390	{
				391	*(myTarget++) = (uint8_t) ((ch & 0x3f) \| 0x80);
				392	}
				393	else
				394	{
				395	cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) \| 0x80);
				396	cnv->charErrorBufferLength = 1;
				397	*err = U_BUFFER_OVERFLOW_ERROR;
				398	}
				399	}
				400	else {
				401	/* Check for surrogates */
				402	if(U16_IS_SURROGATE(ch) && isNotCESU8) {
				403	lowsurrogate:
				404	if (mySource < sourceLimit) {
				405	/* test both code units */
				406	if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
				407	/* convert and consume this supplementary code point */
				408	ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
				409	++mySource;
				410	/* exit this condition tree */
				411	}
				412	else {
				413	/* this is an unpaired trail or lead code unit */
				414	/* callback(illegal) */
				415	cnv->fromUChar32 = ch;
				416	*err = U_ILLEGAL_CHAR_FOUND;
				417	break;
				418	}
				419	}
				420	else {
				421	/* no more input */
				422	cnv->fromUChar32 = ch;
				423	break;
				424	}
				425	}
				426
				427	/* Do we write the buffer directly for speed,
				428	or do we have to be careful about target buffer space? */
				429	tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
				430
				431	if (ch <= MAXIMUM_UCS2) {
				432	indexToWrite = 2;
				433	tempPtr[0] = (uint8_t) ((ch >> 12) \| 0xe0);
				434	}
				435	else {
				436	indexToWrite = 3;
				437	tempPtr[0] = (uint8_t) ((ch >> 18) \| 0xf0);
				438	tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) \| 0x80);
				439	}
				440	tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) \| 0x80);
				441	tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) \| 0x80);
				442
				443	if (tempPtr == myTarget) {
				444	/* There was enough space to write the codepoint directly. */
				445	myTarget += (indexToWrite + 1);
				446	}
				447	else {
				448	/* We might run out of room soon. Write it slowly. */
				449	for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
				450	if (myTarget < targetLimit) {
				451	(myTarget++) = tempPtr;
				452	}
				453	else {
				454	cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
				455	*err = U_BUFFER_OVERFLOW_ERROR;
				456	}
				457	}
				458	}
				459	}
				460	}
				461
				462	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
				463	{
				464	*err = U_BUFFER_OVERFLOW_ERROR;
				465	}
				466
				467	args->target = (char *) myTarget;
				468	args->source = mySource;
				469	}
				470
				471	U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
				472	UErrorCode * err)
				473	{
				474	UConverter *cnv = args->converter;
				475	const UChar *mySource = args->source;
				476	int32_t *myOffsets = args->offsets;
				477	const UChar *sourceLimit = args->sourceLimit;
				478	uint8_t myTarget = (uint8_t ) args->target;
				479	const uint8_t targetLimit = (uint8_t ) args->targetLimit;
				480	uint8_t *tempPtr;
				481	UChar32 ch;
				482	int32_t offsetNum, nextSourceIndex;
				483	int32_t indexToWrite;
				484	uint8_t tempBuf[4];
Jungshik Shin (jungshik at google)	afd723b	2015-01-21 13:24:04 -0800	[diff] [blame]	485	UBool isNotCESU8 = !hasCESU8Data(cnv);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	486
				487	if (cnv->fromUChar32 && myTarget < targetLimit)
				488	{
				489	ch = cnv->fromUChar32;
				490	cnv->fromUChar32 = 0;
				491	offsetNum = -1;
				492	nextSourceIndex = 0;
				493	goto lowsurrogate;
				494	} else {
				495	offsetNum = 0;
				496	}
				497
				498	while (mySource < sourceLimit && myTarget < targetLimit)
				499	{
				500	ch = *(mySource++);
				501
				502	if (ch < 0x80) /* Single byte */
				503	{
				504	*(myOffsets++) = offsetNum++;
				505	*(myTarget++) = (char) ch;
				506	}
				507	else if (ch < 0x800) /* Double byte */
				508	{
				509	*(myOffsets++) = offsetNum;
				510	*(myTarget++) = (uint8_t) ((ch >> 6) \| 0xc0);
				511	if (myTarget < targetLimit)
				512	{
				513	*(myOffsets++) = offsetNum++;
				514	*(myTarget++) = (uint8_t) ((ch & 0x3f) \| 0x80);
				515	}
				516	else
				517	{
				518	cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) \| 0x80);
				519	cnv->charErrorBufferLength = 1;
				520	*err = U_BUFFER_OVERFLOW_ERROR;
				521	}
				522	}
				523	else
				524	/* Check for surrogates */
				525	{
				526	nextSourceIndex = offsetNum + 1;
				527
				528	if(U16_IS_SURROGATE(ch) && isNotCESU8) {
				529	lowsurrogate:
				530	if (mySource < sourceLimit) {
				531	/* test both code units */
				532	if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
				533	/* convert and consume this supplementary code point */
				534	ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
				535	++mySource;
				536	++nextSourceIndex;
				537	/* exit this condition tree */
				538	}
				539	else {
				540	/* this is an unpaired trail or lead code unit */
				541	/* callback(illegal) */
				542	cnv->fromUChar32 = ch;
				543	*err = U_ILLEGAL_CHAR_FOUND;
				544	break;
				545	}
				546	}
				547	else {
				548	/* no more input */
				549	cnv->fromUChar32 = ch;
				550	break;
				551	}
				552	}
				553
				554	/* Do we write the buffer directly for speed,
				555	or do we have to be careful about target buffer space? */
				556	tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
				557
				558	if (ch <= MAXIMUM_UCS2) {
				559	indexToWrite = 2;
				560	tempPtr[0] = (uint8_t) ((ch >> 12) \| 0xe0);
				561	}
				562	else {
				563	indexToWrite = 3;
				564	tempPtr[0] = (uint8_t) ((ch >> 18) \| 0xf0);
				565	tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) \| 0x80);
				566	}
				567	tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) \| 0x80);
				568	tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) \| 0x80);
				569
				570	if (tempPtr == myTarget) {
				571	/* There was enough space to write the codepoint directly. */
				572	myTarget += (indexToWrite + 1);
				573	myOffsets[0] = offsetNum;
				574	myOffsets[1] = offsetNum;
				575	myOffsets[2] = offsetNum;
				576	if (indexToWrite >= 3) {
				577	myOffsets[3] = offsetNum;
				578	}
				579	myOffsets += (indexToWrite + 1);
				580	}
				581	else {
				582	/* We might run out of room soon. Write it slowly. */
				583	for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
				584	if (myTarget < targetLimit)
				585	{
				586	*(myOffsets++) = offsetNum;
				587	(myTarget++) = tempPtr;
				588	}
				589	else
				590	{
				591	cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
				592	*err = U_BUFFER_OVERFLOW_ERROR;
				593	}
				594	}
				595	}
				596	offsetNum = nextSourceIndex;
				597	}
				598	}
				599
				600	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
				601	{
				602	*err = U_BUFFER_OVERFLOW_ERROR;
				603	}
				604
				605	args->target = (char *) myTarget;
				606	args->source = mySource;
				607	args->offsets = myOffsets;
				608	}
				609
				610	static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
				611	UErrorCode *err) {
				612	UConverter *cnv;
				613	const uint8_t *sourceInitial;
				614	const uint8_t *source;
				615	uint16_t extraBytesToWrite;
				616	uint8_t myByte;
				617	UChar32 ch;
				618	int8_t i, isLegalSequence;
				619
				620	/* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
				621
				622	cnv = args->converter;
				623	sourceInitial = source = (const uint8_t *)args->source;
				624	if (source >= (const uint8_t *)args->sourceLimit)
				625	{
				626	/* no input */
				627	*err = U_INDEX_OUTOFBOUNDS_ERROR;
				628	return 0xffff;
				629	}
				630
				631	myByte = (uint8_t)*(source++);
				632	if (myByte < 0x80)
				633	{
				634	args->source = (const char *)source;
				635	return (UChar32)myByte;
				636	}
				637
				638	extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
				639	if (extraBytesToWrite == 0) {
				640	cnv->toUBytes[0] = myByte;
				641	cnv->toULength = 1;
				642	*err = U_ILLEGAL_CHAR_FOUND;
				643	args->source = (const char *)source;
				644	return 0xffff;
				645	}
				646
				647	/The byte sequence is longer than the buffer area passed/
				648	if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
				649	{
				650	/* check if all of the remaining bytes are trail bytes */
				651	cnv->toUBytes[0] = myByte;
				652	i = 1;
				653	*err = U_TRUNCATED_CHAR_FOUND;
				654	while(source < (const uint8_t *)args->sourceLimit) {
				655	if(U8_IS_TRAIL(myByte = *source)) {
				656	cnv->toUBytes[i++] = myByte;
				657	++source;
				658	} else {
				659	/* error even before we run out of input */
				660	*err = U_ILLEGAL_CHAR_FOUND;
				661	break;
				662	}
				663	}
				664	cnv->toULength = i;
				665	args->source = (const char *)source;
				666	return 0xffff;
				667	}
				668
				669	isLegalSequence = 1;
				670	ch = myByte << 6;
				671	switch(extraBytesToWrite)
				672	{
				673	/* note: code falls through cases! (sic)*/
				674	case 6:
				675	ch += (myByte = *source);
				676	ch <<= 6;
				677	if (!U8_IS_TRAIL(myByte))
				678	{
				679	isLegalSequence = 0;
				680	break;
				681	}
				682	++source;
				683	case 5: /fall through/
				684	ch += (myByte = *source);
				685	ch <<= 6;
				686	if (!U8_IS_TRAIL(myByte))
				687	{
				688	isLegalSequence = 0;
				689	break;
				690	}
				691	++source;
				692	case 4: /fall through/
				693	ch += (myByte = *source);
				694	ch <<= 6;
				695	if (!U8_IS_TRAIL(myByte))
				696	{
				697	isLegalSequence = 0;
				698	break;
				699	}
				700	++source;
				701	case 3: /fall through/
				702	ch += (myByte = *source);
				703	ch <<= 6;
				704	if (!U8_IS_TRAIL(myByte))
				705	{
				706	isLegalSequence = 0;
				707	break;
				708	}
				709	++source;
				710	case 2: /fall through/
				711	ch += (myByte = *source);
				712	if (!U8_IS_TRAIL(myByte))
				713	{
				714	isLegalSequence = 0;
				715	break;
				716	}
				717	++source;
				718	};
				719	ch -= offsetsFromUTF8[extraBytesToWrite];
				720	args->source = (const char *)source;
				721
				722	/*
				723	* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
				724	* - use only trail bytes after a lead byte (checked above)
				725	* - use the right number of trail bytes for a given lead byte
				726	* - encode a code point <= U+10ffff
				727	* - use the fewest possible number of bytes for their code points
				728	* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
				729	*
				730	* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
				731	* There are no irregular sequences any more.
				732	*/
				733	if (isLegalSequence &&
				734	(uint32_t)ch <= MAXIMUM_UTF &&
				735	(uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
				736	!U_IS_SURROGATE(ch)
				737	) {
				738	return ch; /* return the code point */
				739	}
				740
				741	for(i = 0; sourceInitial < source; ++i) {
				742	cnv->toUBytes[i] = *sourceInitial++;
				743	}
				744	cnv->toULength = i;
				745	*err = U_ILLEGAL_CHAR_FOUND;
				746	return 0xffff;
				747	}
				748
				749	/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
				750
				751	/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
				752	static const UChar32
				753	utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
				754
				755	/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
				756	static const UChar32
				757	utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
				758
				759	/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
				760	static void
				761	ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
				762	UConverterToUnicodeArgs *pToUArgs,
				763	UErrorCode *pErrorCode) {
				764	UConverter *utf8;
				765	const uint8_t source, sourceLimit;
				766	uint8_t *target;
				767	int32_t targetCapacity;
				768	int32_t count;
				769
				770	int8_t oldToULength, toULength, toULimit;
				771
				772	UChar32 c;
				773	uint8_t b, t1, t2;
				774
				775	/* set up the local pointers */
				776	utf8=pToUArgs->converter;
				777	source=(uint8_t *)pToUArgs->source;
				778	sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
				779	target=(uint8_t *)pFromUArgs->target;
				780	targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
				781
				782	/* get the converter state from the UTF-8 UConverter */
				783	c=(UChar32)utf8->toUnicodeStatus;
				784	if(c!=0) {
				785	toULength=oldToULength=utf8->toULength;
				786	toULimit=(int8_t)utf8->mode;
				787	} else {
				788	toULength=oldToULength=toULimit=0;
				789	}
				790
				791	count=(int32_t)(sourceLimit-source)+oldToULength;
				792	if(count<toULimit) {
				793	/*
				794	* Not enough input to complete the partial character.
				795	* Jump to moreBytes below - it will not output to target.
				796	*/
				797	} else if(targetCapacity<toULimit) {
				798	/*
				799	* Not enough target capacity to output the partial character.
				800	* Let the standard converter handle this.
				801	*/
				802	*pErrorCode=U_USING_DEFAULT_WARNING;
				803	return;
				804	} else {
				805	/*
				806	* Use a single counter for source and target, counting the minimum of
				807	* the source length and the target capacity.
				808	* As a result, the source length is checked only once per multi-byte
				809	* character instead of twice.
				810	*
				811	* Make sure that the last byte sequence is complete, or else
				812	* stop just before it.
				813	* (The longest legal byte sequence has 3 trail bytes.)
				814	* Count oldToULength (number of source bytes from a previous buffer)
				815	* into the source length but reduce the source index by toULimit
				816	* while going back over trail bytes in order to not go back into
				817	* the bytes that will be read for finishing a partial
				818	* sequence from the previous buffer.
				819	* Let the standard converter handle edge cases.
				820	*/
				821	int32_t i;
				822
				823	if(count>targetCapacity) {
				824	count=targetCapacity;
				825	}
				826
				827	i=0;
				828	while(i<3 && i<(count-toULimit)) {
				829	b=source[count-oldToULength-i-1];
				830	if(U8_IS_TRAIL(b)) {
				831	++i;
				832	} else {
				833	if(i<U8_COUNT_TRAIL_BYTES(b)) {
				834	/* stop converting before the lead byte if there are not enough trail bytes for it */
				835	count-=i+1;
				836	}
				837	break;
				838	}
				839	}
				840	}
				841
				842	if(c!=0) {
				843	utf8->toUnicodeStatus=0;
				844	utf8->toULength=0;
				845	goto moreBytes;
				846	/* See note in ucnv_SBCSFromUTF8() about this goto. */
				847	}
				848
				849	/* conversion loop */
				850	while(count>0) {
				851	b=*source++;
				852	if((int8_t)b>=0) {
				853	/* convert ASCII */
				854	*target++=b;
				855	--count;
				856	continue;
				857	} else {
				858	if(b>0xe0) {
				859	if( /* handle U+1000..U+D7FF inline */
				860	(t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) \|\|
				861	(b==0xed && (t1 <= 0x9f))) &&
				862	(t2=source[1]) >= 0x80 && t2 <= 0xbf
				863	) {
				864	source+=2;
				865	*target++=b;
				866	*target++=t1;
				867	*target++=t2;
				868	count-=3;
				869	continue;
				870	}
				871	} else if(b<0xe0) {
				872	if( /* handle U+0080..U+07FF inline */
				873	b>=0xc2 &&
				874	(t1=*source) >= 0x80 && t1 <= 0xbf
				875	) {
				876	++source;
				877	*target++=b;
				878	*target++=t1;
				879	count-=2;
				880	continue;
				881	}
				882	} else if(b==0xe0) {
				883	if( /* handle U+0800..U+0FFF inline */
				884	(t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
				885	(t2=source[1]) >= 0x80 && t2 <= 0xbf
				886	) {
				887	source+=2;
				888	*target++=b;
				889	*target++=t1;
				890	*target++=t2;
				891	count-=3;
				892	continue;
				893	}
				894	}
				895
				896	/* handle "complicated" and error cases, and continuing partial characters */
				897	oldToULength=0;
				898	toULength=1;
				899	toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
				900	c=b;
				901	moreBytes:
				902	while(toULength<toULimit) {
				903	if(source<sourceLimit) {
				904	b=*source;
				905	if(U8_IS_TRAIL(b)) {
				906	++source;
				907	++toULength;
				908	c=(c<<6)+b;
				909	} else {
				910	break; /* sequence too short, stop with toULength<toULimit */
				911	}
				912	} else {
				913	/* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
				914	source-=(toULength-oldToULength);
				915	while(oldToULength<toULength) {
				916	utf8->toUBytes[oldToULength++]=*source++;
				917	}
				918	utf8->toUnicodeStatus=c;
				919	utf8->toULength=toULength;
				920	utf8->mode=toULimit;
				921	pToUArgs->source=(char *)source;
				922	pFromUArgs->target=(char *)target;
				923	return;
				924	}
				925	}
				926
				927	if( toULength==toULimit && /* consumed all trail bytes */
				928	(toULength==3 \|\| toULength==2) && /* BMP */
				929	(c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
				930	(c<=0xd7ff \|\| 0xe000<=c) /* not a surrogate */
				931	) {
				932	/* legal byte sequence for BMP code point */
				933	} else if(
				934	toULength==toULimit && toULength==4 &&
				935	(0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
				936	) {
				937	/* legal byte sequence for supplementary code point */
				938	} else {
				939	/* error handling: illegal UTF-8 byte sequence */
				940	source-=(toULength-oldToULength);
				941	while(oldToULength<toULength) {
				942	utf8->toUBytes[oldToULength++]=*source++;
				943	}
				944	utf8->toULength=toULength;
				945	pToUArgs->source=(char *)source;
				946	pFromUArgs->target=(char *)target;
				947	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				948	return;
				949	}
				950
				951	/* copy the legal byte sequence to the target */
				952	{
				953	int8_t i;
				954
				955	for(i=0; i<oldToULength; ++i) {
				956	*target++=utf8->toUBytes[i];
				957	}
				958	source-=(toULength-oldToULength);
				959	for(; i<toULength; ++i) {
				960	target++=source++;
				961	}
				962	count-=toULength;
				963	}
				964	}
				965	}
				966
				967	if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
				968	if(target==(const uint8_t *)pFromUArgs->targetLimit) {
				969	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				970	} else {
				971	b=*source;
				972	toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
				973	if(toULimit>(sourceLimit-source)) {
				974	/* collect a truncated byte sequence */
				975	toULength=0;
				976	c=b;
				977	for(;;) {
				978	utf8->toUBytes[toULength++]=b;
				979	if(++source==sourceLimit) {
				980	/* partial byte sequence at end of source */
				981	utf8->toUnicodeStatus=c;
				982	utf8->toULength=toULength;
				983	utf8->mode=toULimit;
				984	break;
				985	} else if(!U8_IS_TRAIL(b=*source)) {
				986	/* lead byte in trail byte position */
				987	utf8->toULength=toULength;
				988	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				989	break;
				990	}
				991	c=(c<<6)+b;
				992	}
				993	} else {
				994	/* partial-sequence target overflow: fall back to the pivoting implementation */
				995	*pErrorCode=U_USING_DEFAULT_WARNING;
				996	}
				997	}
				998	}
				999
				1000	/* write back the updated pointers */
				1001	pToUArgs->source=(char *)source;
				1002	pFromUArgs->target=(char *)target;
				1003	}
				1004
				1005	/* UTF-8 converter data ----------------------------------------------------- */
				1006
				1007	static const UConverterImpl _UTF8Impl={
				1008	UCNV_UTF8,
				1009
				1010	NULL,
				1011	NULL,
				1012
				1013	NULL,
				1014	NULL,
				1015	NULL,
				1016
				1017	ucnv_toUnicode_UTF8,
				1018	ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
				1019	ucnv_fromUnicode_UTF8,
				1020	ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
				1021	ucnv_getNextUChar_UTF8,
				1022
				1023	NULL,
				1024	NULL,
				1025	NULL,
				1026	NULL,
				1027	ucnv_getNonSurrogateUnicodeSet,
				1028
				1029	ucnv_UTF8FromUTF8,
				1030	ucnv_UTF8FromUTF8
				1031	};
				1032
				1033	/* The 1208 CCSID refers to any version of Unicode of UTF-8 */
				1034	static const UConverterStaticData _UTF8StaticData={
				1035	sizeof(UConverterStaticData),
				1036	"UTF-8",
				1037	1208, UCNV_IBM, UCNV_UTF8,
				1038	1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
				1039	{ 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
				1040	0,
				1041	0,
				1042	{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
				1043	};
				1044
				1045
Jungshik Shin	a05f412	2015-06-09 15:33:54 -0700	[diff] [blame^]	1046	const UConverterSharedData _UTF8Data=
				1047	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1048
				1049	/* CESU-8 converter data ---------------------------------------------------- */
				1050
				1051	static const UConverterImpl _CESU8Impl={
				1052	UCNV_CESU8,
				1053
				1054	NULL,
				1055	NULL,
				1056
				1057	NULL,
				1058	NULL,
				1059	NULL,
				1060
				1061	ucnv_toUnicode_UTF8,
				1062	ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
				1063	ucnv_fromUnicode_UTF8,
				1064	ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
				1065	NULL,
				1066
				1067	NULL,
				1068	NULL,
				1069	NULL,
				1070	NULL,
				1071	ucnv_getCompleteUnicodeSet
				1072	};
				1073
				1074	static const UConverterStaticData _CESU8StaticData={
				1075	sizeof(UConverterStaticData),
				1076	"CESU-8",
				1077	9400, /* CCSID for CESU-8 */
				1078	UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
				1079	{ 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
				1080	0,
				1081	0,
				1082	{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
				1083	};
				1084
				1085
Jungshik Shin	a05f412	2015-06-09 15:33:54 -0700	[diff] [blame^]	1086	const UConverterSharedData _CESU8Data=
				1087	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1088
				1089	#endif