Blame - source/common/ucnvbocu.cpp - chromium.googlesource.com/chromium/deps/icu

blob: edb49d36a9ce53d1c2c827efa4c132aa6babb476 [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	3	/*
				4	******************************************************************************
				5	*
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	6	* Copyright (C) 2002-2016, International Business Machines
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	7	* Corporation and others. All Rights Reserved.
				8	*
				9	******************************************************************************
				10	* file name: ucnvbocu.cpp
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	11	* encoding: UTF-8
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	12	* tab size: 8 (not used)
				13	* indentation:4
				14	*
				15	* created on: 2002mar27
				16	* created by: Markus W. Scherer
				17	*
				18	* This is an implementation of the Binary Ordered Compression for Unicode,
				19	* in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
				20	*/
				21
				22	#include "unicode/utypes.h"
				23
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	24	#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	25
				26	#include "unicode/ucnv.h"
				27	#include "unicode/ucnv_cb.h"
				28	#include "unicode/utf16.h"
				29	#include "putilimp.h"
				30	#include "ucnv_bld.h"
				31	#include "ucnv_cnv.h"
				32	#include "uassert.h"
				33
				34	/* BOCU-1 constants and macros ---------------------------------------------- */
				35
				36	/*
				37	* BOCU-1 encodes the code points of a Unicode string as
				38	* a sequence of byte-encoded differences (slope detection),
				39	* preserving lexical order.
				40	*
				41	* Optimize the difference-taking for runs of Unicode text within
				42	* small scripts:
				43	*
				44	* Most small scripts are allocated within aligned 128-blocks of Unicode
				45	* code points. Lexical order is preserved if the "previous code point" state
				46	* is always moved into the middle of such a block.
				47	*
				48	* Additionally, "prev" is moved from anywhere in the Unihan and Hangul
				49	* areas into the middle of those areas.
				50	*
				51	* C0 control codes and space are encoded with their US-ASCII bytes.
				52	* "prev" is reset for C0 controls but not for space.
				53	*/
				54
				55	/* initial value for "prev": middle of the ASCII range */
				56	#define BOCU1_ASCII_PREV 0x40
				57
				58	/* bounding byte values for differences */
				59	#define BOCU1_MIN 0x21
				60	#define BOCU1_MIDDLE 0x90
				61	#define BOCU1_MAX_LEAD 0xfe
				62	#define BOCU1_MAX_TRAIL 0xff
				63	#define BOCU1_RESET 0xff
				64
				65	/* number of lead bytes */
				66	#define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1)
				67
				68	/* adjust trail byte counts for the use of some C0 control byte values */
				69	#define BOCU1_TRAIL_CONTROLS_COUNT 20
				70	#define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
				71
				72	/* number of trail bytes */
				73	#define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
				74
				75	/*
				76	* number of positive and negative single-byte codes
				77	* (counting 0==BOCU1_MIDDLE among the positive ones)
				78	*/
				79	#define BOCU1_SINGLE 64
				80
				81	/* number of lead bytes for positive and negative 2/3/4-byte sequences */
				82	#define BOCU1_LEAD_2 43
				83	#define BOCU1_LEAD_3 3
				84	#define BOCU1_LEAD_4 1
				85
				86	/* The difference value range for single-byters. */
				87	#define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1)
				88	#define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE)
				89
				90	/* The difference value range for double-byters. */
				91	#define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
				92	#define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
				93
				94	/* The difference value range for 3-byters. */
				95	#define BOCU1_REACH_POS_3 \
				96	(BOCU1_REACH_POS_2+BOCU1_LEAD_3BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT)
				97
				98	#define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT)
				99
				100	/* The lead byte start values. */
				101	#define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
				102	#define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2)
				103	#define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3)
				104	/* ==BOCU1_MAX_LEAD */
				105
				106	#define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
				107	#define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2)
				108	#define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3)
				109	/* ==BOCU1_MIN+1 */
				110
				111	/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
				112	#define BOCU1_LENGTH_FROM_LEAD(lead) \
				113	((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
				114	(BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
				115	(BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
				116
				117	/* The length of a byte sequence, according to its packed form. */
				118	#define BOCU1_LENGTH_FROM_PACKED(packed) \
				119	((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
				120
				121	/*
				122	* 12 commonly used C0 control codes (and space) are only used to encode
				123	* themselves directly,
				124	* which makes BOCU-1 MIME-usable and reasonably safe for
				125	* ASCII-oriented software.
				126	*
				127	* These controls are
				128	* 0 NUL
				129	*
				130	* 7 BEL
				131	* 8 BS
				132	*
				133	* 9 TAB
				134	* a LF
				135	* b VT
				136	* c FF
				137	* d CR
				138	*
				139	* e SO
				140	* f SI
				141	*
				142	* 1a SUB
				143	* 1b ESC
				144	*
				145	* The other 20 C0 controls are also encoded directly (to preserve order)
				146	* but are also used as trail bytes in difference encoding
				147	* (for better compression).
				148	*/
				149	#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
				150
				151	/*
				152	* Byte value map for control codes,
				153	* from external byte values 0x00..0x20
				154	* to trail byte values 0..19 (0..0x13) as used in the difference calculation.
				155	* External byte values that are illegal as trail bytes are mapped to -1.
				156	*/
				157	static const int8_t
				158	bocu1ByteToTrail[BOCU1_MIN]={
				159	/* 0 1 2 3 4 5 6 7 */
				160	-1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
				161
				162	/* 8 9 a b c d e f */
				163	-1, -1, -1, -1, -1, -1, -1, -1,
				164
				165	/* 10 11 12 13 14 15 16 17 */
				166	0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
				167
				168	/* 18 19 1a 1b 1c 1d 1e 1f */
				169	0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
				170
				171	/* 20 */
				172	-1
				173	};
				174
				175	/*
				176	* Byte value map for control codes,
				177	* from trail byte values 0..19 (0..0x13) as used in the difference calculation
				178	* to external byte values 0x00..0x20.
				179	*/
				180	static const int8_t
				181	bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
				182	/* 0 1 2 3 4 5 6 7 */
				183	0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
				184
				185	/* 8 9 a b c d e f */
				186	0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
				187
				188	/* 10 11 12 13 */
				189	0x1c, 0x1d, 0x1e, 0x1f
				190	};
				191
				192	/**
				193	* Integer division and modulo with negative numerators
				194	* yields negative modulo results and quotients that are one more than
				195	* what we need here.
				196	* This macro adjust the results so that the modulo-value m is always >=0.
				197	*
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	198	* For positive n, the if() condition is always false.
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	199	*
				200	* @param n Number to be split into quotient and rest.
				201	* Will be modified to contain the quotient.
				202	* @param d Divisor.
				203	* @param m Output variable for the rest (modulo result).
				204	*/
Frank Tang	b869661	2019-10-25 14:58:21 -0700	[diff] [blame]	205	#define NEGDIVMOD(n, d, m) UPRV_BLOCK_MACRO_BEGIN { \
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	206	(m)=(n)%(d); \
				207	(n)/=(d); \
				208	if((m)<0) { \
				209	--(n); \
				210	(m)+=(d); \
				211	} \
Frank Tang	b869661	2019-10-25 14:58:21 -0700	[diff] [blame]	212	} UPRV_BLOCK_MACRO_END
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	213
				214	/* Faster versions of packDiff() for single-byte-encoded diff values. */
				215
				216	/** Is a diff value encodable in a single byte? */
				217	#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
				218
				219	/** Encode a diff value in a single byte. */
				220	#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
				221
				222	/** Is a diff value encodable in two bytes? */
				223	#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
				224
				225	/* BOCU-1 implementation functions ------------------------------------------ */
				226
				227	#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
				228
				229	/**
				230	* Compute the next "previous" value for differencing
				231	* from the current code point.
				232	*
				233	* @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
				234	* @return "previous code point" state value
				235	*/
				236	static inline int32_t
				237	bocu1Prev(int32_t c) {
				238	/* compute new prev */
				239	if(/* 0x3040<=c && */ c<=0x309f) {
				240	/* Hiragana is not 128-aligned */
				241	return 0x3070;
				242	} else if(0x4e00<=c && c<=0x9fa5) {
				243	/* CJK Unihan */
				244	return 0x4e00-BOCU1_REACH_NEG_2;
				245	} else if(0xac00<=c /* && c<=0xd7a3 */) {
				246	/* Korean Hangul */
				247	return (0xd7a3+0xac00)/2;
				248	} else {
				249	/* mostly small scripts */
				250	return BOCU1_SIMPLE_PREV(c);
				251	}
				252	}
				253
				254	/** Fast version of bocu1Prev() for most scripts. */
				255	#define BOCU1_PREV(c) ((c)<0x3040 \|\| (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
				256
				257	/*
				258	* The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
				259	* The UConverter fields are used as follows:
				260	*
				261	* fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
				262	*
				263	* toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
				264	* mode decoder's incomplete (diff<<2)\|count (ignored when toULength==0)
				265	*/
				266
				267	/* BOCU-1-from-Unicode conversion functions --------------------------------- */
				268
				269	/**
				270	* Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
				271	* and return a packed integer with them.
				272	*
				273	* The encoding favors small absolute differences with short encodings
				274	* to compress runs of same-script characters.
				275	*
				276	* Optimized version with unrolled loops and fewer floating-point operations
				277	* than the standard packDiff().
				278	*
				279	* @param diff difference value -0x10ffff..0x10ffff
				280	* @return
				281	* 0x010000zz for 1-byte sequence zz
				282	* 0x0200yyzz for 2-byte sequence yy zz
				283	* 0x03xxyyzz for 3-byte sequence xx yy zz
				284	* 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
				285	*/
				286	static int32_t
				287	packDiff(int32_t diff) {
				288	int32_t result, m;
				289
				290	U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
				291	if(diff>=BOCU1_REACH_NEG_1) {
				292	/* mostly positive differences, and single-byte negative ones */
				293	#if 0 /* single-byte case handled in macros, see below */
				294	if(diff<=BOCU1_REACH_POS_1) {
				295	/* single byte */
				296	return 0x01000000\|(BOCU1_MIDDLE+diff);
				297	} else
				298	#endif
				299	if(diff<=BOCU1_REACH_POS_2) {
				300	/* two bytes */
				301	diff-=BOCU1_REACH_POS_1+1;
				302	result=0x02000000;
				303
				304	m=diff%BOCU1_TRAIL_COUNT;
				305	diff/=BOCU1_TRAIL_COUNT;
				306	result\|=BOCU1_TRAIL_TO_BYTE(m);
				307
				308	result\|=(BOCU1_START_POS_2+diff)<<8;
				309	} else if(diff<=BOCU1_REACH_POS_3) {
				310	/* three bytes */
				311	diff-=BOCU1_REACH_POS_2+1;
				312	result=0x03000000;
				313
				314	m=diff%BOCU1_TRAIL_COUNT;
				315	diff/=BOCU1_TRAIL_COUNT;
				316	result\|=BOCU1_TRAIL_TO_BYTE(m);
				317
				318	m=diff%BOCU1_TRAIL_COUNT;
				319	diff/=BOCU1_TRAIL_COUNT;
				320	result\|=BOCU1_TRAIL_TO_BYTE(m)<<8;
				321
				322	result\|=(BOCU1_START_POS_3+diff)<<16;
				323	} else {
				324	/* four bytes */
				325	diff-=BOCU1_REACH_POS_3+1;
				326
				327	m=diff%BOCU1_TRAIL_COUNT;
				328	diff/=BOCU1_TRAIL_COUNT;
				329	result=BOCU1_TRAIL_TO_BYTE(m);
				330
				331	m=diff%BOCU1_TRAIL_COUNT;
				332	diff/=BOCU1_TRAIL_COUNT;
				333	result\|=BOCU1_TRAIL_TO_BYTE(m)<<8;
				334
				335	/*
				336	* We know that / and % would deliver quotient 0 and rest=diff.
				337	* Avoid division and modulo for performance.
				338	*/
				339	result\|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
				340
				341	result\|=((uint32_t)BOCU1_START_POS_4)<<24;
				342	}
				343	} else {
				344	/* two- to four-byte negative differences */
				345	if(diff>=BOCU1_REACH_NEG_2) {
				346	/* two bytes */
				347	diff-=BOCU1_REACH_NEG_1;
				348	result=0x02000000;
				349
				350	NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
				351	result\|=BOCU1_TRAIL_TO_BYTE(m);
				352
				353	result\|=(BOCU1_START_NEG_2+diff)<<8;
				354	} else if(diff>=BOCU1_REACH_NEG_3) {
				355	/* three bytes */
				356	diff-=BOCU1_REACH_NEG_2;
				357	result=0x03000000;
				358
				359	NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
				360	result\|=BOCU1_TRAIL_TO_BYTE(m);
				361
				362	NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
				363	result\|=BOCU1_TRAIL_TO_BYTE(m)<<8;
				364
				365	result\|=(BOCU1_START_NEG_3+diff)<<16;
				366	} else {
				367	/* four bytes */
				368	diff-=BOCU1_REACH_NEG_3;
				369
				370	NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
				371	result=BOCU1_TRAIL_TO_BYTE(m);
				372
				373	NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
				374	result\|=BOCU1_TRAIL_TO_BYTE(m)<<8;
				375
				376	/*
				377	* We know that NEGDIVMOD would deliver
				378	* quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
				379	* Avoid division and modulo for performance.
				380	*/
				381	m=diff+BOCU1_TRAIL_COUNT;
				382	result\|=BOCU1_TRAIL_TO_BYTE(m)<<16;
				383
				384	result\|=BOCU1_MIN<<24;
				385	}
				386	}
				387	return result;
				388	}
				389
				390
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	391	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	392	_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
				393	UErrorCode *pErrorCode) {
				394	UConverter *cnv;
				395	const UChar source, sourceLimit;
				396	uint8_t *target;
				397	int32_t targetCapacity;
				398	int32_t *offsets;
				399
				400	int32_t prev, c, diff;
				401
				402	int32_t sourceIndex, nextSourceIndex;
				403
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	404	/* set up the local pointers */
				405	cnv=pArgs->converter;
				406	source=pArgs->source;
				407	sourceLimit=pArgs->sourceLimit;
				408	target=(uint8_t *)pArgs->target;
				409	targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
				410	offsets=pArgs->offsets;
				411
				412	/* get the converter state from UConverter */
				413	c=cnv->fromUChar32;
				414	prev=(int32_t)cnv->fromUnicodeStatus;
				415	if(prev==0) {
				416	prev=BOCU1_ASCII_PREV;
				417	}
				418
				419	/* sourceIndex=-1 if the current character began in the previous buffer */
				420	sourceIndex= c==0 ? 0 : -1;
				421	nextSourceIndex=0;
				422
				423	/* conversion loop */
				424	if(c!=0 && targetCapacity>0) {
				425	goto getTrail;
				426	}
				427
				428	fastSingle:
				429	/* fast loop for single-byte differences */
				430	/* use only one loop counter variable, targetCapacity, not also source */
				431	diff=(int32_t)(sourceLimit-source);
				432	if(targetCapacity>diff) {
				433	targetCapacity=diff;
				434	}
				435	while(targetCapacity>0 && (c=*source)<0x3000) {
				436	if(c<=0x20) {
				437	if(c!=0x20) {
				438	prev=BOCU1_ASCII_PREV;
				439	}
				440	*target++=(uint8_t)c;
				441	*offsets++=nextSourceIndex++;
				442	++source;
				443	--targetCapacity;
				444	} else {
				445	diff=c-prev;
				446	if(DIFF_IS_SINGLE(diff)) {
				447	prev=BOCU1_SIMPLE_PREV(c);
				448	*target++=(uint8_t)PACK_SINGLE_DIFF(diff);
				449	*offsets++=nextSourceIndex++;
				450	++source;
				451	--targetCapacity;
				452	} else {
				453	break;
				454	}
				455	}
				456	}
				457	/* restore real values */
				458	targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
				459	sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
				460
				461	/* regular loop for all cases */
				462	while(source<sourceLimit) {
				463	if(targetCapacity>0) {
				464	c=*source++;
				465	++nextSourceIndex;
				466
				467	if(c<=0x20) {
				468	/*
				469	* ISO C0 control & space:
				470	* Encode directly for MIME compatibility,
				471	* and reset state except for space, to not disrupt compression.
				472	*/
				473	if(c!=0x20) {
				474	prev=BOCU1_ASCII_PREV;
				475	}
				476	*target++=(uint8_t)c;
				477	*offsets++=sourceIndex;
				478	--targetCapacity;
				479
				480	sourceIndex=nextSourceIndex;
				481	continue;
				482	}
				483
				484	if(U16_IS_LEAD(c)) {
				485	getTrail:
				486	if(source<sourceLimit) {
				487	/* test the following code unit */
				488	UChar trail=*source;
				489	if(U16_IS_TRAIL(trail)) {
				490	++source;
				491	++nextSourceIndex;
				492	c=U16_GET_SUPPLEMENTARY(c, trail);
				493	}
				494	} else {
				495	/* no more input */
				496	c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
				497	break;
				498	}
				499	}
				500
				501	/*
				502	* all other Unicode code points c==U+0021..U+10ffff
				503	* are encoded with the difference c-prev
				504	*
				505	* a new prev is computed from c,
				506	* placed in the middle of a 0x80-block (for most small scripts) or
				507	* in the middle of the Unihan and Hangul blocks
				508	* to statistically minimize the following difference
				509	*/
				510	diff=c-prev;
				511	prev=BOCU1_PREV(c);
				512	if(DIFF_IS_SINGLE(diff)) {
				513	*target++=(uint8_t)PACK_SINGLE_DIFF(diff);
				514	*offsets++=sourceIndex;
				515	--targetCapacity;
				516	sourceIndex=nextSourceIndex;
				517	if(c<0x3000) {
				518	goto fastSingle;
				519	}
				520	} else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
				521	/* optimize 2-byte case */
				522	int32_t m;
				523
				524	if(diff>=0) {
				525	diff-=BOCU1_REACH_POS_1+1;
				526	m=diff%BOCU1_TRAIL_COUNT;
				527	diff/=BOCU1_TRAIL_COUNT;
				528	diff+=BOCU1_START_POS_2;
				529	} else {
				530	diff-=BOCU1_REACH_NEG_1;
				531	NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
				532	diff+=BOCU1_START_NEG_2;
				533	}
				534	*target++=(uint8_t)diff;
				535	*target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
				536	*offsets++=sourceIndex;
				537	*offsets++=sourceIndex;
				538	targetCapacity-=2;
				539	sourceIndex=nextSourceIndex;
				540	} else {
				541	int32_t length; /* will be 2..4 */
				542
				543	diff=packDiff(diff);
				544	length=BOCU1_LENGTH_FROM_PACKED(diff);
				545
				546	/* write the output character bytes from diff and length */
				547	/* from the first if in the loop we know that targetCapacity>0 */
				548	if(length<=targetCapacity) {
				549	switch(length) {
				550	/* each branch falls through to the next one */
				551	case 4:
				552	*target++=(uint8_t)(diff>>24);
				553	*offsets++=sourceIndex;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	554	U_FALLTHROUGH;
				555	case 3:
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	556	*target++=(uint8_t)(diff>>16);
				557	*offsets++=sourceIndex;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	558	U_FALLTHROUGH;
				559	case 2:
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	560	*target++=(uint8_t)(diff>>8);
				561	*offsets++=sourceIndex;
				562	/* case 1: handled above */
				563	*target++=(uint8_t)diff;
				564	*offsets++=sourceIndex;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	565	U_FALLTHROUGH;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	566	default:
				567	/* will never occur */
				568	break;
				569	}
				570	targetCapacity-=length;
				571	sourceIndex=nextSourceIndex;
				572	} else {
				573	uint8_t *charErrorBuffer;
				574
				575	/*
				576	* We actually do this backwards here:
				577	* In order to save an intermediate variable, we output
				578	* first to the overflow buffer what does not fit into the
				579	* regular target.
				580	*/
				581	/* we know that 1<=targetCapacity<length<=4 */
				582	length-=targetCapacity;
				583	charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
				584	switch(length) {
				585	/* each branch falls through to the next one */
				586	case 3:
				587	*charErrorBuffer++=(uint8_t)(diff>>16);
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	588	U_FALLTHROUGH;
				589	case 2:
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	590	*charErrorBuffer++=(uint8_t)(diff>>8);
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	591	U_FALLTHROUGH;
				592	case 1:
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	593	*charErrorBuffer=(uint8_t)diff;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	594	U_FALLTHROUGH;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	595	default:
				596	/* will never occur */
				597	break;
				598	}
				599	cnv->charErrorBufferLength=(int8_t)length;
				600
				601	/* now output what fits into the regular target */
				602	diff>>=8length; / length was reduced by targetCapacity */
				603	switch(targetCapacity) {
				604	/* each branch falls through to the next one */
				605	case 3:
				606	*target++=(uint8_t)(diff>>16);
				607	*offsets++=sourceIndex;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	608	U_FALLTHROUGH;
				609	case 2:
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	610	*target++=(uint8_t)(diff>>8);
				611	*offsets++=sourceIndex;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	612	U_FALLTHROUGH;
				613	case 1:
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	614	*target++=(uint8_t)diff;
				615	*offsets++=sourceIndex;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	616	U_FALLTHROUGH;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	617	default:
				618	/* will never occur */
				619	break;
				620	}
				621
				622	/* target overflow */
				623	targetCapacity=0;
				624	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				625	break;
				626	}
				627	}
				628	} else {
				629	/* target is full */
				630	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				631	break;
				632	}
				633	}
				634
				635	/* set the converter state back into UConverter */
				636	cnv->fromUChar32= c<0 ? -c : 0;
				637	cnv->fromUnicodeStatus=(uint32_t)prev;
				638
				639	/* write back the updated pointers */
				640	pArgs->source=source;
				641	pArgs->target=(char *)target;
				642	pArgs->offsets=offsets;
				643	}
				644
				645	/*
				646	* Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
				647	* If a change is made in the original function, then either
				648	* change this function the same way or
				649	* re-copy the original function and remove the variables
				650	* offsets, sourceIndex, and nextSourceIndex.
				651	*/
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	652	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	653	_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
				654	UErrorCode *pErrorCode) {
				655	UConverter *cnv;
				656	const UChar source, sourceLimit;
				657	uint8_t *target;
				658	int32_t targetCapacity;
				659
				660	int32_t prev, c, diff;
				661
				662	/* set up the local pointers */
				663	cnv=pArgs->converter;
				664	source=pArgs->source;
				665	sourceLimit=pArgs->sourceLimit;
				666	target=(uint8_t *)pArgs->target;
				667	targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
				668
				669	/* get the converter state from UConverter */
				670	c=cnv->fromUChar32;
				671	prev=(int32_t)cnv->fromUnicodeStatus;
				672	if(prev==0) {
				673	prev=BOCU1_ASCII_PREV;
				674	}
				675
				676	/* conversion loop */
				677	if(c!=0 && targetCapacity>0) {
				678	goto getTrail;
				679	}
				680
				681	fastSingle:
				682	/* fast loop for single-byte differences */
				683	/* use only one loop counter variable, targetCapacity, not also source */
				684	diff=(int32_t)(sourceLimit-source);
				685	if(targetCapacity>diff) {
				686	targetCapacity=diff;
				687	}
				688	while(targetCapacity>0 && (c=*source)<0x3000) {
				689	if(c<=0x20) {
				690	if(c!=0x20) {
				691	prev=BOCU1_ASCII_PREV;
				692	}
				693	*target++=(uint8_t)c;
				694	} else {
				695	diff=c-prev;
				696	if(DIFF_IS_SINGLE(diff)) {
				697	prev=BOCU1_SIMPLE_PREV(c);
				698	*target++=(uint8_t)PACK_SINGLE_DIFF(diff);
				699	} else {
				700	break;
				701	}
				702	}
				703	++source;
				704	--targetCapacity;
				705	}
				706	/* restore real values */
				707	targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
				708
				709	/* regular loop for all cases */
				710	while(source<sourceLimit) {
				711	if(targetCapacity>0) {
				712	c=*source++;
				713
				714	if(c<=0x20) {
				715	/*
				716	* ISO C0 control & space:
				717	* Encode directly for MIME compatibility,
				718	* and reset state except for space, to not disrupt compression.
				719	*/
				720	if(c!=0x20) {
				721	prev=BOCU1_ASCII_PREV;
				722	}
				723	*target++=(uint8_t)c;
				724	--targetCapacity;
				725	continue;
				726	}
				727
				728	if(U16_IS_LEAD(c)) {
				729	getTrail:
				730	if(source<sourceLimit) {
				731	/* test the following code unit */
				732	UChar trail=*source;
				733	if(U16_IS_TRAIL(trail)) {
				734	++source;
				735	c=U16_GET_SUPPLEMENTARY(c, trail);
				736	}
				737	} else {
				738	/* no more input */
				739	c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
				740	break;
				741	}
				742	}
				743
				744	/*
				745	* all other Unicode code points c==U+0021..U+10ffff
				746	* are encoded with the difference c-prev
				747	*
				748	* a new prev is computed from c,
				749	* placed in the middle of a 0x80-block (for most small scripts) or
				750	* in the middle of the Unihan and Hangul blocks
				751	* to statistically minimize the following difference
				752	*/
				753	diff=c-prev;
				754	prev=BOCU1_PREV(c);
				755	if(DIFF_IS_SINGLE(diff)) {
				756	*target++=(uint8_t)PACK_SINGLE_DIFF(diff);
				757	--targetCapacity;
				758	if(c<0x3000) {
				759	goto fastSingle;
				760	}
				761	} else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
				762	/* optimize 2-byte case */
				763	int32_t m;
				764
				765	if(diff>=0) {
				766	diff-=BOCU1_REACH_POS_1+1;
				767	m=diff%BOCU1_TRAIL_COUNT;
				768	diff/=BOCU1_TRAIL_COUNT;
				769	diff+=BOCU1_START_POS_2;
				770	} else {
				771	diff-=BOCU1_REACH_NEG_1;
				772	NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
				773	diff+=BOCU1_START_NEG_2;
				774	}
				775	*target++=(uint8_t)diff;
				776	*target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
				777	targetCapacity-=2;
				778	} else {
				779	int32_t length; /* will be 2..4 */
				780
				781	diff=packDiff(diff);
				782	length=BOCU1_LENGTH_FROM_PACKED(diff);
				783
				784	/* write the output character bytes from diff and length */
				785	/* from the first if in the loop we know that targetCapacity>0 */
				786	if(length<=targetCapacity) {
				787	switch(length) {
				788	/* each branch falls through to the next one */
				789	case 4:
				790	*target++=(uint8_t)(diff>>24);
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	791	U_FALLTHROUGH;
				792	case 3:
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	793	*target++=(uint8_t)(diff>>16);
				794	/* case 2: handled above */
				795	*target++=(uint8_t)(diff>>8);
				796	/* case 1: handled above */
				797	*target++=(uint8_t)diff;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	798	U_FALLTHROUGH;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	799	default:
				800	/* will never occur */
				801	break;
				802	}
				803	targetCapacity-=length;
				804	} else {
				805	uint8_t *charErrorBuffer;
				806
				807	/*
				808	* We actually do this backwards here:
				809	* In order to save an intermediate variable, we output
				810	* first to the overflow buffer what does not fit into the
				811	* regular target.
				812	*/
				813	/* we know that 1<=targetCapacity<length<=4 */
				814	length-=targetCapacity;
				815	charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
				816	switch(length) {
				817	/* each branch falls through to the next one */
				818	case 3:
				819	*charErrorBuffer++=(uint8_t)(diff>>16);
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	820	U_FALLTHROUGH;
				821	case 2:
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	822	*charErrorBuffer++=(uint8_t)(diff>>8);
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	823	U_FALLTHROUGH;
				824	case 1:
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	825	*charErrorBuffer=(uint8_t)diff;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	826	U_FALLTHROUGH;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	827	default:
				828	/* will never occur */
				829	break;
				830	}
				831	cnv->charErrorBufferLength=(int8_t)length;
				832
				833	/* now output what fits into the regular target */
				834	diff>>=8length; / length was reduced by targetCapacity */
				835	switch(targetCapacity) {
				836	/* each branch falls through to the next one */
				837	case 3:
				838	*target++=(uint8_t)(diff>>16);
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	839	U_FALLTHROUGH;
				840	case 2:
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	841	*target++=(uint8_t)(diff>>8);
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	842	U_FALLTHROUGH;
				843	case 1:
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	844	*target++=(uint8_t)diff;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	845	U_FALLTHROUGH;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	846	default:
				847	/* will never occur */
				848	break;
				849	}
				850
				851	/* target overflow */
				852	targetCapacity=0;
				853	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				854	break;
				855	}
				856	}
				857	} else {
				858	/* target is full */
				859	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				860	break;
				861	}
				862	}
				863
				864	/* set the converter state back into UConverter */
				865	cnv->fromUChar32= c<0 ? -c : 0;
				866	cnv->fromUnicodeStatus=(uint32_t)prev;
				867
				868	/* write back the updated pointers */
				869	pArgs->source=source;
				870	pArgs->target=(char *)target;
				871	}
				872
				873	/* BOCU-1-to-Unicode conversion functions ----------------------------------- */
				874
				875	/**
				876	* Function for BOCU-1 decoder; handles multi-byte lead bytes.
				877	*
				878	* @param b lead byte;
				879	* BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
				880	* @return (diff<<2)\|count
				881	*/
				882	static inline int32_t
				883	decodeBocu1LeadByte(int32_t b) {
				884	int32_t diff, count;
				885
				886	if(b>=BOCU1_START_NEG_2) {
				887	/* positive difference */
				888	if(b<BOCU1_START_POS_3) {
				889	/* two bytes */
				890	diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
				891	count=1;
				892	} else if(b<BOCU1_START_POS_4) {
				893	/* three bytes */
				894	diff=((int32_t)b-BOCU1_START_POS_3)BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
				895	count=2;
				896	} else {
				897	/* four bytes */
				898	diff=BOCU1_REACH_POS_3+1;
				899	count=3;
				900	}
				901	} else {
				902	/* negative difference */
				903	if(b>=BOCU1_START_NEG_3) {
				904	/* two bytes */
				905	diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
				906	count=1;
				907	} else if(b>BOCU1_MIN) {
				908	/* three bytes */
				909	diff=((int32_t)b-BOCU1_START_NEG_3)BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
				910	count=2;
				911	} else {
				912	/* four bytes */
				913	diff=-BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
				914	count=3;
				915	}
				916	}
				917
				918	/* return the state for decoding the trail byte(s) */
				919	return (diff<<2)\|count;
				920	}
				921
				922	/**
				923	* Function for BOCU-1 decoder; handles multi-byte trail bytes.
				924	*
				925	* @param count number of remaining trail bytes including this one
				926	* @param b trail byte
				927	* @return new delta for diff including b - <0 indicates an error
				928	*
				929	* @see decodeBocu1
				930	*/
				931	static inline int32_t
				932	decodeBocu1TrailByte(int32_t count, int32_t b) {
				933	if(b<=0x20) {
				934	/* skip some C0 controls and make the trail byte range contiguous */
				935	b=bocu1ByteToTrail[b];
				936	/* b<0 for an illegal trail byte value will result in return<0 below */
				937	#if BOCU1_MAX_TRAIL<0xff
				938	} else if(b>BOCU1_MAX_TRAIL) {
				939	return -99;
				940	#endif
				941	} else {
				942	b-=BOCU1_TRAIL_BYTE_OFFSET;
				943	}
				944
				945	/* add trail byte into difference and decrement count */
				946	if(count==1) {
				947	return b;
				948	} else if(count==2) {
				949	return b*BOCU1_TRAIL_COUNT;
				950	} else /* count==3 */ {
				951	return b(BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT);
				952	}
				953	}
				954
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	955	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	956	_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
				957	UErrorCode *pErrorCode) {
				958	UConverter *cnv;
				959	const uint8_t source, sourceLimit;
				960	UChar *target;
				961	const UChar *targetLimit;
				962	int32_t *offsets;
				963
				964	int32_t prev, count, diff, c;
				965
				966	int8_t byteIndex;
				967	uint8_t *bytes;
				968
				969	int32_t sourceIndex, nextSourceIndex;
				970
				971	/* set up the local pointers */
				972	cnv=pArgs->converter;
				973	source=(const uint8_t *)pArgs->source;
				974	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
				975	target=pArgs->target;
				976	targetLimit=pArgs->targetLimit;
				977	offsets=pArgs->offsets;
				978
				979	/* get the converter state from UConverter */
				980	prev=(int32_t)cnv->toUnicodeStatus;
				981	if(prev==0) {
				982	prev=BOCU1_ASCII_PREV;
				983	}
				984	diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
				985	count=diff&3;
				986	diff>>=2;
				987
				988	byteIndex=cnv->toULength;
				989	bytes=cnv->toUBytes;
				990
				991	/* sourceIndex=-1 if the current character began in the previous buffer */
				992	sourceIndex=byteIndex==0 ? 0 : -1;
				993	nextSourceIndex=0;
				994
				995	/* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
				996	if(count>0 && byteIndex>0 && target<targetLimit) {
				997	goto getTrail;
				998	}
				999
				1000	fastSingle:
				1001	/* fast loop for single-byte differences */
				1002	/* use count as the only loop counter variable */
				1003	diff=(int32_t)(sourceLimit-source);
				1004	count=(int32_t)(pArgs->targetLimit-target);
				1005	if(count>diff) {
				1006	count=diff;
				1007	}
				1008	while(count>0) {
				1009	if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
				1010	c=prev+(c-BOCU1_MIDDLE);
				1011	if(c<0x3000) {
				1012	*target++=(UChar)c;
				1013	*offsets++=nextSourceIndex++;
				1014	prev=BOCU1_SIMPLE_PREV(c);
				1015	} else {
				1016	break;
				1017	}
				1018	} else if(c<=0x20) {
				1019	if(c!=0x20) {
				1020	prev=BOCU1_ASCII_PREV;
				1021	}
				1022	*target++=(UChar)c;
				1023	*offsets++=nextSourceIndex++;
				1024	} else {
				1025	break;
				1026	}
				1027	++source;
				1028	--count;
				1029	}
				1030	sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
				1031
				1032	/* decode a sequence of single and lead bytes */
				1033	while(source<sourceLimit) {
				1034	if(target>=targetLimit) {
				1035	/* target is full */
				1036	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1037	break;
				1038	}
				1039
				1040	++nextSourceIndex;
				1041	c=*source++;
				1042	if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
				1043	/* Write a code point directly from a single-byte difference. */
				1044	c=prev+(c-BOCU1_MIDDLE);
				1045	if(c<0x3000) {
				1046	*target++=(UChar)c;
				1047	*offsets++=sourceIndex;
				1048	prev=BOCU1_SIMPLE_PREV(c);
				1049	sourceIndex=nextSourceIndex;
				1050	goto fastSingle;
				1051	}
				1052	} else if(c<=0x20) {
				1053	/*
				1054	* Direct-encoded C0 control code or space.
				1055	* Reset prev for C0 control codes but not for space.
				1056	*/
				1057	if(c!=0x20) {
				1058	prev=BOCU1_ASCII_PREV;
				1059	}
				1060	*target++=(UChar)c;
				1061	*offsets++=sourceIndex;
				1062	sourceIndex=nextSourceIndex;
				1063	continue;
				1064	} else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
				1065	/* Optimize two-byte case. */
				1066	if(c>=BOCU1_MIDDLE) {
				1067	diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
				1068	} else {
				1069	diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
				1070	}
				1071
				1072	/* trail byte */
				1073	++nextSourceIndex;
				1074	c=decodeBocu1TrailByte(1, *source++);
				1075	if(c<0 \|\| (uint32_t)(c=prev+diff+c)>0x10ffff) {
				1076	bytes[0]=source[-2];
				1077	bytes[1]=source[-1];
				1078	byteIndex=2;
				1079	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1080	break;
				1081	}
				1082	} else if(c==BOCU1_RESET) {
				1083	/* only reset the state, no code point */
				1084	prev=BOCU1_ASCII_PREV;
				1085	sourceIndex=nextSourceIndex;
				1086	continue;
				1087	} else {
				1088	/*
				1089	* For multi-byte difference lead bytes, set the decoder state
				1090	* with the partial difference value from the lead byte and
				1091	* with the number of trail bytes.
				1092	*/
				1093	bytes[0]=(uint8_t)c;
				1094	byteIndex=1;
				1095
				1096	diff=decodeBocu1LeadByte(c);
				1097	count=diff&3;
				1098	diff>>=2;
				1099	getTrail:
				1100	for(;;) {
				1101	if(source>=sourceLimit) {
				1102	goto endloop;
				1103	}
				1104	++nextSourceIndex;
				1105	c=bytes[byteIndex++]=*source++;
				1106
				1107	/* trail byte in any position */
				1108	c=decodeBocu1TrailByte(count, c);
				1109	if(c<0) {
				1110	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1111	goto endloop;
				1112	}
				1113
				1114	diff+=c;
				1115	if(--count==0) {
				1116	/* final trail byte, deliver a code point */
				1117	byteIndex=0;
				1118	c=prev+diff;
				1119	if((uint32_t)c>0x10ffff) {
				1120	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1121	goto endloop;
				1122	}
				1123	break;
				1124	}
				1125	}
				1126	}
				1127
				1128	/* calculate the next prev and output c */
				1129	prev=BOCU1_PREV(c);
				1130	if(c<=0xffff) {
				1131	*target++=(UChar)c;
				1132	*offsets++=sourceIndex;
				1133	} else {
				1134	/* output surrogate pair */
				1135	*target++=U16_LEAD(c);
				1136	if(target<targetLimit) {
				1137	*target++=U16_TRAIL(c);
				1138	*offsets++=sourceIndex;
				1139	*offsets++=sourceIndex;
				1140	} else {
				1141	/* target overflow */
				1142	*offsets++=sourceIndex;
				1143	cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
				1144	cnv->UCharErrorBufferLength=1;
				1145	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1146	break;
				1147	}
				1148	}
				1149	sourceIndex=nextSourceIndex;
				1150	}
				1151	endloop:
				1152
				1153	if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
				1154	/* set the converter state in UConverter to deal with the next character */
				1155	cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
				1156	cnv->mode=0;
				1157	} else {
				1158	/* set the converter state back into UConverter */
				1159	cnv->toUnicodeStatus=(uint32_t)prev;
				1160	cnv->mode=(diff<<2)\|count;
				1161	}
				1162	cnv->toULength=byteIndex;
				1163
				1164	/* write back the updated pointers */
				1165	pArgs->source=(const char *)source;
				1166	pArgs->target=target;
				1167	pArgs->offsets=offsets;
				1168	return;
				1169	}
				1170
				1171	/*
				1172	* Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
				1173	* If a change is made in the original function, then either
				1174	* change this function the same way or
				1175	* re-copy the original function and remove the variables
				1176	* offsets, sourceIndex, and nextSourceIndex.
				1177	*/
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	1178	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1179	_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
				1180	UErrorCode *pErrorCode) {
				1181	UConverter *cnv;
				1182	const uint8_t source, sourceLimit;
				1183	UChar *target;
				1184	const UChar *targetLimit;
				1185
				1186	int32_t prev, count, diff, c;
				1187
				1188	int8_t byteIndex;
				1189	uint8_t *bytes;
				1190
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1191	/* set up the local pointers */
				1192	cnv=pArgs->converter;
				1193	source=(const uint8_t *)pArgs->source;
				1194	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
				1195	target=pArgs->target;
				1196	targetLimit=pArgs->targetLimit;
				1197
				1198	/* get the converter state from UConverter */
				1199	prev=(int32_t)cnv->toUnicodeStatus;
				1200	if(prev==0) {
				1201	prev=BOCU1_ASCII_PREV;
				1202	}
				1203	diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
				1204	count=diff&3;
				1205	diff>>=2;
				1206
				1207	byteIndex=cnv->toULength;
				1208	bytes=cnv->toUBytes;
				1209
				1210	/* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
				1211	if(count>0 && byteIndex>0 && target<targetLimit) {
				1212	goto getTrail;
				1213	}
				1214
				1215	fastSingle:
				1216	/* fast loop for single-byte differences */
				1217	/* use count as the only loop counter variable */
				1218	diff=(int32_t)(sourceLimit-source);
				1219	count=(int32_t)(pArgs->targetLimit-target);
				1220	if(count>diff) {
				1221	count=diff;
				1222	}
				1223	while(count>0) {
				1224	if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
				1225	c=prev+(c-BOCU1_MIDDLE);
				1226	if(c<0x3000) {
				1227	*target++=(UChar)c;
				1228	prev=BOCU1_SIMPLE_PREV(c);
				1229	} else {
				1230	break;
				1231	}
				1232	} else if(c<=0x20) {
				1233	if(c!=0x20) {
				1234	prev=BOCU1_ASCII_PREV;
				1235	}
				1236	*target++=(UChar)c;
				1237	} else {
				1238	break;
				1239	}
				1240	++source;
				1241	--count;
				1242	}
				1243
				1244	/* decode a sequence of single and lead bytes */
				1245	while(source<sourceLimit) {
				1246	if(target>=targetLimit) {
				1247	/* target is full */
				1248	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1249	break;
				1250	}
				1251
				1252	c=*source++;
				1253	if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
				1254	/* Write a code point directly from a single-byte difference. */
				1255	c=prev+(c-BOCU1_MIDDLE);
				1256	if(c<0x3000) {
				1257	*target++=(UChar)c;
				1258	prev=BOCU1_SIMPLE_PREV(c);
				1259	goto fastSingle;
				1260	}
				1261	} else if(c<=0x20) {
				1262	/*
				1263	* Direct-encoded C0 control code or space.
				1264	* Reset prev for C0 control codes but not for space.
				1265	*/
				1266	if(c!=0x20) {
				1267	prev=BOCU1_ASCII_PREV;
				1268	}
				1269	*target++=(UChar)c;
				1270	continue;
				1271	} else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
				1272	/* Optimize two-byte case. */
				1273	if(c>=BOCU1_MIDDLE) {
				1274	diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
				1275	} else {
				1276	diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
				1277	}
				1278
				1279	/* trail byte */
				1280	c=decodeBocu1TrailByte(1, *source++);
				1281	if(c<0 \|\| (uint32_t)(c=prev+diff+c)>0x10ffff) {
				1282	bytes[0]=source[-2];
				1283	bytes[1]=source[-1];
				1284	byteIndex=2;
				1285	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1286	break;
				1287	}
				1288	} else if(c==BOCU1_RESET) {
				1289	/* only reset the state, no code point */
				1290	prev=BOCU1_ASCII_PREV;
				1291	continue;
				1292	} else {
				1293	/*
				1294	* For multi-byte difference lead bytes, set the decoder state
				1295	* with the partial difference value from the lead byte and
				1296	* with the number of trail bytes.
				1297	*/
				1298	bytes[0]=(uint8_t)c;
				1299	byteIndex=1;
				1300
				1301	diff=decodeBocu1LeadByte(c);
				1302	count=diff&3;
				1303	diff>>=2;
				1304	getTrail:
				1305	for(;;) {
				1306	if(source>=sourceLimit) {
				1307	goto endloop;
				1308	}
				1309	c=bytes[byteIndex++]=*source++;
				1310
				1311	/* trail byte in any position */
				1312	c=decodeBocu1TrailByte(count, c);
				1313	if(c<0) {
				1314	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1315	goto endloop;
				1316	}
				1317
				1318	diff+=c;
				1319	if(--count==0) {
				1320	/* final trail byte, deliver a code point */
				1321	byteIndex=0;
				1322	c=prev+diff;
				1323	if((uint32_t)c>0x10ffff) {
				1324	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1325	goto endloop;
				1326	}
				1327	break;
				1328	}
				1329	}
				1330	}
				1331
				1332	/* calculate the next prev and output c */
				1333	prev=BOCU1_PREV(c);
				1334	if(c<=0xffff) {
				1335	*target++=(UChar)c;
				1336	} else {
				1337	/* output surrogate pair */
				1338	*target++=U16_LEAD(c);
				1339	if(target<targetLimit) {
				1340	*target++=U16_TRAIL(c);
				1341	} else {
				1342	/* target overflow */
				1343	cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
				1344	cnv->UCharErrorBufferLength=1;
				1345	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1346	break;
				1347	}
				1348	}
				1349	}
				1350	endloop:
				1351
				1352	if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
				1353	/* set the converter state in UConverter to deal with the next character */
				1354	cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
				1355	cnv->mode=0;
				1356	} else {
				1357	/* set the converter state back into UConverter */
				1358	cnv->toUnicodeStatus=(uint32_t)prev;
				1359	cnv->mode=(diff<<2)\|count;
				1360	}
				1361	cnv->toULength=byteIndex;
				1362
				1363	/* write back the updated pointers */
				1364	pArgs->source=(const char *)source;
				1365	pArgs->target=target;
				1366	return;
				1367	}
				1368
				1369	/* miscellaneous ------------------------------------------------------------ */
				1370
				1371	static const UConverterImpl _Bocu1Impl={
				1372	UCNV_BOCU1,
				1373
				1374	NULL,
				1375	NULL,
				1376
				1377	NULL,
				1378	NULL,
				1379	NULL,
				1380
				1381	_Bocu1ToUnicode,
				1382	_Bocu1ToUnicodeWithOffsets,
				1383	_Bocu1FromUnicode,
				1384	_Bocu1FromUnicodeWithOffsets,
				1385	NULL,
				1386
				1387	NULL,
				1388	NULL,
				1389	NULL,
				1390	NULL,
				1391	ucnv_getCompleteUnicodeSet,
				1392
				1393	NULL,
				1394	NULL
				1395	};
				1396
				1397	static const UConverterStaticData _Bocu1StaticData={
				1398	sizeof(UConverterStaticData),
				1399	"BOCU-1",
				1400	1214, /* CCSID for BOCU-1 */
				1401	UCNV_IBM, UCNV_BOCU1,
				1402	1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
				1403	{ 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	1404	false, false,
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1405	0,
				1406	0,
				1407	{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
				1408	};
				1409
Jungshik Shin	a05f412	2015-06-09 15:33:54 -0700	[diff] [blame]	1410	const UConverterSharedData _Bocu1Data=
				1411	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1412
				1413	#endif