Blame - source/common/ucnvmbcs.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 4b36cc605b16c778f2c63406b1062e4f23a7a112 [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	3	/*
				4	******************************************************************************
				5	*
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	6	* Copyright (C) 2000-2016, International Business Machines
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	7	* Corporation and others. All Rights Reserved.
				8	*
				9	******************************************************************************
				10	* file name: ucnvmbcs.cpp
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	11	* encoding: UTF-8
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	12	* tab size: 8 (not used)
				13	* indentation:4
				14	*
				15	* created on: 2000jul03
				16	* created by: Markus W. Scherer
				17	*
				18	* The current code in this file replaces the previous implementation
				19	* of conversion code from multi-byte codepages to Unicode and back.
				20	* This implementation supports the following:
				21	* - legacy variable-length codepages with up to 4 bytes per character
				22	* - all Unicode code points (up to 0x10ffff)
				23	* - efficient distinction of unassigned vs. illegal byte sequences
				24	* - it is possible in fromUnicode() to directly deal with simple
				25	* stateful encodings (used for EBCDIC_STATEFUL)
				26	* - it is possible to convert Unicode code points
				27	* to a single zero byte (but not as a fallback except for SBCS)
				28	*
				29	* Remaining limitations in fromUnicode:
				30	* - byte sequences must not have leading zero bytes
				31	* - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
				32	* - limitation to up to 4 bytes per character
				33	*
				34	* ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
				35	* limitations and adds m:n character mappings and other features.
				36	* See ucnv_ext.h for details.
				37	*
				38	* Change history:
				39	*
				40	* 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
				41	* MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
				42	* macros to ucnvmbcs.h file
				43	*/
				44
				45	#include "unicode/utypes.h"
				46
				47	#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
				48
				49	#include "unicode/ucnv.h"
				50	#include "unicode/ucnv_cb.h"
				51	#include "unicode/udata.h"
				52	#include "unicode/uset.h"
				53	#include "unicode/utf8.h"
				54	#include "unicode/utf16.h"
				55	#include "ucnv_bld.h"
				56	#include "ucnvmbcs.h"
				57	#include "ucnv_ext.h"
				58	#include "ucnv_cnv.h"
				59	#include "cmemory.h"
				60	#include "cstring.h"
				61	#include "umutex.h"
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	62	#include "ustr_imp.h"
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	63
				64	/* control optimizations according to the platform */
				65	#define MBCS_UNROLL_SINGLE_TO_BMP 1
				66	#define MBCS_UNROLL_SINGLE_FROM_BMP 0
				67
				68	/*
				69	* _MBCSHeader versions 5.3 & 4.3
				70	* (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
				71	*
				72	* This version is optional. Version 5 is used for incompatible data format changes.
				73	* makeconv will continue to generate version 4 files if possible.
				74	*
				75	* Changes from version 4:
				76	*
				77	* The main difference is an additional _MBCSHeader field with
				78	* - the length (number of uint32_t) of the _MBCSHeader
				79	* - flags for further incompatible data format changes
				80	* - flags for further, backward compatible data format changes
				81	*
				82	* The MBCS_OPT_FROM_U flag indicates that most of the fromUnicode data is omitted from
				83	* the file and needs to be reconstituted at load time.
				84	* This requires a utf8Friendly format with an additional mbcsIndex table for fast
				85	* (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
				86	* (For details about these structures see below, and see ucnvmbcs.h.)
				87	*
				88	* utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
				89	* of the Unicode code points. (This requires that the .ucm file has the |0 etc.
				90	* precision markers for all mappings.)
				91	*
				92	* All fallbacks have been moved to the extension table, leaving only roundtrips in the
				93	* omitted data that can be reconstituted from the toUnicode data.
				94	*
				95	* Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
				96	* With only roundtrip mappings in the base fromUnicode data, this part is fully
				97	* redundant with the mbcsIndex and will be reconstituted from that (also using the
				98	* stage 1 table which contains the information about how stage 2 was compacted).
				99	*
				100	* The rest of the stage 2 table, the part for code points above maxFastUChar,
				101	* is stored in the file and will be appended to the reconstituted part.
				102	*
				103	* The entire fromUBytes array is omitted from the file and will be reconstituted.
				104	* This is done by enumerating all toUnicode roundtrip mappings, performing
				105	* each mapping (using the stage 1 and reconstituted stage 2 tables) and
				106	* writing instead of reading the byte values.
				107	*
				108	* _MBCSHeader version 4.3
				109	*
				110	* Change from version 4.2:
				111	* - Optional utf8Friendly data structures, with 64-entry stage 3 block
				112	* allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
				113	* files which can be used instead of stages 1 & 2.
				114	* Faster lookups for roundtrips from most commonly used characters,
				115	* and lookups from UTF-8 byte sequences with a natural bit distribution.
				116	* See ucnvmbcs.h for more details.
				117	*
				118	* Change from version 4.1:
				119	* - Added an optional extension table structure at the end of the .cnv file.
				120	* It is present if the upper bits of the header flags field contains a non-zero
				121	* byte offset to it.
				122	* Files that contain only a conversion table and no base table
				123	* use the special outputType MBCS_OUTPUT_EXT_ONLY.
				124	* These contain the base table name between the MBCS header and the extension
				125	* data.
				126	*
				127	* Change from version 4.0:
				128	* - Replace header.reserved with header.fromUBytesLength so that all
				129	* fields in the data have length.
				130	*
				131	* Changes from version 3 (for performance improvements):
				132	* - new bit distribution for state table entries
				133	* - reordered action codes
				134	* - new data structure for single-byte fromUnicode
				135	* + stage 2 only contains indexes
				136	* + stage 3 stores 16 bits per character with classification bits 15..8
				137	* - no multiplier for stage 1 entries
				138	* - stage 2 for non-single-byte codepages contains the index and the flags in
				139	* one 32-bit value
				140	* - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
				141	*
				142	* For more details about old versions of the MBCS data structure, see
				143	* the corresponding versions of this file.
				144	*
				145	* Converting stateless codepage data ---------------------------------------***
				146	* (or codepage data with simple states) to Unicode.
				147	*
				148	* Data structure and algorithm for converting from complex legacy codepages
				149	* to Unicode. (Designed before 2000-may-22.)
				150	*
				151	* The basic idea is that the structure of legacy codepages can be described
				152	* with state tables.
				153	* When reading a byte stream, each input byte causes a state transition.
				154	* Some transitions result in the output of a code point, some result in
				155	* "unassigned" or "illegal" output.
				156	* This is used here for character conversion.
				157	*
				158	* The data structure begins with a state table consisting of a row
				159	* per state, with 256 entries (columns) per row for each possible input
				160	* byte value.
				161	* Each entry is 32 bits wide, with two formats distinguished by
				162	* the sign bit (bit 31):
				163	*
				164	* One format for transitional entries (bit 31 not set) for non-final bytes, and
				165	* one format for final entries (bit 31 set).
				166	* Both formats contain the number of the next state in the same bit
				167	* positions.
				168	* State 0 is the initial state.
				169	*
				170	* Most of the time, the offset values of subsequent states are added
				171	* up to a scalar value. This value will eventually be the index of
				172	* the Unicode code point in a table that follows the state table.
				173	* The effect is that the code points for final state table rows
				174	* are contiguous. The code points of final state rows follow each other
				175	* in the order of the references to those final states by previous
				176	* states, etc.
				177	*
				178	* For some terminal states, the offset is itself the output Unicode
				179	* code point (16 bits for a BMP code point or 20 bits for a supplementary
				180	* code point (stored as code point minus 0x10000 so that 20 bits are enough).
				181	* For others, the code point in the Unicode table is stored with either
				182	* one or two code units: one for BMP code points, two for a pair of
				183	* surrogates.
				184	* All code points for a final state entry take up the same number of code
				185	* units, regardless of whether they all actually _use_ the same number
				186	* of code units. This is necessary for simple array access.
				187	*
				188	* An additional feature comes in with what in ICU is called "fallback"
				189	* mappings:
				190	*
				191	* In addition to round-trippable, precise, 1:1 mappings, there are often
				192	* mappings defined between similar, though not the same, characters.
				193	* Typically, such mappings occur only in fromUnicode mapping tables because
				194	* Unicode has a superset repertoire of most other codepages. However, it
				195	* is possible to provide such mappings in the toUnicode tables, too.
				196	* In this case, the fallback mappings are partly integrated into the
				197	* general state tables because the structure of the encoding includes their
				198	* byte sequences.
				199	* For final entries in an initial state, fallback mappings are stored in
				200	* the entry itself like with roundtrip mappings.
				201	* For other final entries, they are stored in the code units table if
				202	* the entry is for a pair of code units.
				203	* For single-unit results in the code units table, there is no space to
				204	* alternatively hold a fallback mapping; in this case, the code unit
				205	* is stored as U+fffe (unassigned), and the fallback mapping needs to
				206	* be looked up by the scalar offset value in a separate table.
				207	*
				208	* "Unassigned" state entries really mean "structurally unassigned",
				209	* i.e., such a byte sequence will never have a mapping result.
				210	*
				211	* The interpretation of the bits in each entry is as follows:
				212	*
				213	* Bit 31 not set, not a terminal entry ("transitional"):
				214	* 30..24 next state
				215	* 23..0 offset delta, to be added up
				216	*
				217	* Bit 31 set, terminal ("final") entry:
				218	* 30..24 next state (regardless of action code)
				219	* 23..20 action code:
				220	* action codes 0 and 1 result in precise-mapping Unicode code points
				221	* 0 valid byte sequence
				222	* 19..16 not used, 0
				223	* 15..0 16-bit Unicode BMP code point
				224	* never U+fffe or U+ffff
				225	* 1 valid byte sequence
				226	* 19..0 20-bit Unicode supplementary code point
				227	* never U+fffe or U+ffff
				228	*
				229	* action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
				230	* 2 valid byte sequence (fallback)
				231	* 19..16 not used, 0
				232	* 15..0 16-bit Unicode BMP code point as fallback result
				233	* 3 valid byte sequence (fallback)
				234	* 19..0 20-bit Unicode supplementary code point as fallback result
				235	*
				236	* action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
				237	* depending on the code units they result in
				238	* 4 valid byte sequence
				239	* 19..9 not used, 0
				240	* 8..0 final offset delta
				241	* pointing to one 16-bit code unit which may be
				242	* fffe unassigned -- look for a fallback for this offset
				243	* ffff illegal
				244	* 5 valid byte sequence
				245	* 19..9 not used, 0
				246	* 8..0 final offset delta
				247	* pointing to two 16-bit code units
				248	* (typically UTF-16 surrogates)
				249	* the result depends on the first code unit as follows:
				250	* 0000..d7ff roundtrip BMP code point (1st alone)
				251	* d800..dbff roundtrip surrogate pair (1st, 2nd)
				252	* dc00..dfff fallback surrogate pair (1st-400, 2nd)
				253	* e000 roundtrip BMP code point (2nd alone)
				254	* e001 fallback BMP code point (2nd alone)
				255	* fffe unassigned
				256	* ffff illegal
				257	* (the final offset deltas are at most 255 * 2,
				258	* times 2 because of storing code unit pairs)
				259	*
				260	* 6 unassigned byte sequence
				261	* 19..16 not used, 0
				262	* 15..0 16-bit Unicode BMP code point U+fffe (new with version 2)
				263	* this does not contain a final offset delta because the main
				264	* purpose of this action code is to save scalar offset values;
				265	* therefore, fallback values cannot be assigned to byte
				266	* sequences that result in this action code
				267	* 7 illegal byte sequence
				268	* 19..16 not used, 0
				269	* 15..0 16-bit Unicode BMP code point U+ffff (new with version 2)
				270	* 8 state change only
				271	* 19..0 not used, 0
				272	* useful for state changes in simple stateful encodings,
				273	* at Shift-In/Shift-Out codes
				274	*
				275	*
				276	* 9..15 reserved for future use
				277	* current implementations will only perform a state change
				278	* and ignore bits 19..0
				279	*
				280	* An encoding with contiguous ranges of unassigned byte sequences, like
				281	* Shift-JIS and especially EUC-TW, can be stored efficiently by having
				282	* at least two states for the trail bytes:
				283	* One trail byte state that results in code points, and one that only
				284	* has "unassigned" and "illegal" terminal states.
				285	*
				286	* Note: partly by accident, this data structure supports simple stateful
				287	* encodings without any additional logic.
				288	* Currently, only simple Shift-In/Shift-Out schemes are handled with
				289	* appropriate state tables (especially EBCDIC_STATEFUL!).
				290	*
				291	* MBCS version 2 added:
				292	* unassigned and illegal action codes have U+fffe and U+ffff
				293	* instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
				294	*
				295	* Converting from Unicode to codepage bytes --------------------------------***
				296	*
				297	* The conversion data structure for fromUnicode is designed for the known
				298	* structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
				299	* a sequence of 1..4 bytes, in addition to a flag that indicates if there is
				300	* a roundtrip mapping.
				301	*
				302	* The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
				303	* like in the character properties table.
				304	* The beginning of the trie is at offsetFromUTable, the beginning of stage 3
				305	* with the resulting bytes is at offsetFromUBytes.
				306	*
				307	* Beginning with version 4, single-byte codepages have a significantly different
				308	* trie compared to other codepages.
				309	* In all cases, the entry in stage 1 is directly the index of the block of
				310	* 64 entries in stage 2.
				311	*
				312	* Single-byte lookup:
				313	*
				314	* Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
				315	* Stage 3 contains one 16-bit word per result:
				316	* Bits 15..8 indicate the kind of result:
				317	* f roundtrip result
				318	* c fallback result from private-use code point
				319	* 8 fallback result from other code points
				320	* 0 unassigned
				321	* Bits 7..0 contain the codepage byte. A zero byte is always possible.
				322	*
				323	* In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly
				324	* file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup
				325	* becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
				326	* ASCII code points can be looked up with a linear array access into stage 3.
				327	* See maxFastUChar and other details in ucnvmbcs.h.
				328	*
				329	* Multi-byte lookup:
				330	*
				331	* Stage 2 contains a 32-bit word for each 16-block in stage 3:
				332	* Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
				333	* test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
				334	* If this test is false, then a non-zero result will be interpreted as
				335	* a fallback mapping.
				336	* Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char)
				337	*
				338	* Stage 3 contains 2, 3, or 4 bytes per result.
				339	* 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
				340	* while 3 bytes are stored as bytes in big-endian order.
				341	* Leading zero bytes are ignored, and the number of bytes is counted.
				342	* A zero byte mapping result is possible as a roundtrip result.
				343	* For some output types, the actual result is processed from this;
				344	* see ucnv_MBCSFromUnicodeWithOffsets().
				345	*
				346	* Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
				347	* or (version 3 and up) for BMP-only codepages, it contains 64 entries.
				348	*
				349	* In version 4.3, a utf8Friendly file contains an mbcsIndex table.
				350	* For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup
				351	* becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
				352	* ASCII code points can be looked up with a linear array access into stage 3.
				353	* See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.
				354	*
				355	* In version 3, stage 2 blocks may overlap by multiples of the multiplier
				356	* for compaction.
				357	* In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
				358	* may overlap by any number of entries.
				359	*
				360	* MBCS version 2 added:
				361	* the converter checks for known output types, which allows
				362	* adding new ones without crashing an unaware converter
				363	*/
				364
				365	/**
				366	* Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
				367	* consecutive sequences of bytes, starting from the one encoded in value,
				368	* to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
				369	* Does not currently support m:n mappings or reverse fallbacks.
				370	* This function will not be called for sequences of bytes with leading zeros.
				371	*
				372	* @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
				373	* @param value contains 1..4 bytes of the first byte sequence, right-aligned
				374	* @param codePoints resulting Unicode code points, or negative if a byte sequence does
				375	* not map to anything
				376	* @return TRUE to continue enumeration, FALSE to stop
				377	*/
				378	typedef UBool U_CALLCONV
				379	UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]);
				380
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	381	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	382	ucnv_MBCSLoad(UConverterSharedData *sharedData,
				383	UConverterLoadArgs *pArgs,
				384	const uint8_t *raw,
				385	UErrorCode *pErrorCode);
				386
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	387	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	388	ucnv_MBCSUnload(UConverterSharedData *sharedData);
				389
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	390	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	391	ucnv_MBCSOpen(UConverter *cnv,
				392	UConverterLoadArgs *pArgs,
				393	UErrorCode *pErrorCode);
				394
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	395	static UChar32 U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	396	ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
				397	UErrorCode *pErrorCode);
				398
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	399	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	400	ucnv_MBCSGetStarters(const UConverter* cnv,
				401	UBool starters[256],
				402	UErrorCode *pErrorCode);
				403
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	404	U_CDECL_BEGIN
				405	static const char* U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	406	ucnv_MBCSGetName(const UConverter *cnv);
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	407	U_CDECL_END
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	408
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	409	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	410	ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
				411	int32_t offsetIndex,
				412	UErrorCode *pErrorCode);
				413
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	414	static UChar32 U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	415	ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
				416	UErrorCode *pErrorCode);
				417
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	418	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	419	ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
				420	UConverterToUnicodeArgs *pToUArgs,
				421	UErrorCode *pErrorCode);
				422
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	423	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	424	ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
				425	const USetAdder *sa,
				426	UConverterUnicodeSet which,
				427	UErrorCode *pErrorCode);
				428
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	429	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	430	ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
				431	UConverterToUnicodeArgs *pToUArgs,
				432	UErrorCode *pErrorCode);
				433
				434	static const UConverterImpl _SBCSUTF8Impl={
				435	UCNV_MBCS,
				436
				437	ucnv_MBCSLoad,
				438	ucnv_MBCSUnload,
				439
				440	ucnv_MBCSOpen,
				441	NULL,
				442	NULL,
				443
				444	ucnv_MBCSToUnicodeWithOffsets,
				445	ucnv_MBCSToUnicodeWithOffsets,
				446	ucnv_MBCSFromUnicodeWithOffsets,
				447	ucnv_MBCSFromUnicodeWithOffsets,
				448	ucnv_MBCSGetNextUChar,
				449
				450	ucnv_MBCSGetStarters,
				451	ucnv_MBCSGetName,
				452	ucnv_MBCSWriteSub,
				453	NULL,
				454	ucnv_MBCSGetUnicodeSet,
				455
				456	NULL,
				457	ucnv_SBCSFromUTF8
				458	};
				459
				460	static const UConverterImpl _DBCSUTF8Impl={
				461	UCNV_MBCS,
				462
				463	ucnv_MBCSLoad,
				464	ucnv_MBCSUnload,
				465
				466	ucnv_MBCSOpen,
				467	NULL,
				468	NULL,
				469
				470	ucnv_MBCSToUnicodeWithOffsets,
				471	ucnv_MBCSToUnicodeWithOffsets,
				472	ucnv_MBCSFromUnicodeWithOffsets,
				473	ucnv_MBCSFromUnicodeWithOffsets,
				474	ucnv_MBCSGetNextUChar,
				475
				476	ucnv_MBCSGetStarters,
				477	ucnv_MBCSGetName,
				478	ucnv_MBCSWriteSub,
				479	NULL,
				480	ucnv_MBCSGetUnicodeSet,
				481
				482	NULL,
				483	ucnv_DBCSFromUTF8
				484	};
				485
				486	static const UConverterImpl _MBCSImpl={
				487	UCNV_MBCS,
				488
				489	ucnv_MBCSLoad,
				490	ucnv_MBCSUnload,
				491
				492	ucnv_MBCSOpen,
				493	NULL,
				494	NULL,
				495
				496	ucnv_MBCSToUnicodeWithOffsets,
				497	ucnv_MBCSToUnicodeWithOffsets,
				498	ucnv_MBCSFromUnicodeWithOffsets,
				499	ucnv_MBCSFromUnicodeWithOffsets,
				500	ucnv_MBCSGetNextUChar,
				501
				502	ucnv_MBCSGetStarters,
				503	ucnv_MBCSGetName,
				504	ucnv_MBCSWriteSub,
				505	NULL,
				506	ucnv_MBCSGetUnicodeSet,
				507	NULL,
				508	NULL
				509	};
				510
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	511	/* Static data is in tools/makeconv/ucnvstat.c for data-based
				512	* converters. Be sure to update it as well.
				513	*/
				514
				515	const UConverterSharedData _MBCSData={
				516	sizeof(UConverterSharedData), 1,
Jungshik Shin	a05f412	2015-06-09 15:33:54 -0700	[diff] [blame]	517	NULL, NULL, FALSE, TRUE, &_MBCSImpl,
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	518	0, UCNV_MBCS_TABLE_INITIALIZER
				519	};
				520
				521
				522	/* GB 18030 data ------------------------------------------------------------ */
				523
				524	/* helper macros for linear values for GB 18030 four-byte sequences */
				525	#define LINEAR_18030(a, b, c, d) ((((a)10+(b))126L+(c))*10L+(d))
				526
				527	#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)
				528
				529	#define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff)
				530
				531	/*
				532	* Some ranges of GB 18030 where both the Unicode code points and the
				533	* GB four-byte sequences are contiguous and are handled algorithmically by
				534	* the special callback functions below.
				535	* The values are start & end of Unicode & GB codes.
				536	*
				537	* Note that single surrogates are not mapped by GB 18030
				538	* as of the re-released mapping tables from 2000-nov-30.
				539	*/
				540	static const uint32_t
				541	gb18030Ranges[14][4]={
				542	{0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
				543	{0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
				544	{0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)},
				545	{0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)},
				546	{0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
				547	{0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
				548	{0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
				549	{0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
				550	{0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
				551	{0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
				552	{0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
				553	{0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
				554	{0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
				555	{0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
				556	};
				557
				558	/* bit flag for UConverter.options indicating GB 18030 special handling */
				559	#define _MBCS_OPTION_GB18030 0x8000
				560
				561	/* bit flag for UConverter.options indicating KEIS,JEF,JIF special handling */
				562	#define _MBCS_OPTION_KEIS 0x01000
				563	#define _MBCS_OPTION_JEF 0x02000
				564	#define _MBCS_OPTION_JIPS 0x04000
				565
				566	#define KEIS_SO_CHAR_1 0x0A
				567	#define KEIS_SO_CHAR_2 0x42
				568	#define KEIS_SI_CHAR_1 0x0A
				569	#define KEIS_SI_CHAR_2 0x41
				570
				571	#define JEF_SO_CHAR 0x28
				572	#define JEF_SI_CHAR 0x29
				573
				574	#define JIPS_SO_CHAR_1 0x1A
				575	#define JIPS_SO_CHAR_2 0x70
				576	#define JIPS_SI_CHAR_1 0x1A
				577	#define JIPS_SI_CHAR_2 0x71
				578
				579	enum SISO_Option {
				580	SI,
				581	SO
				582	};
				583	typedef enum SISO_Option SISO_Option;
				584
				585	static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) {
				586	int32_t SISOLength = 0;
				587
				588	switch (option) {
				589	case SI:
				590	if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
				591	value[0] = KEIS_SI_CHAR_1;
				592	value[1] = KEIS_SI_CHAR_2;
				593	SISOLength = 2;
				594	} else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
				595	value[0] = JEF_SI_CHAR;
				596	SISOLength = 1;
				597	} else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
				598	value[0] = JIPS_SI_CHAR_1;
				599	value[1] = JIPS_SI_CHAR_2;
				600	SISOLength = 2;
				601	} else {
				602	value[0] = UCNV_SI;
				603	SISOLength = 1;
				604	}
				605	break;
				606	case SO:
				607	if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
				608	value[0] = KEIS_SO_CHAR_1;
				609	value[1] = KEIS_SO_CHAR_2;
				610	SISOLength = 2;
				611	} else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
				612	value[0] = JEF_SO_CHAR;
				613	SISOLength = 1;
				614	} else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
				615	value[0] = JIPS_SO_CHAR_1;
				616	value[1] = JIPS_SO_CHAR_2;
				617	SISOLength = 2;
				618	} else {
				619	value[0] = UCNV_SO;
				620	SISOLength = 1;
				621	}
				622	break;
				623	default:
				624	/* Should never happen. */
				625	break;
				626	}
				627
				628	return SISOLength;
				629	}
				630
				631	/* Miscellaneous ------------------------------------------------------------ */
				632
				633	/* similar to ucnv_MBCSGetNextUChar() but recursive */
				634	static UBool
				635	enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],
				636	int32_t state, uint32_t offset,
				637	uint32_t value,
				638	UConverterEnumToUCallback callback, const void context,
				639	UErrorCode *pErrorCode) {
				640	UChar32 codePoints[32];
				641	const int32_t *row;
				642	const uint16_t *unicodeCodeUnits;
				643	UChar32 anyCodePoints;
				644	int32_t b, limit;
				645
				646	row=mbcsTable->stateTable[state];
				647	unicodeCodeUnits=mbcsTable->unicodeCodeUnits;
				648
				649	value<<=8;
				650	anyCodePoints=-1; /* becomes non-negative if there is a mapping */
				651
				652	b=(stateProps[state]&0x38)<<2;
				653	if(b==0 && stateProps[state]>=0x40) {
				654	/* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
				655	codePoints[0]=U_SENTINEL;
				656	b=1;
				657	}
				658	limit=((stateProps[state]&7)+1)<<5;
				659	while(b<limit) {
				660	int32_t entry=row[b];
				661	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
				662	int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry);
				663	if(stateProps[nextState]>=0) {
				664	/* recurse to a state with non-ignorable actions */
				665	if(!enumToU(
				666	mbcsTable, stateProps, nextState,
				667	offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
				668	value\|(uint32_t)b,
				669	callback, context,
				670	pErrorCode)) {
				671	return FALSE;
				672	}
				673	}
				674	codePoints[b&0x1f]=U_SENTINEL;
				675	} else {
				676	UChar32 c;
				677	int32_t action;
				678
				679	/*
				680	* An if-else-if chain provides more reliable performance for
				681	* the most common cases compared to a switch.
				682	*/
				683	action=MBCS_ENTRY_FINAL_ACTION(entry);
				684	if(action==MBCS_STATE_VALID_DIRECT_16) {
				685	/* output BMP code point */
				686	c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				687	} else if(action==MBCS_STATE_VALID_16) {
				688	int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
				689	c=unicodeCodeUnits[finalOffset];
				690	if(c<0xfffe) {
				691	/* output BMP code point */
				692	} else {
				693	c=U_SENTINEL;
				694	}
				695	} else if(action==MBCS_STATE_VALID_16_PAIR) {
				696	int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
				697	c=unicodeCodeUnits[finalOffset++];
				698	if(c<0xd800) {
				699	/* output BMP code point below 0xd800 */
				700	} else if(c<=0xdbff) {
				701	/* output roundtrip or fallback supplementary code point */
				702	c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00);
				703	} else if(c==0xe000) {
				704	/* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
				705	c=unicodeCodeUnits[finalOffset];
				706	} else {
				707	c=U_SENTINEL;
				708	}
				709	} else if(action==MBCS_STATE_VALID_DIRECT_20) {
				710	/* output supplementary code point */
				711	c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
				712	} else {
				713	c=U_SENTINEL;
				714	}
				715
				716	codePoints[b&0x1f]=c;
				717	anyCodePoints&=c;
				718	}
				719	if(((++b)&0x1f)==0) {
				720	if(anyCodePoints>=0) {
				721	if(!callback(context, value\|(uint32_t)(b-0x20), codePoints)) {
				722	return FALSE;
				723	}
				724	anyCodePoints=-1;
				725	}
				726	}
				727	}
				728	return TRUE;
				729	}
				730
				731	/*
				732	* Only called if stateProps[state]==-1.
				733	* A recursive call may do stateProps[state]\|=0x40 if this state is the target of an
				734	* MBCS_STATE_CHANGE_ONLY.
				735	*/
				736	static int8_t
				737	getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) {
				738	const int32_t *row;
				739	int32_t min, max, entry, nextState;
				740
				741	row=stateTable[state];
				742	stateProps[state]=0;
				743
				744	/* find first non-ignorable state */
				745	for(min=0;; ++min) {
				746	entry=row[min];
				747	nextState=MBCS_ENTRY_STATE(entry);
				748	if(stateProps[nextState]==-1) {
				749	getStateProp(stateTable, stateProps, nextState);
				750	}
				751	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
				752	if(stateProps[nextState]>=0) {
				753	break;
				754	}
				755	} else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
				756	break;
				757	}
				758	if(min==0xff) {
				759	stateProps[state]=-0x40; /* (int8_t)0xc0 */
				760	return stateProps[state];
				761	}
				762	}
				763	stateProps[state]\|=(int8_t)((min>>5)<<3);
				764
				765	/* find last non-ignorable state */
				766	for(max=0xff; min<max; --max) {
				767	entry=row[max];
				768	nextState=MBCS_ENTRY_STATE(entry);
				769	if(stateProps[nextState]==-1) {
				770	getStateProp(stateTable, stateProps, nextState);
				771	}
				772	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
				773	if(stateProps[nextState]>=0) {
				774	break;
				775	}
				776	} else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
				777	break;
				778	}
				779	}
				780	stateProps[state]\|=(int8_t)(max>>5);
				781
				782	/* recurse further and collect direct-state information */
				783	while(min<=max) {
				784	entry=row[min];
				785	nextState=MBCS_ENTRY_STATE(entry);
				786	if(stateProps[nextState]==-1) {
				787	getStateProp(stateTable, stateProps, nextState);
				788	}
				789	if(MBCS_ENTRY_IS_FINAL(entry)) {
				790	stateProps[nextState]\|=0x40;
				791	if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) {
				792	stateProps[state]\|=0x40;
				793	}
				794	}
				795	++min;
				796	}
				797	return stateProps[state];
				798	}
				799
				800	/*
				801	* Internal function enumerating the toUnicode data of an MBCS converter.
				802	* Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
				803	* table, but could also be used for a future ucnv_getUnicodeSet() option
				804	* that includes reverse fallbacks (after updating this function's implementation).
				805	* Currently only handles roundtrip mappings.
				806	* Does not currently handle extensions.
				807	*/
				808	static void
				809	ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,
				810	UConverterEnumToUCallback callback, const void context,
				811	UErrorCode *pErrorCode) {
				812	/*
				813	* Properties for each state, to speed up the enumeration.
				814	* Ignorable actions are unassigned/illegal/state-change-only:
				815	* They do not lead to mappings.
				816	*
				817	* Bits 7..6:
				818	* 1 direct/initial state (stateful converters have multiple)
				819	* 0 non-initial state with transitions or with non-ignorable result actions
				820	* -1 final state with only ignorable actions
				821	*
				822	* Bits 5..3:
				823	* The lowest byte value with non-ignorable actions is
				824	* value<<5 (rounded down).
				825	*
				826	* Bits 2..0:
				827	* The highest byte value with non-ignorable actions is
				828	* (value<<5)&0x1f (rounded up).
				829	*/
				830	int8_t stateProps[MBCS_MAX_STATE_COUNT];
				831	int32_t state;
				832
				833	uprv_memset(stateProps, -1, sizeof(stateProps));
				834
				835	/* recurse from state 0 and set all stateProps */
				836	getStateProp(mbcsTable->stateTable, stateProps, 0);
				837
				838	for(state=0; state<mbcsTable->countStates; ++state) {
				839	/*if(stateProps[state]==-1) {
				840	printf("unused/unreachable <icu:state> %d\n", state);
				841	}*/
				842	if(stateProps[state]>=0x40) {
				843	/* start from each direct state */
				844	enumToU(
				845	mbcsTable, stateProps, state, 0, 0,
				846	callback, context,
				847	pErrorCode);
				848	}
				849	}
				850	}
				851
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	852	U_CFUNC void
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	853	ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
				854	const USetAdder *sa,
				855	UConverterUnicodeSet which,
				856	UConverterSetFilter filter,
				857	UErrorCode *pErrorCode) {
				858	const UConverterMBCSTable *mbcsTable;
				859	const uint16_t *table;
				860
				861	uint32_t st3;
				862	uint16_t st1, maxStage1, st2;
				863
				864	UChar32 c;
				865
				866	/* enumerate the from-Unicode trie table */
				867	mbcsTable=&sharedData->mbcs;
				868	table=mbcsTable->fromUnicodeTable;
				869	if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
				870	maxStage1=0x440;
				871	} else {
				872	maxStage1=0x40;
				873	}
				874
				875	c=0; /* keep track of the current code point while enumerating */
				876
				877	if(mbcsTable->outputType==MBCS_OUTPUT_1) {
				878	const uint16_t stage2, stage3, *results;
				879	uint16_t minValue;
				880
				881	results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
				882
				883	/*
				884	* Set a threshold variable for selecting which mappings to use.
				885	* See ucnv_MBCSSingleFromBMPWithOffsets() and
				886	* MBCS_SINGLE_RESULT_FROM_U() for details.
				887	*/
				888	if(which==UCNV_ROUNDTRIP_SET) {
				889	/* use only roundtrips */
				890	minValue=0xf00;
				891	} else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
				892	/* use all roundtrip and fallback results */
				893	minValue=0x800;
				894	}
				895
				896	for(st1=0; st1<maxStage1; ++st1) {
				897	st2=table[st1];
				898	if(st2>maxStage1) {
				899	stage2=table+st2;
				900	for(st2=0; st2<64; ++st2) {
				901	if((st3=stage2[st2])!=0) {
				902	/* read the stage 3 block */
				903	stage3=results+st3;
				904
				905	do {
				906	if(*stage3++>=minValue) {
				907	sa->add(sa->set, c);
				908	}
				909	} while((++c&0xf)!=0);
				910	} else {
				911	c+=16; /* empty stage 3 block */
				912	}
				913	}
				914	} else {
				915	c+=1024; /* empty stage 2 block */
				916	}
				917	}
				918	} else {
				919	const uint32_t *stage2;
				920	const uint8_t stage3, bytes;
				921	uint32_t st3Multiplier;
				922	uint32_t value;
				923	UBool useFallback;
				924
				925	bytes=mbcsTable->fromUnicodeBytes;
				926
				927	useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
				928
				929	switch(mbcsTable->outputType) {
				930	case MBCS_OUTPUT_3:
				931	case MBCS_OUTPUT_4_EUC:
				932	st3Multiplier=3;
				933	break;
				934	case MBCS_OUTPUT_4:
				935	st3Multiplier=4;
				936	break;
				937	default:
				938	st3Multiplier=2;
				939	break;
				940	}
				941
				942	for(st1=0; st1<maxStage1; ++st1) {
				943	st2=table[st1];
				944	if(st2>(maxStage1>>1)) {
				945	stage2=(const uint32_t *)table+st2;
				946	for(st2=0; st2<64; ++st2) {
				947	if((st3=stage2[st2])!=0) {
				948	/* read the stage 3 block */
				949	stage3=bytes+st3Multiplier16(uint32_t)(uint16_t)st3;
				950
				951	/* get the roundtrip flags for the stage 3 block */
				952	st3>>=16;
				953
				954	/*
				955	* Add code points for which the roundtrip flag is set,
				956	* or which map to non-zero bytes if we use fallbacks.
				957	* See ucnv_MBCSFromUnicodeWithOffsets() for details.
				958	*/
				959	switch(filter) {
				960	case UCNV_SET_FILTER_NONE:
				961	do {
				962	if(st3&1) {
				963	sa->add(sa->set, c);
				964	stage3+=st3Multiplier;
				965	} else if(useFallback) {
				966	uint8_t b=0;
				967	switch(st3Multiplier) {
				968	case 4:
				969	b\|=*stage3++;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	970	U_FALLTHROUGH;
				971	case 3:
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	972	b\|=*stage3++;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	973	U_FALLTHROUGH;
				974	case 2:
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	975	b\|=stage3[0]\|stage3[1];
				976	stage3+=2;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	977	U_FALLTHROUGH;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	978	default:
				979	break;
				980	}
				981	if(b!=0) {
				982	sa->add(sa->set, c);
				983	}
				984	}
				985	st3>>=1;
				986	} while((++c&0xf)!=0);
				987	break;
				988	case UCNV_SET_FILTER_DBCS_ONLY:
				989	/* Ignore single-byte results (<0x100). */
				990	do {
				991	if(((st3&1)!=0 \|\| useFallback) && ((const uint16_t )stage3)>=0x100) {
				992	sa->add(sa->set, c);
				993	}
				994	st3>>=1;
				995	stage3+=2; /* +=st3Multiplier */
				996	} while((++c&0xf)!=0);
				997	break;
				998	case UCNV_SET_FILTER_2022_CN:
				999	/* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
				1000	do {
				1001	if(((st3&1)!=0 \|\| useFallback) && ((value=*stage3)==0x81 \|\| value==0x82)) {
				1002	sa->add(sa->set, c);
				1003	}
				1004	st3>>=1;
				1005	stage3+=3; /* +=st3Multiplier */
				1006	} while((++c&0xf)!=0);
				1007	break;
				1008	case UCNV_SET_FILTER_SJIS:
				1009	/* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
				1010	do {
				1011	if(((st3&1)!=0 \|\| useFallback) && (value=((const uint16_t )stage3))>=0x8140 && value<=0xeffc) {
				1012	sa->add(sa->set, c);
				1013	}
				1014	st3>>=1;
				1015	stage3+=2; /* +=st3Multiplier */
				1016	} while((++c&0xf)!=0);
				1017	break;
				1018	case UCNV_SET_FILTER_GR94DBCS:
				1019	/* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
				1020	do {
				1021	if( ((st3&1)!=0 \|\| useFallback) &&
				1022	(uint16_t)((value=((const uint16_t )stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
				1023	(uint8_t)(value-0xa1)<=(0xfe - 0xa1)
				1024	) {
				1025	sa->add(sa->set, c);
				1026	}
				1027	st3>>=1;
				1028	stage3+=2; /* +=st3Multiplier */
				1029	} while((++c&0xf)!=0);
				1030	break;
				1031	case UCNV_SET_FILTER_HZ:
				1032	/* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
				1033	do {
				1034	if( ((st3&1)!=0 \|\| useFallback) &&
				1035	(uint16_t)((value=((const uint16_t )stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
				1036	(uint8_t)(value-0xa1)<=(0xfe - 0xa1)
				1037	) {
				1038	sa->add(sa->set, c);
				1039	}
				1040	st3>>=1;
				1041	stage3+=2; /* +=st3Multiplier */
				1042	} while((++c&0xf)!=0);
				1043	break;
				1044	default:
				1045	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
				1046	return;
				1047	}
				1048	} else {
				1049	c+=16; /* empty stage 3 block */
				1050	}
				1051	}
				1052	} else {
				1053	c+=1024; /* empty stage 2 block */
				1054	}
				1055	}
				1056	}
				1057
				1058	ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
				1059	}
				1060
				1061	U_CFUNC void
				1062	ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
				1063	const USetAdder *sa,
				1064	UConverterUnicodeSet which,
				1065	UErrorCode *pErrorCode) {
				1066	ucnv_MBCSGetFilteredUnicodeSetForUnicode(
				1067	sharedData, sa, which,
				1068	sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
				1069	UCNV_SET_FILTER_DBCS_ONLY :
				1070	UCNV_SET_FILTER_NONE,
				1071	pErrorCode);
				1072	}
				1073
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	1074	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	1075	ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
				1076	const USetAdder *sa,
				1077	UConverterUnicodeSet which,
				1078	UErrorCode *pErrorCode) {
				1079	if(cnv->options&_MBCS_OPTION_GB18030) {
				1080	sa->addRange(sa->set, 0, 0xd7ff);
				1081	sa->addRange(sa->set, 0xe000, 0x10ffff);
				1082	} else {
				1083	ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode);
				1084	}
				1085	}
				1086
				1087	/* conversion extensions for input not in the main table -------------------- */
				1088
				1089	/*
				1090	* Hardcoded extension handling for GB 18030.
				1091	* Definition of LINEAR macros and gb18030Ranges see near the beginning of the file.
				1092	*
				1093	* In the future, conversion extensions may handle m:n mappings and delta tables,
				1094	* see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
				1095	*
				1096	* If an input character cannot be mapped, then these functions set an error
				1097	* code. The framework will then call the callback function.
				1098	*/
				1099
				1100	/*
				1101	* @return if(U_FAILURE) return the code point for cnv->fromUChar32
				1102	* else return 0 after output has been written to the target
				1103	*/
				1104	static UChar32
				1105	_extFromU(UConverter cnv, const UConverterSharedData sharedData,
				1106	UChar32 cp,
				1107	const UChar *source, const UChar sourceLimit,
				1108	uint8_t *target, const uint8_t targetLimit,
				1109	int32_t **offsets, int32_t sourceIndex,
				1110	UBool flush,
				1111	UErrorCode *pErrorCode) {
				1112	const int32_t *cx;
				1113
				1114	cnv->useSubChar1=FALSE;
				1115
				1116	if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
				1117	ucnv_extInitialMatchFromU(
				1118	cnv, cx,
				1119	cp, source, sourceLimit,
				1120	(char *)target, (char )targetLimit,
				1121	offsets, sourceIndex,
				1122	flush,
				1123	pErrorCode)
				1124	) {
				1125	return 0; /* an extension mapping handled the input */
				1126	}
				1127
				1128	/* GB 18030 */
				1129	if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
				1130	const uint32_t *range;
				1131	int32_t i;
				1132
				1133	range=gb18030Ranges[0];
				1134	for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) {
				1135	if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {
				1136	/* found the Unicode code point, output the four-byte sequence for it */
				1137	uint32_t linear;
				1138	char bytes[4];
				1139
				1140	/* get the linear value of the first GB 18030 code in this range */
				1141	linear=range[2]-LINEAR_18030_BASE;
				1142
				1143	/* add the offset from the beginning of the range */
				1144	linear+=((uint32_t)cp-range[0]);
				1145
				1146	/* turn this into a four-byte sequence */
				1147	bytes[3]=(char)(0x30+linear%10); linear/=10;
				1148	bytes[2]=(char)(0x81+linear%126); linear/=126;
				1149	bytes[1]=(char)(0x30+linear%10); linear/=10;
				1150	bytes[0]=(char)(0x81+linear);
				1151
				1152	/* output this sequence */
				1153	ucnv_fromUWriteBytes(cnv,
				1154	bytes, 4, (char *)target, (char )targetLimit,
				1155	offsets, sourceIndex, pErrorCode);
				1156	return 0;
				1157	}
				1158	}
				1159	}
				1160
				1161	/* no mapping */
				1162	*pErrorCode=U_INVALID_CHAR_FOUND;
				1163	return cp;
				1164	}
				1165
				1166	/*
				1167	* Input sequence: cnv->toUBytes[0..length[
				1168	* @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
				1169	* else return 0 after output has been written to the target
				1170	*/
				1171	static int8_t
				1172	_extToU(UConverter cnv, const UConverterSharedData sharedData,
				1173	int8_t length,
				1174	const uint8_t *source, const uint8_t sourceLimit,
				1175	UChar *target, const UChar targetLimit,
				1176	int32_t **offsets, int32_t sourceIndex,
				1177	UBool flush,
				1178	UErrorCode *pErrorCode) {
				1179	const int32_t *cx;
				1180
				1181	if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
				1182	ucnv_extInitialMatchToU(
				1183	cnv, cx,
				1184	length, (const char *)source, (const char )sourceLimit,
				1185	target, targetLimit,
				1186	offsets, sourceIndex,
				1187	flush,
				1188	pErrorCode)
				1189	) {
				1190	return 0; /* an extension mapping handled the input */
				1191	}
				1192
				1193	/* GB 18030 */
				1194	if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
				1195	const uint32_t *range;
				1196	uint32_t linear;
				1197	int32_t i;
				1198
				1199	linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]);
				1200	range=gb18030Ranges[0];
				1201	for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) {
				1202	if(range[2]<=linear && linear<=range[3]) {
				1203	/* found the sequence, output the Unicode code point for it */
				1204	*pErrorCode=U_ZERO_ERROR;
				1205
				1206	/* add the linear difference between the input and start sequences to the start code point */
				1207	linear=range[0]+(linear-range[2]);
				1208
				1209	/* output this code point */
				1210	ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode);
				1211
				1212	return 0;
				1213	}
				1214	}
				1215	}
				1216
				1217	/* no mapping */
				1218	*pErrorCode=U_INVALID_CHAR_FOUND;
				1219	return length;
				1220	}
				1221
				1222	/* EBCDIC swap LF<->NL ------------------------------------------------------ */
				1223
				/*
				 * This code modifies a standard EBCDIC<->Unicode mapping table for
				 * OS/390 (z/OS) Unix System Services (Open Edition).
				 * The difference is in the mapping of Line Feed and New Line control codes:
				 * Standard EBCDIC maps
				 *
				 *   <U000A> \x25 |0
				 *   <U0085> \x15 |0
				 *
				 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
				 * mapping
				 *
				 *   <U000A> \x15 |0
				 *   <U0085> \x25 |0
				 *
				 * This code modifies a loaded standard EBCDIC<->Unicode mapping table
				 * by copying it into allocated memory and swapping the LF and NL values.
				 * This makes it possible to support the same EBCDIC charset in both versions
				 * without duplicating the entire installed table.
				 */
				1244
				/*
				 * Line Feed: Unicode code point, standard EBCDIC code, and the
				 * standard EBCDIC code with roundtrip flag as stored in
				 * Unicode-to-single-byte tables.
				 */
				#define U_LF 0x0a
				#define EBCDIC_LF 0x25
				#define EBCDIC_RT_LF 0xf25

				/* New Line: the same three values as for Line Feed above. */
				#define U_NL 0x85
				#define EBCDIC_NL 0x15
				#define EBCDIC_RT_NL 0xf15
				1256
				1257	static UBool
				1258	_EBCDICSwapLFNL(UConverterSharedData sharedData, UErrorCode pErrorCode) {
				1259	UConverterMBCSTable *mbcsTable;
				1260
				1261	const uint16_t table, results;
				1262	const uint8_t *bytes;
				1263
				1264	int32_t (*newStateTable)[256];
				1265	uint16_t *newResults;
				1266	uint8_t *p;
				1267	char *name;
				1268
				1269	uint32_t stage2Entry;
				1270	uint32_t size, sizeofFromUBytes;
				1271
				1272	mbcsTable=&sharedData->mbcs;
				1273
				1274	table=mbcsTable->fromUnicodeTable;
				1275	bytes=mbcsTable->fromUnicodeBytes;
				1276	results=(const uint16_t *)bytes;
				1277
				1278	/*
				1279	* Check that this is an EBCDIC table with SBCS portion -
				1280	* SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
				1281	*
				1282	* If not, ignore the option. Options are always ignored if they do not apply.
				1283	*/
				1284	if(!(
				1285	(mbcsTable->outputType==MBCS_OUTPUT_1 \|\| mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
				1286	mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
				1287	mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
				1288	)) {
				1289	return FALSE;
				1290	}
				1291
				1292	if(mbcsTable->outputType==MBCS_OUTPUT_1) {
				1293	if(!(
				1294	EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
				1295	EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
				1296	)) {
				1297	return FALSE;
				1298	}
				1299	} else /* MBCS_OUTPUT_2_SISO */ {
				1300	stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
				1301	if(!(
				1302	MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
				1303	EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
				1304	)) {
				1305	return FALSE;
				1306	}
				1307
				1308	stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
				1309	if(!(
				1310	MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
				1311	EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
				1312	)) {
				1313	return FALSE;
				1314	}
				1315	}
				1316
				1317	if(mbcsTable->fromUBytesLength>0) {
				1318	/*
				1319	* We _know_ the number of bytes in the fromUnicodeBytes array
				1320	* starting with header.version 4.1.
				1321	*/
				1322	sizeofFromUBytes=mbcsTable->fromUBytesLength;
				1323	} else {
				1324	/*
				1325	* Otherwise:
				1326	* There used to be code to enumerate the fromUnicode
				1327	* trie and find the highest entry, but it was removed in ICU 3.2
				1328	* because it was not tested and caused a low code coverage number.
				1329	* See Jitterbug 3674.
				1330	* This affects only some .cnv file formats with a header.version
				1331	* below 4.1, and only when swaplfnl is requested.
				1332	*
				1333	* ucnvmbcs.c revision 1.99 is the last one with the
				1334	* ucnv_MBCSSizeofFromUBytes() function.
				1335	*/
				1336	*pErrorCode=U_INVALID_FORMAT_ERROR;
				1337	return FALSE;
				1338	}
				1339
				1340	/*
				1341	* The table has an appropriate format.
				1342	* Allocate and build
				1343	* - a modified to-Unicode state table
				1344	* - a modified from-Unicode output array
				1345	* - a converter name string with the swap option appended
				1346	*/
				1347	size=
				1348	mbcsTable->countStates*1024+
				1349	sizeofFromUBytes+
				1350	UCNV_MAX_CONVERTER_NAME_LENGTH+20;
				1351	p=(uint8_t *)uprv_malloc(size);
				1352	if(p==NULL) {
				1353	*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
				1354	return FALSE;
				1355	}
				1356
				1357	/* copy and modify the to-Unicode state table */
				1358	newStateTable=(int32_t (*)[256])p;
				1359	uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);
				1360
				1361	newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
				1362	newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
				1363
				1364	/* copy and modify the from-Unicode result table */
				1365	newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
				1366	uprv_memcpy(newResults, bytes, sizeofFromUBytes);
				1367
				1368	/* conveniently, the table access macros work on the left side of expressions */
				1369	if(mbcsTable->outputType==MBCS_OUTPUT_1) {
				1370	MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
				1371	MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
				1372	} else /* MBCS_OUTPUT_2_SISO */ {
				1373	stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
				1374	MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
				1375
				1376	stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
				1377	MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
				1378	}
				1379
				1380	/* set the canonical converter name */
				1381	name=(char *)newResults+sizeofFromUBytes;
				1382	uprv_strcpy(name, sharedData->staticData->name);
				1383	uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);
				1384
				1385	/* set the pointers */
				1386	umtx_lock(NULL);
				1387	if(mbcsTable->swapLFNLStateTable==NULL) {
				1388	mbcsTable->swapLFNLStateTable=newStateTable;
				1389	mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
				1390	mbcsTable->swapLFNLName=name;
				1391
				1392	newStateTable=NULL;
				1393	}
				1394	umtx_unlock(NULL);
				1395
				1396	/* release the allocated memory if another thread beat us to it */
				1397	if(newStateTable!=NULL) {
				1398	uprv_free(newStateTable);
				1399	}
				1400	return TRUE;
				1401	}
				1402
				1403	/* reconstitute omitted fromUnicode data ------------------------------------ */
				1404
				1405	/* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */
				1406	static UBool U_CALLCONV
				1407	writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) {
				1408	UConverterMBCSTable mbcsTable=(UConverterMBCSTable )context;
				1409	const uint16_t *table;
				1410	uint32_t *stage2;
				1411	uint8_t bytes, p;
				1412	UChar32 c;
				1413	int32_t i, st3;
				1414
				1415	table=mbcsTable->fromUnicodeTable;
				1416	bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;
				1417
				1418	/* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
				1419	switch(mbcsTable->outputType) {
				1420	case MBCS_OUTPUT_3_EUC:
				1421	if(value<=0xffff) {
				1422	/* short sequences are stored directly */
				1423	/* code set 0 or 1 */
				1424	} else if(value<=0x8effff) {
				1425	/* code set 2 */
				1426	value&=0x7fff;
				1427	} else /* first byte is 0x8f */ {
				1428	/* code set 3 */
				1429	value&=0xff7f;
				1430	}
				1431	break;
				1432	case MBCS_OUTPUT_4_EUC:
				1433	if(value<=0xffffff) {
				1434	/* short sequences are stored directly */
				1435	/* code set 0 or 1 */
				1436	} else if(value<=0x8effffff) {
				1437	/* code set 2 */
				1438	value&=0x7fffff;
				1439	} else /* first byte is 0x8f */ {
				1440	/* code set 3 */
				1441	value&=0xff7fff;
				1442	}
				1443	break;
				1444	default:
				1445	break;
				1446	}
				1447
				1448	for(i=0; i<=0x1f; ++value, ++i) {
				1449	c=codePoints[i];
				1450	if(c<0) {
				1451	continue;
				1452	}
				1453
				1454	/* locate the stage 2 & 3 data */
				1455	stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);
				1456	p=bytes;
				1457	st3=(int32_t)(uint16_t)stage216+(c&0xf);
				1458
				1459	/* write the codepage bytes into stage 3 */
				1460	switch(mbcsTable->outputType) {
				1461	case MBCS_OUTPUT_3:
				1462	case MBCS_OUTPUT_4_EUC:
				1463	p+=st3*3;
				1464	p[0]=(uint8_t)(value>>16);
				1465	p[1]=(uint8_t)(value>>8);
				1466	p[2]=(uint8_t)value;
				1467	break;
				1468	case MBCS_OUTPUT_4:
				1469	((uint32_t *)p)[st3]=value;
				1470	break;
				1471	default:
				1472	/* 2 bytes per character */
				1473	((uint16_t *)p)[st3]=(uint16_t)value;
				1474	break;
				1475	}
				1476
				1477	/* set the roundtrip flag */
				1478	*stage2\|=(1UL<<(16+(c&0xf)));
				1479	}
				1480	return TRUE;
				1481	}
				1482
				1483	static void
				1484	reconstituteData(UConverterMBCSTable *mbcsTable,
				1485	uint32_t stage1Length, uint32_t stage2Length,
				1486	uint32_t fullStage2Length, /* lengths are numbers of units, not bytes */
				1487	UErrorCode *pErrorCode) {
				1488	uint16_t *stage1;
				1489	uint32_t *stage2;
				1490	uint32_t dataLength=stage1Length2+fullStage2Length4+mbcsTable->fromUBytesLength;
				1491	mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
				1492	if(mbcsTable->reconstitutedData==NULL) {
				1493	*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
				1494	return;
				1495	}
				1496	uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);
				1497
				1498	/* copy existing data and reroute the pointers */
				1499	stage1=(uint16_t *)mbcsTable->reconstitutedData;
				1500	uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);
				1501
				1502	stage2=(uint32_t *)(stage1+stage1Length);
				1503	uprv_memcpy(stage2+(fullStage2Length-stage2Length),
				1504	mbcsTable->fromUnicodeTable+stage1Length,
				1505	stage2Length*4);
				1506
				1507	mbcsTable->fromUnicodeTable=stage1;
				1508	mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length);
				1509
				1510	/* indexes into stage 2 count from the bottom of the fromUnicodeTable */
				1511	stage2=(uint32_t *)stage1;
				1512
				1513	/* reconstitute the initial part of stage 2 from the mbcsIndex */
				1514	{
				1515	int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;
				1516	int32_t stageUTF8Index=0;
				1517	int32_t st1, st2, st3, i;
				1518
				1519	for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {
				1520	st2=stage1[st1];
				1521	if(st2!=(int32_t)stage1Length/2) {
				1522	/* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
				1523	for(i=0; i<16; ++i) {
				1524	st3=mbcsTable->mbcsIndex[stageUTF8Index++];
				1525	if(st3!=0) {
				1526	/* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
				1527	st3>>=4;
				1528	/*
				1529	* 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
				1530	* allocated together as a single 64-block for access from the mbcsIndex
				1531	*/
				1532	stage2[st2++]=st3++;
				1533	stage2[st2++]=st3++;
				1534	stage2[st2++]=st3++;
				1535	stage2[st2++]=st3;
				1536	} else {
				1537	/* no stage 3 block, skip */
				1538	st2+=4;
				1539	}
				1540	}
				1541	} else {
				1542	/* no stage 2 block, skip */
				1543	stageUTF8Index+=16;
				1544	}
				1545	}
				1546	}
				1547
				1548	/* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
				1549	ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
				1550	}
				1551
				1552	/* MBCS setup functions ----------------------------------------------------- */
				1553
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	1554	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	1555	ucnv_MBCSLoad(UConverterSharedData *sharedData,
				1556	UConverterLoadArgs *pArgs,
				1557	const uint8_t *raw,
				1558	UErrorCode *pErrorCode) {
				1559	UDataInfo info;
				1560	UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
				1561	_MBCSHeader header=(_MBCSHeader )raw;
				1562	uint32_t offset;
				1563	uint32_t headerLength;
				1564	UBool noFromU=FALSE;
				1565
				1566	if(header->version[0]==4) {
				1567	headerLength=MBCS_HEADER_V4_LENGTH;
				1568	} else if(header->version[0]==5 && header->version[1]>=3 &&
				1569	(header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {
				1570	headerLength=header->options&MBCS_OPT_LENGTH_MASK;
				1571	noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);
				1572	} else {
				1573	*pErrorCode=U_INVALID_TABLE_FORMAT;
				1574	return;
				1575	}
				1576
				1577	mbcsTable->outputType=(uint8_t)header->flags;
				1578	if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
				1579	*pErrorCode=U_INVALID_TABLE_FORMAT;
				1580	return;
				1581	}
				1582
				1583	/* extension data, header version 4.2 and higher */
				1584	offset=header->flags>>8;
				1585	if(offset!=0) {
				1586	mbcsTable->extIndexes=(const int32_t *)(raw+offset);
				1587	}
				1588
				1589	if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
				1590	UConverterLoadArgs args=UCNV_LOAD_ARGS_INITIALIZER;
				1591	UConverterSharedData *baseSharedData;
				1592	const int32_t *extIndexes;
				1593	const char *baseName;
				1594
				1595	/* extension-only file, load the base table and set values appropriately */
				1596	if((extIndexes=mbcsTable->extIndexes)==NULL) {
				1597	/* extension-only file without extension */
				1598	*pErrorCode=U_INVALID_TABLE_FORMAT;
				1599	return;
				1600	}
				1601
				1602	if(pArgs->nestedLoads!=1) {
				1603	/* an extension table must not be loaded as a base table */
				1604	*pErrorCode=U_INVALID_TABLE_FILE;
				1605	return;
				1606	}
				1607
				1608	/* load the base table */
				1609	baseName=(const char )header+headerLength4;
				1610	if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
				1611	/* forbid loading this same extension-only file */
				1612	*pErrorCode=U_INVALID_TABLE_FORMAT;
				1613	return;
				1614	}
				1615
				1616	/* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
				1617	args.size=sizeof(UConverterLoadArgs);
				1618	args.nestedLoads=2;
				1619	args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable;
				1620	args.reserved=pArgs->reserved;
				1621	args.options=pArgs->options;
				1622	args.pkg=pArgs->pkg;
				1623	args.name=baseName;
				1624	baseSharedData=ucnv_load(&args, pErrorCode);
				1625	if(U_FAILURE(*pErrorCode)) {
				1626	return;
				1627	}
				1628	if( baseSharedData->staticData->conversionType!=UCNV_MBCS \|\|
				1629	baseSharedData->mbcs.baseSharedData!=NULL
				1630	) {
				1631	ucnv_unload(baseSharedData);
				1632	*pErrorCode=U_INVALID_TABLE_FORMAT;
				1633	return;
				1634	}
				1635	if(pArgs->onlyTestIsLoadable) {
				1636	/*
				1637	* Exit as soon as we know that we can load the converter
				1638	* and the format is valid and supported.
				1639	* The worst that can happen in the following code is a memory
				1640	* allocation error.
				1641	*/
				1642	ucnv_unload(baseSharedData);
				1643	return;
				1644	}
				1645
				1646	/* copy the base table data */
				1647	uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));
				1648
				1649	/* overwrite values with relevant ones for the extension converter */
				1650	mbcsTable->baseSharedData=baseSharedData;
				1651	mbcsTable->extIndexes=extIndexes;
				1652
				1653	/*
				1654	* It would be possible to share the swapLFNL data with a base converter,
				1655	* but the generated name would have to be different, and the memory
				1656	* would have to be free'd only once.
				1657	* It is easier to just create the data for the extension converter
				1658	* separately when it is requested.
				1659	*/
				1660	mbcsTable->swapLFNLStateTable=NULL;
				1661	mbcsTable->swapLFNLFromUnicodeBytes=NULL;
				1662	mbcsTable->swapLFNLName=NULL;
				1663
				1664	/*
				1665	* The reconstitutedData must be deleted only when the base converter
				1666	* is unloaded.
				1667	*/
				1668	mbcsTable->reconstitutedData=NULL;
				1669
				1670	/*
				1671	* Set a special, runtime-only outputType if the extension converter
				1672	* is a DBCS version of a base converter that also maps single bytes.
				1673	*/
				1674	if( sharedData->staticData->conversionType==UCNV_DBCS \|\|
				1675	(sharedData->staticData->conversionType==UCNV_MBCS &&
				1676	sharedData->staticData->minBytesPerChar>=2)
				1677	) {
				1678	if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
				1679	/* the base converter is SI/SO-stateful */
				1680	int32_t entry;
				1681
				1682	/* get the dbcs state from the state table entry for SO=0x0e */
				1683	entry=mbcsTable->stateTable[0][0xe];
				1684	if( MBCS_ENTRY_IS_FINAL(entry) &&
				1685	MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
				1686	MBCS_ENTRY_FINAL_STATE(entry)!=0
				1687	) {
				1688	mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
				1689
				1690	mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
				1691	}
				1692	} else if(
				1693	baseSharedData->staticData->conversionType==UCNV_MBCS &&
				1694	baseSharedData->staticData->minBytesPerChar==1 &&
				1695	baseSharedData->staticData->maxBytesPerChar==2 &&
				1696	mbcsTable->countStates<=127
				1697	) {
				1698	/* non-stateful base converter, need to modify the state table */
				1699	int32_t (*newStateTable)[256];
				1700	int32_t *state;
				1701	int32_t i, count;
				1702
				1703	/* allocate a new state table and copy the base state table contents */
				1704	count=mbcsTable->countStates;
				1705	newStateTable=(int32_t ()[256])uprv_malloc((count+1)1024);
				1706	if(newStateTable==NULL) {
				1707	ucnv_unload(baseSharedData);
				1708	*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
				1709	return;
				1710	}
				1711
				1712	uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);
				1713
				1714	/* change all final single-byte entries to go to a new all-illegal state */
				1715	state=newStateTable[0];
				1716	for(i=0; i<256; ++i) {
				1717	if(MBCS_ENTRY_IS_FINAL(state[i])) {
				1718	state[i]=MBCS_ENTRY_TRANSITION(count, 0);
				1719	}
				1720	}
				1721
				1722	/* build the new all-illegal state */
				1723	state=newStateTable[count];
				1724	for(i=0; i<256; ++i) {
				1725	state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
				1726	}
				1727	mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;
				1728	mbcsTable->countStates=(uint8_t)(count+1);
				1729	mbcsTable->stateTableOwned=TRUE;
				1730
				1731	mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
				1732	}
				1733	}
				1734
				1735	/*
				1736	* unlike below for files with base tables, do not get the unicodeMask
				1737	* from the sharedData; instead, use the base table's unicodeMask,
				1738	* which we copied in the memcpy above;
				1739	* this is necessary because the static data unicodeMask, especially
				1740	* the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
				1741	*/
				1742	} else {
				1743	/* conversion file with a base table; an additional extension table is optional */
				1744	/* make sure that the output type is known */
				1745	switch(mbcsTable->outputType) {
				1746	case MBCS_OUTPUT_1:
				1747	case MBCS_OUTPUT_2:
				1748	case MBCS_OUTPUT_3:
				1749	case MBCS_OUTPUT_4:
				1750	case MBCS_OUTPUT_3_EUC:
				1751	case MBCS_OUTPUT_4_EUC:
				1752	case MBCS_OUTPUT_2_SISO:
				1753	/* OK */
				1754	break;
				1755	default:
				1756	*pErrorCode=U_INVALID_TABLE_FORMAT;
				1757	return;
				1758	}
				1759	if(pArgs->onlyTestIsLoadable) {
				1760	/*
				1761	* Exit as soon as we know that we can load the converter
				1762	* and the format is valid and supported.
				1763	* The worst that can happen in the following code is a memory
				1764	* allocation error.
				1765	*/
				1766	return;
				1767	}
				1768
				1769	mbcsTable->countStates=(uint8_t)header->countStates;
				1770	mbcsTable->countToUFallbacks=header->countToUFallbacks;
				1771	mbcsTable->stateTable=(const int32_t ()[256])(raw+headerLength4);
				1772	mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
				1773	mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
				1774
				1775	mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
				1776	mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
				1777	mbcsTable->fromUBytesLength=header->fromUBytesLength;
				1778
				1779	/*
				1780	* converter versions 6.1 and up contain a unicodeMask that is
				1781	* used here to select the most efficient function implementations
				1782	*/
				1783	info.size=sizeof(UDataInfo);
				1784	udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
				1785	if(info.formatVersion[0]>6 \|\| (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
				1786	/* mask off possible future extensions to be safe */
				1787	mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
				1788	} else {
				1789	/* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
				1790	mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY\|UCNV_HAS_SURROGATES;
				1791	}
				1792
				1793	/*
				1794	* _MBCSHeader.version 4.3 adds utf8Friendly data structures.
				1795	* Check for the header version, SBCS vs. MBCS, and for whether the
				1796	* data structures are optimized for code points as high as what the
				1797	* runtime code is designed for.
				1798	* The implementation does not handle mapping tables with entries for
				1799	* unpaired surrogates.
				1800	*/
				1801	if( header->version[1]>=3 &&
				1802	(mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&
				1803	(mbcsTable->countStates==1 ?
				1804	(header->version[2]>=(SBCS_FAST_MAX>>8)) :
				1805	(header->version[2]>=(MBCS_FAST_MAX>>8))
				1806	)
				1807	) {
				1808	mbcsTable->utf8Friendly=TRUE;
				1809
				1810	if(mbcsTable->countStates==1) {
				1811	/*
				1812	* SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
				1813	* Build a table with indexes to each block, to be used instead of
				1814	* the regular stage 1/2 table.
				1815	*/
				1816	int32_t i;
				1817	for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {
				1818	mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
				1819	}
				1820	/* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
				1821	mbcsTable->maxFastUChar=SBCS_FAST_MAX;
				1822	} else {
				1823	/*
				1824	* MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
				1825	* The .cnv file is prebuilt with an additional stage table with indexes
				1826	* to each block.
				1827	*/
				1828	mbcsTable->mbcsIndex=(const uint16_t *)
				1829	(mbcsTable->fromUnicodeBytes+
				1830	(noFromU ? 0 : mbcsTable->fromUBytesLength));
				1831	mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)\|0xff;
				1832	}
				1833	}
				1834
				1835	/* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
				1836	{
				1837	uint32_t asciiRoundtrips=0xffffffff;
				1838	int32_t i;
				1839
				1840	for(i=0; i<0x80; ++i) {
				1841	if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
				1842	asciiRoundtrips&=~((uint32_t)1<<(i>>2));
				1843	}
				1844	}
				1845	mbcsTable->asciiRoundtrips=asciiRoundtrips;
				1846	}
				1847
				1848	if(noFromU) {
				1849	uint32_t stage1Length=
				1850	mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
				1851	0x440 : 0x40;
				1852	uint32_t stage2Length=
				1853	(header->offsetFromUBytes-header->offsetFromUTable)/4-
				1854	stage1Length/2;
				1855	reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
				1856	}
				1857	}
				1858
				1859	/* Set the impl pointer here so that it is set for both extension-only and base tables. */
				1860	if(mbcsTable->utf8Friendly) {
				1861	if(mbcsTable->countStates==1) {
				1862	sharedData->impl=&_SBCSUTF8Impl;
				1863	} else {
				1864	if(mbcsTable->outputType==MBCS_OUTPUT_2) {
				1865	sharedData->impl=&_DBCSUTF8Impl;
				1866	}
				1867	}
				1868	}
				1869
				1870	if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY \|\| mbcsTable->outputType==MBCS_OUTPUT_2_SISO) {
				1871	/*
				1872	* MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
				1873	* MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
				1874	*/
				1875	mbcsTable->asciiRoundtrips=0;
				1876	}
				1877	}
				1878
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	1879	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	1880	ucnv_MBCSUnload(UConverterSharedData *sharedData) {
				1881	UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
				1882
				1883	if(mbcsTable->swapLFNLStateTable!=NULL) {
				1884	uprv_free(mbcsTable->swapLFNLStateTable);
				1885	}
				1886	if(mbcsTable->stateTableOwned) {
				1887	uprv_free((void *)mbcsTable->stateTable);
				1888	}
				1889	if(mbcsTable->baseSharedData!=NULL) {
				1890	ucnv_unload(mbcsTable->baseSharedData);
				1891	}
				1892	if(mbcsTable->reconstitutedData!=NULL) {
				1893	uprv_free(mbcsTable->reconstitutedData);
				1894	}
				1895	}
				1896
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	1897	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	1898	ucnv_MBCSOpen(UConverter *cnv,
				1899	UConverterLoadArgs *pArgs,
				1900	UErrorCode *pErrorCode) {
				1901	UConverterMBCSTable *mbcsTable;
				1902	const int32_t *extIndexes;
				1903	uint8_t outputType;
				1904	int8_t maxBytesPerUChar;
				1905
				1906	if(pArgs->onlyTestIsLoadable) {
				1907	return;
				1908	}
				1909
				1910	mbcsTable=&cnv->sharedData->mbcs;
				1911	outputType=mbcsTable->outputType;
				1912
				1913	if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
				1914	/* the swaplfnl option does not apply, remove it */
				1915	cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
				1916	}
				1917
				1918	if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) {
				1919	/* do this because double-checked locking is broken */
				1920	UBool isCached;
				1921
				1922	umtx_lock(NULL);
				1923	isCached=mbcsTable->swapLFNLStateTable!=NULL;
				1924	umtx_unlock(NULL);
				1925
				1926	if(!isCached) {
				1927	if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
				1928	if(U_FAILURE(*pErrorCode)) {
				1929	return; /* something went wrong */
				1930	}
				1931
				1932	/* the option does not apply, remove it */
				1933	cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
				1934	}
				1935	}
				1936	}
				1937
				1938	if(uprv_strstr(pArgs->name, "18030")!=NULL) {
				1939	if(uprv_strstr(pArgs->name, "gb18030")!=NULL \|\| uprv_strstr(pArgs->name, "GB18030")!=NULL) {
				1940	/* set a flag for GB 18030 mode, which changes the callback behavior */
				1941	cnv->options\|=_MBCS_OPTION_GB18030;
				1942	}
				1943	} else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) \|\| (uprv_strstr(pArgs->name, "keis")!=NULL)) {
				1944	/* set a flag for KEIS converter, which changes the SI/SO character sequence */
				1945	cnv->options\|=_MBCS_OPTION_KEIS;
				1946	} else if((uprv_strstr(pArgs->name, "JEF")!=NULL) \|\| (uprv_strstr(pArgs->name, "jef")!=NULL)) {
				1947	/* set a flag for JEF converter, which changes the SI/SO character sequence */
				1948	cnv->options\|=_MBCS_OPTION_JEF;
				1949	} else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) \|\| (uprv_strstr(pArgs->name, "jips")!=NULL)) {
				1950	/* set a flag for JIPS converter, which changes the SI/SO character sequence */
				1951	cnv->options\|=_MBCS_OPTION_JIPS;
				1952	}
				1953
				1954	/* fix maxBytesPerUChar depending on outputType and options etc. */
				1955	if(outputType==MBCS_OUTPUT_2_SISO) {
				1956	cnv->maxBytesPerUChar=3; /* SO+DBCS */
				1957	}
				1958
				1959	extIndexes=mbcsTable->extIndexes;
				1960	if(extIndexes!=NULL) {
				1961	maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);
				1962	if(outputType==MBCS_OUTPUT_2_SISO) {
				1963	++maxBytesPerUChar; /* SO + multiple DBCS */
				1964	}
				1965
				1966	if(maxBytesPerUChar>cnv->maxBytesPerUChar) {
				1967	cnv->maxBytesPerUChar=maxBytesPerUChar;
				1968	}
				1969	}
				1970
				1971	#if 0
				1972	/*
				1973	* documentation of UConverter fields used for status
				1974	* all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
				1975	*/
				1976
				1977	/* toUnicode */
				1978	cnv->toUnicodeStatus=0; /* offset */
				1979	cnv->mode=0; /* state */
				1980	cnv->toULength=0; /* byteIndex */
				1981
				1982	/* fromUnicode */
				1983	cnv->fromUChar32=0;
				1984	cnv->fromUnicodeStatus=1; /* prevLength */
				1985	#endif
				1986	}
				1987
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	1988	U_CDECL_BEGIN
				1989
				1990	static const char* U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	1991	ucnv_MBCSGetName(const UConverter *cnv) {
				1992	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) {
				1993	return cnv->sharedData->mbcs.swapLFNLName;
				1994	} else {
				1995	return cnv->sharedData->staticData->name;
				1996	}
				1997	}
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	1998	U_CDECL_END
				1999
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	2000
				2001	/* MBCS-to-Unicode conversion functions ------------------------------------- */
				2002
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2003	static UChar32 U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	2004	ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
				2005	const _MBCSToUFallback *toUFallbacks;
				2006	uint32_t i, start, limit;
				2007
				2008	limit=mbcsTable->countToUFallbacks;
				2009	if(limit>0) {
				2010	/* do a binary search for the fallback mapping */
				2011	toUFallbacks=mbcsTable->toUFallbacks;
				2012	start=0;
				2013	while(start<limit-1) {
				2014	i=(start+limit)/2;
				2015	if(offset<toUFallbacks[i].offset) {
				2016	limit=i;
				2017	} else {
				2018	start=i;
				2019	}
				2020	}
				2021
				2022	/* did we really find it? */
				2023	if(offset==toUFallbacks[start].offset) {
				2024	return toUFallbacks[start].codePoint;
				2025	}
				2026	}
				2027
				2028	return 0xfffe;
				2029	}
				2030
				2031	/* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
				2032	static void
				2033	ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
				2034	UErrorCode *pErrorCode) {
				2035	UConverter *cnv;
				2036	const uint8_t source, sourceLimit;
				2037	UChar *target;
				2038	const UChar *targetLimit;
				2039	int32_t *offsets;
				2040
				2041	const int32_t (*stateTable)[256];
				2042
				2043	int32_t sourceIndex;
				2044
				2045	int32_t entry;
				2046	UChar c;
				2047	uint8_t action;
				2048
				2049	/* set up the local pointers */
				2050	cnv=pArgs->converter;
				2051	source=(const uint8_t *)pArgs->source;
				2052	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
				2053	target=pArgs->target;
				2054	targetLimit=pArgs->targetLimit;
				2055	offsets=pArgs->offsets;
				2056
				2057	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
				2058	stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
				2059	} else {
				2060	stateTable=cnv->sharedData->mbcs.stateTable;
				2061	}
				2062
				2063	/* sourceIndex=-1 if the current character began in the previous buffer */
				2064	sourceIndex=0;
				2065
				2066	/* conversion loop */
				2067	while(source<sourceLimit) {
				2068	/*
				2069	* This following test is to see if available input would overflow the output.
				2070	* It does not catch output of more than one code unit that
				2071	* overflows as a result of a surrogate pair or callback output
				2072	* from the last source byte.
				2073	* Therefore, those situations also test for overflows and will
				2074	* then break the loop, too.
				2075	*/
				2076	if(target>=targetLimit) {
				2077	/* target is full */
				2078	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				2079	break;
				2080	}
				2081
				2082	entry=stateTable[0][*source++];
				2083	/* MBCS_ENTRY_IS_FINAL(entry) */
				2084
				2085	/* test the most common case first */
				2086	if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
				2087	/* output BMP code point */
				2088	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2089	if(offsets!=NULL) {
				2090	*offsets++=sourceIndex;
				2091	}
				2092
				2093	/* normal end of action codes: prepare for a new character */
				2094	++sourceIndex;
				2095	continue;
				2096	}
				2097
				2098	/*
				2099	* An if-else-if chain provides more reliable performance for
				2100	* the most common cases compared to a switch.
				2101	*/
				2102	action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
				2103	if(action==MBCS_STATE_VALID_DIRECT_20 \|\|
				2104	(action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
				2105	) {
				2106	entry=MBCS_ENTRY_FINAL_VALUE(entry);
				2107	/* output surrogate pair */
				2108	*target++=(UChar)(0xd800\|(UChar)(entry>>10));
				2109	if(offsets!=NULL) {
				2110	*offsets++=sourceIndex;
				2111	}
				2112	c=(UChar)(0xdc00\|(UChar)(entry&0x3ff));
				2113	if(target<targetLimit) {
				2114	*target++=c;
				2115	if(offsets!=NULL) {
				2116	*offsets++=sourceIndex;
				2117	}
				2118	} else {
				2119	/* target overflow */
				2120	cnv->UCharErrorBuffer[0]=c;
				2121	cnv->UCharErrorBufferLength=1;
				2122	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				2123	break;
				2124	}
				2125
				2126	++sourceIndex;
				2127	continue;
				2128	} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
				2129	if(UCNV_TO_U_USE_FALLBACK(cnv)) {
				2130	/* output BMP code point */
				2131	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2132	if(offsets!=NULL) {
				2133	*offsets++=sourceIndex;
				2134	}
				2135
				2136	++sourceIndex;
				2137	continue;
				2138	}
				2139	} else if(action==MBCS_STATE_UNASSIGNED) {
				2140	/* just fall through */
				2141	} else if(action==MBCS_STATE_ILLEGAL) {
				2142	/* callback(illegal) */
				2143	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				2144	} else {
				2145	/* reserved, must never occur */
				2146	++sourceIndex;
				2147	continue;
				2148	}
				2149
				2150	if(U_FAILURE(*pErrorCode)) {
				2151	/* callback(illegal) */
				2152	break;
				2153	} else /* unassigned sequences indicated with byteIndex>0 */ {
				2154	/* try an extension mapping */
				2155	pArgs->source=(const char *)source;
				2156	cnv->toUBytes[0]=*(source-1);
				2157	cnv->toULength=_extToU(cnv, cnv->sharedData,
				2158	1, &source, sourceLimit,
				2159	&target, targetLimit,
				2160	&offsets, sourceIndex,
				2161	pArgs->flush,
				2162	pErrorCode);
				2163	sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
				2164
				2165	if(U_FAILURE(*pErrorCode)) {
				2166	/* not mappable or buffer overflow */
				2167	break;
				2168	}
				2169	}
				2170	}
				2171
				2172	/* write back the updated pointers */
				2173	pArgs->source=(const char *)source;
				2174	pArgs->target=target;
				2175	pArgs->offsets=offsets;
				2176	}
				2177
				2178	/*
				2179	* This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
				2180	* that only map to and from the BMP.
				2181	* In addition to single-byte optimizations, the offset calculations
				2182	* become much easier.
				2183	*/
				2184	static void
				2185	ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
				2186	UErrorCode *pErrorCode) {
				2187	UConverter *cnv;
				2188	const uint8_t source, sourceLimit, *lastSource;
				2189	UChar *target;
				2190	int32_t targetCapacity, length;
				2191	int32_t *offsets;
				2192
				2193	const int32_t (*stateTable)[256];
				2194
				2195	int32_t sourceIndex;
				2196
				2197	int32_t entry;
				2198	uint8_t action;
				2199
				2200	/* set up the local pointers */
				2201	cnv=pArgs->converter;
				2202	source=(const uint8_t *)pArgs->source;
				2203	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
				2204	target=pArgs->target;
				2205	targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
				2206	offsets=pArgs->offsets;
				2207
				2208	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
				2209	stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
				2210	} else {
				2211	stateTable=cnv->sharedData->mbcs.stateTable;
				2212	}
				2213
				2214	/* sourceIndex=-1 if the current character began in the previous buffer */
				2215	sourceIndex=0;
				2216	lastSource=source;
				2217
				2218	/*
				2219	* since the conversion here is 1:1 UChar:uint8_t, we need only one counter
				2220	* for the minimum of the sourceLength and targetCapacity
				2221	*/
				2222	length=(int32_t)(sourceLimit-source);
				2223	if(length<targetCapacity) {
				2224	targetCapacity=length;
				2225	}
				2226
				2227	#if MBCS_UNROLL_SINGLE_TO_BMP
				2228	/* unrolling makes it faster on Pentium III/Windows 2000 */
				2229	/* unroll the loop with the most common case */
				2230	unrolled:
				2231	if(targetCapacity>=16) {
				2232	int32_t count, loops, oredEntries;
				2233
				2234	loops=count=targetCapacity>>4;
				2235	do {
				2236	oredEntries=entry=stateTable[0][*source++];
				2237	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2238	oredEntries\|=entry=stateTable[0][*source++];
				2239	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2240	oredEntries\|=entry=stateTable[0][*source++];
				2241	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2242	oredEntries\|=entry=stateTable[0][*source++];
				2243	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2244	oredEntries\|=entry=stateTable[0][*source++];
				2245	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2246	oredEntries\|=entry=stateTable[0][*source++];
				2247	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2248	oredEntries\|=entry=stateTable[0][*source++];
				2249	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2250	oredEntries\|=entry=stateTable[0][*source++];
				2251	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2252	oredEntries\|=entry=stateTable[0][*source++];
				2253	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2254	oredEntries\|=entry=stateTable[0][*source++];
				2255	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2256	oredEntries\|=entry=stateTable[0][*source++];
				2257	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2258	oredEntries\|=entry=stateTable[0][*source++];
				2259	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2260	oredEntries\|=entry=stateTable[0][*source++];
				2261	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2262	oredEntries\|=entry=stateTable[0][*source++];
				2263	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2264	oredEntries\|=entry=stateTable[0][*source++];
				2265	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2266	oredEntries\|=entry=stateTable[0][*source++];
				2267	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2268
				2269	/* were all 16 entries really valid? */
				2270	if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
				2271	/* no, return to the first of these 16 */
				2272	source-=16;
				2273	target-=16;
				2274	break;
				2275	}
				2276	} while(--count>0);
				2277	count=loops-count;
				2278	targetCapacity-=16*count;
				2279
				2280	if(offsets!=NULL) {
				2281	lastSource+=16*count;
				2282	while(count>0) {
				2283	*offsets++=sourceIndex++;
				2284	*offsets++=sourceIndex++;
				2285	*offsets++=sourceIndex++;
				2286	*offsets++=sourceIndex++;
				2287	*offsets++=sourceIndex++;
				2288	*offsets++=sourceIndex++;
				2289	*offsets++=sourceIndex++;
				2290	*offsets++=sourceIndex++;
				2291	*offsets++=sourceIndex++;
				2292	*offsets++=sourceIndex++;
				2293	*offsets++=sourceIndex++;
				2294	*offsets++=sourceIndex++;
				2295	*offsets++=sourceIndex++;
				2296	*offsets++=sourceIndex++;
				2297	*offsets++=sourceIndex++;
				2298	*offsets++=sourceIndex++;
				2299	--count;
				2300	}
				2301	}
				2302	}
				2303	#endif
				2304
				2305	/* conversion loop */
				2306	while(targetCapacity > 0 && source < sourceLimit) {
				2307	entry=stateTable[0][*source++];
				2308	/* MBCS_ENTRY_IS_FINAL(entry) */
				2309
				2310	/* test the most common case first */
				2311	if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
				2312	/* output BMP code point */
				2313	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2314	--targetCapacity;
				2315	continue;
				2316	}
				2317
				2318	/*
				2319	* An if-else-if chain provides more reliable performance for
				2320	* the most common cases compared to a switch.
				2321	*/
				2322	action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
				2323	if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
				2324	if(UCNV_TO_U_USE_FALLBACK(cnv)) {
				2325	/* output BMP code point */
				2326	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2327	--targetCapacity;
				2328	continue;
				2329	}
				2330	} else if(action==MBCS_STATE_UNASSIGNED) {
				2331	/* just fall through */
				2332	} else if(action==MBCS_STATE_ILLEGAL) {
				2333	/* callback(illegal) */
				2334	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				2335	} else {
				2336	/* reserved, must never occur */
				2337	continue;
				2338	}
				2339
				2340	/* set offsets since the start or the last extension */
				2341	if(offsets!=NULL) {
				2342	int32_t count=(int32_t)(source-lastSource);
				2343
				2344	/* predecrement: do not set the offset for the callback-causing character */
				2345	while(--count>0) {
				2346	*offsets++=sourceIndex++;
				2347	}
				2348	/* offset and sourceIndex are now set for the current character */
				2349	}
				2350
				2351	if(U_FAILURE(*pErrorCode)) {
				2352	/* callback(illegal) */
				2353	break;
				2354	} else /* unassigned sequences indicated with byteIndex>0 */ {
				2355	/* try an extension mapping */
				2356	lastSource=source;
				2357	cnv->toUBytes[0]=*(source-1);
				2358	cnv->toULength=_extToU(cnv, cnv->sharedData,
				2359	1, &source, sourceLimit,
				2360	&target, pArgs->targetLimit,
				2361	&offsets, sourceIndex,
				2362	pArgs->flush,
				2363	pErrorCode);
				2364	sourceIndex+=1+(int32_t)(source-lastSource);
				2365
				2366	if(U_FAILURE(*pErrorCode)) {
				2367	/* not mappable or buffer overflow */
				2368	break;
				2369	}
				2370
				2371	/* recalculate the targetCapacity after an extension mapping */
				2372	targetCapacity=(int32_t)(pArgs->targetLimit-target);
				2373	length=(int32_t)(sourceLimit-source);
				2374	if(length<targetCapacity) {
				2375	targetCapacity=length;
				2376	}
				2377	}
				2378
				2379	#if MBCS_UNROLL_SINGLE_TO_BMP
				2380	/* unrolling makes it faster on Pentium III/Windows 2000 */
				2381	goto unrolled;
				2382	#endif
				2383	}
				2384
				2385	if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
				2386	/* target is full */
				2387	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				2388	}
				2389
				2390	/* set offsets since the start or the last callback */
				2391	if(offsets!=NULL) {
				2392	size_t count=source-lastSource;
				2393	while(count>0) {
				2394	*offsets++=sourceIndex++;
				2395	--count;
				2396	}
				2397	}
				2398
				2399	/* write back the updated pointers */
				2400	pArgs->source=(const char *)source;
				2401	pArgs->target=target;
				2402	pArgs->offsets=offsets;
				2403	}
				2404
				2405	static UBool
				2406	hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
				2407	const int32_t *row=stateTable[state];
				2408	int32_t b, entry;
				2409	/* First test for final entries in this state for some commonly valid byte values. */
				2410	entry=row[0xa1];
				2411	if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
				2412	MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
				2413	) {
				2414	return TRUE;
				2415	}
				2416	entry=row[0x41];
				2417	if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
				2418	MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
				2419	) {
				2420	return TRUE;
				2421	}
				2422	/* Then test for final entries in this state. */
				2423	for(b=0; b<=0xff; ++b) {
				2424	entry=row[b];
				2425	if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
				2426	MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
				2427	) {
				2428	return TRUE;
				2429	}
				2430	}
				2431	/* Then recurse for transition entries. */
				2432	for(b=0; b<=0xff; ++b) {
				2433	entry=row[b];
				2434	if( MBCS_ENTRY_IS_TRANSITION(entry) &&
				2435	hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
				2436	) {
				2437	return TRUE;
				2438	}
				2439	}
				2440	return FALSE;
				2441	}
				2442
				2443	/*
				2444	* Is byte b a single/lead byte in this state?
				2445	* Recurse for transition states, because here we don't want to say that
				2446	* b is a lead byte if all byte sequences that start with b are illegal.
				2447	*/
				2448	static UBool
				2449	isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
				2450	const int32_t *row=stateTable[state];
				2451	int32_t entry=row[b];
				2452	if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */
				2453	return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
				2454	} else {
				2455	uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
				2456	if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
				2457	return FALSE; /* SI/SO are illegal for DBCS-only conversion */
				2458	} else {
				2459	return action!=MBCS_STATE_ILLEGAL;
				2460	}
				2461	}
				2462	}
				2463
				2464	U_CFUNC void
				2465	ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
				2466	UErrorCode *pErrorCode) {
				2467	UConverter *cnv;
				2468	const uint8_t source, sourceLimit;
				2469	UChar *target;
				2470	const UChar *targetLimit;
				2471	int32_t *offsets;
				2472
				2473	const int32_t (*stateTable)[256];
				2474	const uint16_t *unicodeCodeUnits;
				2475
				2476	uint32_t offset;
				2477	uint8_t state;
				2478	int8_t byteIndex;
				2479	uint8_t *bytes;
				2480
				2481	int32_t sourceIndex, nextSourceIndex;
				2482
				2483	int32_t entry;
				2484	UChar c;
				2485	uint8_t action;
				2486
				2487	/* use optimized function if possible */
				2488	cnv=pArgs->converter;
				2489
				2490	if(cnv->preToULength>0) {
				2491	/*
				2492	* pass sourceIndex=-1 because we continue from an earlier buffer
				2493	* in the future, this may change with continuous offsets
				2494	*/
				2495	ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
				2496
				2497	if(U_FAILURE(*pErrorCode) \|\| cnv->preToULength<0) {
				2498	return;
				2499	}
				2500	}
				2501
				2502	if(cnv->sharedData->mbcs.countStates==1) {
				2503	if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
				2504	ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
				2505	} else {
				2506	ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
				2507	}
				2508	return;
				2509	}
				2510
				2511	/* set up the local pointers */
				2512	source=(const uint8_t *)pArgs->source;
				2513	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
				2514	target=pArgs->target;
				2515	targetLimit=pArgs->targetLimit;
				2516	offsets=pArgs->offsets;
				2517
				2518	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
				2519	stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
				2520	} else {
				2521	stateTable=cnv->sharedData->mbcs.stateTable;
				2522	}
				2523	unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
				2524
				2525	/* get the converter state from UConverter */
				2526	offset=cnv->toUnicodeStatus;
				2527	byteIndex=cnv->toULength;
				2528	bytes=cnv->toUBytes;
				2529
				2530	/*
				2531	* if we are in the SBCS state for a DBCS-only converter,
				2532	* then load the DBCS state from the MBCS data
				2533	* (dbcsOnlyState==0 if it is not a DBCS-only converter)
				2534	*/
				2535	if((state=(uint8_t)(cnv->mode))==0) {
				2536	state=cnv->sharedData->mbcs.dbcsOnlyState;
				2537	}
				2538
				2539	/* sourceIndex=-1 if the current character began in the previous buffer */
				2540	sourceIndex=byteIndex==0 ? 0 : -1;
				2541	nextSourceIndex=0;
				2542
				2543	/* conversion loop */
				2544	while(source<sourceLimit) {
				2545	/*
				2546	* This following test is to see if available input would overflow the output.
				2547	* It does not catch output of more than one code unit that
				2548	* overflows as a result of a surrogate pair or callback output
				2549	* from the last source byte.
				2550	* Therefore, those situations also test for overflows and will
				2551	* then break the loop, too.
				2552	*/
				2553	if(target>=targetLimit) {
				2554	/* target is full */
				2555	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				2556	break;
				2557	}
				2558
				2559	if(byteIndex==0) {
				2560	/* optimized loop for 1/2-byte input and BMP output */
				2561	if(offsets==NULL) {
				2562	do {
				2563	entry=stateTable[state][*source];
				2564	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
				2565	state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
				2566	offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
				2567
				2568	++source;
				2569	if( source<sourceLimit &&
				2570	MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
				2571	MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
				2572	(c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
				2573	) {
				2574	++source;
				2575	*target++=c;
				2576	state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
				2577	offset=0;
				2578	} else {
				2579	/* set the state and leave the optimized loop */
				2580	bytes[0]=*(source-1);
				2581	byteIndex=1;
				2582	break;
				2583	}
				2584	} else {
				2585	if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
				2586	/* output BMP code point */
				2587	++source;
				2588	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2589	state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
				2590	} else {
				2591	/* leave the optimized loop */
				2592	break;
				2593	}
				2594	}
				2595	} while(source<sourceLimit && target<targetLimit);
				2596	} else /* offsets!=NULL */ {
				2597	do {
				2598	entry=stateTable[state][*source];
				2599	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
				2600	state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
				2601	offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
				2602
				2603	++source;
				2604	if( source<sourceLimit &&
				2605	MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
				2606	MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
				2607	(c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
				2608	) {
				2609	++source;
				2610	*target++=c;
				2611	if(offsets!=NULL) {
				2612	*offsets++=sourceIndex;
				2613	sourceIndex=(nextSourceIndex+=2);
				2614	}
				2615	state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
				2616	offset=0;
				2617	} else {
				2618	/* set the state and leave the optimized loop */
				2619	++nextSourceIndex;
				2620	bytes[0]=*(source-1);
				2621	byteIndex=1;
				2622	break;
				2623	}
				2624	} else {
				2625	if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
				2626	/* output BMP code point */
				2627	++source;
				2628	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2629	if(offsets!=NULL) {
				2630	*offsets++=sourceIndex;
				2631	sourceIndex=++nextSourceIndex;
				2632	}
				2633	state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
				2634	} else {
				2635	/* leave the optimized loop */
				2636	break;
				2637	}
				2638	}
				2639	} while(source<sourceLimit && target<targetLimit);
				2640	}
				2641
				2642	/*
				2643	* these tests and break statements could be put inside the loop
				2644	* if C had "break outerLoop" like Java
				2645	*/
				2646	if(source>=sourceLimit) {
				2647	break;
				2648	}
				2649	if(target>=targetLimit) {
				2650	/* target is full */
				2651	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				2652	break;
				2653	}
				2654
				2655	++nextSourceIndex;
				2656	bytes[byteIndex++]=*source++;
				2657	} else /* byteIndex>0 */ {
				2658	++nextSourceIndex;
				2659	entry=stateTable[state][bytes[byteIndex++]=*source++];
				2660	}
				2661
				2662	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
				2663	state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
				2664	offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
				2665	continue;
				2666	}
				2667
				2668	/* save the previous state for proper extension mapping with SI/SO-stateful converters */
				2669	cnv->mode=state;
				2670
				2671	/* set the next state early so that we can reuse the entry variable */
				2672	state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
				2673
				2674	/*
				2675	* An if-else-if chain provides more reliable performance for
				2676	* the most common cases compared to a switch.
				2677	*/
				2678	action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
				2679	if(action==MBCS_STATE_VALID_16) {
				2680	offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
				2681	c=unicodeCodeUnits[offset];
				2682	if(c<0xfffe) {
				2683	/* output BMP code point */
				2684	*target++=c;
				2685	if(offsets!=NULL) {
				2686	*offsets++=sourceIndex;
				2687	}
				2688	byteIndex=0;
				2689	} else if(c==0xfffe) {
				2690	if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
				2691	/* output fallback BMP code point */
				2692	*target++=(UChar)entry;
				2693	if(offsets!=NULL) {
				2694	*offsets++=sourceIndex;
				2695	}
				2696	byteIndex=0;
				2697	}
				2698	} else {
				2699	/* callback(illegal) */
				2700	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				2701	}
				2702	} else if(action==MBCS_STATE_VALID_DIRECT_16) {
				2703	/* output BMP code point */
				2704	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2705	if(offsets!=NULL) {
				2706	*offsets++=sourceIndex;
				2707	}
				2708	byteIndex=0;
				2709	} else if(action==MBCS_STATE_VALID_16_PAIR) {
				2710	offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
				2711	c=unicodeCodeUnits[offset++];
				2712	if(c<0xd800) {
				2713	/* output BMP code point below 0xd800 */
				2714	*target++=c;
				2715	if(offsets!=NULL) {
				2716	*offsets++=sourceIndex;
				2717	}
				2718	byteIndex=0;
				2719	} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
				2720	/* output roundtrip or fallback surrogate pair */
				2721	*target++=(UChar)(c&0xdbff);
				2722	if(offsets!=NULL) {
				2723	*offsets++=sourceIndex;
				2724	}
				2725	byteIndex=0;
				2726	if(target<targetLimit) {
				2727	*target++=unicodeCodeUnits[offset];
				2728	if(offsets!=NULL) {
				2729	*offsets++=sourceIndex;
				2730	}
				2731	} else {
				2732	/* target overflow */
				2733	cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
				2734	cnv->UCharErrorBufferLength=1;
				2735	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				2736
				2737	offset=0;
				2738	break;
				2739	}
				2740	} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
				2741	/* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
				2742	*target++=unicodeCodeUnits[offset];
				2743	if(offsets!=NULL) {
				2744	*offsets++=sourceIndex;
				2745	}
				2746	byteIndex=0;
				2747	} else if(c==0xffff) {
				2748	/* callback(illegal) */
				2749	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				2750	}
				2751	} else if(action==MBCS_STATE_VALID_DIRECT_20 \|\|
				2752	(action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
				2753	) {
				2754	entry=MBCS_ENTRY_FINAL_VALUE(entry);
				2755	/* output surrogate pair */
				2756	*target++=(UChar)(0xd800\|(UChar)(entry>>10));
				2757	if(offsets!=NULL) {
				2758	*offsets++=sourceIndex;
				2759	}
				2760	byteIndex=0;
				2761	c=(UChar)(0xdc00\|(UChar)(entry&0x3ff));
				2762	if(target<targetLimit) {
				2763	*target++=c;
				2764	if(offsets!=NULL) {
				2765	*offsets++=sourceIndex;
				2766	}
				2767	} else {
				2768	/* target overflow */
				2769	cnv->UCharErrorBuffer[0]=c;
				2770	cnv->UCharErrorBufferLength=1;
				2771	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				2772
				2773	offset=0;
				2774	break;
				2775	}
				2776	} else if(action==MBCS_STATE_CHANGE_ONLY) {
				2777	/*
				2778	* This serves as a state change without any output.
				2779	* It is useful for reading simple stateful encodings,
				2780	* for example using just Shift-In/Shift-Out codes.
				2781	* The 21 unused bits may later be used for more sophisticated
				2782	* state transitions.
				2783	*/
				2784	if(cnv->sharedData->mbcs.dbcsOnlyState==0) {
				2785	byteIndex=0;
				2786	} else {
				2787	/* SI/SO are illegal for DBCS-only conversion */
				2788	state=(uint8_t)(cnv->mode); /* restore the previous state */
				2789
				2790	/* callback(illegal) */
				2791	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				2792	}
				2793	} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
				2794	if(UCNV_TO_U_USE_FALLBACK(cnv)) {
				2795	/* output BMP code point */
				2796	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2797	if(offsets!=NULL) {
				2798	*offsets++=sourceIndex;
				2799	}
				2800	byteIndex=0;
				2801	}
				2802	} else if(action==MBCS_STATE_UNASSIGNED) {
				2803	/* just fall through */
				2804	} else if(action==MBCS_STATE_ILLEGAL) {
				2805	/* callback(illegal) */
				2806	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				2807	} else {
				2808	/* reserved, must never occur */
				2809	byteIndex=0;
				2810	}
				2811
				2812	/* end of action codes: prepare for a new character */
				2813	offset=0;
				2814
				2815	if(byteIndex==0) {
				2816	sourceIndex=nextSourceIndex;
				2817	} else if(U_FAILURE(*pErrorCode)) {
				2818	/* callback(illegal) */
				2819	if(byteIndex>1) {
				2820	/*
				2821	* Ticket 5691: consistent illegal sequences:
				2822	* - We include at least the first byte in the illegal sequence.
				2823	* - If any of the non-initial bytes could be the start of a character,
				2824	* we stop the illegal sequence before the first one of those.
				2825	*/
				2826	UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
				2827	int8_t i;
				2828	for(i=1;
				2829	i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
				2830	++i) {}
				2831	if(i<byteIndex) {
				2832	/* Back out some bytes. */
				2833	int8_t backOutDistance=byteIndex-i;
				2834	int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
				2835	byteIndex=i; /* length of reported illegal byte sequence */
				2836	if(backOutDistance<=bytesFromThisBuffer) {
				2837	source-=backOutDistance;
				2838	} else {
				2839	/* Back out bytes from the previous buffer: Need to replay them. */
				2840	cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
				2841	/* preToULength is negative! */
				2842	uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
				2843	source=(const uint8_t *)pArgs->source;
				2844	}
				2845	}
				2846	}
				2847	break;
				2848	} else /* unassigned sequences indicated with byteIndex>0 */ {
				2849	/* try an extension mapping */
				2850	pArgs->source=(const char *)source;
				2851	byteIndex=_extToU(cnv, cnv->sharedData,
				2852	byteIndex, &source, sourceLimit,
				2853	&target, targetLimit,
				2854	&offsets, sourceIndex,
				2855	pArgs->flush,
				2856	pErrorCode);
				2857	sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
				2858
				2859	if(U_FAILURE(*pErrorCode)) {
				2860	/* not mappable or buffer overflow */
				2861	break;
				2862	}
				2863	}
				2864	}
				2865
				2866	/* set the converter state back into UConverter */
				2867	cnv->toUnicodeStatus=offset;
				2868	cnv->mode=state;
				2869	cnv->toULength=byteIndex;
				2870
				2871	/* write back the updated pointers */
				2872	pArgs->source=(const char *)source;
				2873	pArgs->target=target;
				2874	pArgs->offsets=offsets;
				2875	}
				2876
				2877	/*
				2878	* This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
				2879	* We still need a conversion loop in case we find reserved action codes, which are to be ignored.
				2880	*/
				2881	static UChar32
				2882	ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
				2883	UErrorCode *pErrorCode) {
				2884	UConverter *cnv;
				2885	const int32_t (*stateTable)[256];
				2886	const uint8_t source, sourceLimit;
				2887
				2888	int32_t entry;
				2889	uint8_t action;
				2890
				2891	/* set up the local pointers */
				2892	cnv=pArgs->converter;
				2893	source=(const uint8_t *)pArgs->source;
				2894	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
				2895	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
				2896	stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
				2897	} else {
				2898	stateTable=cnv->sharedData->mbcs.stateTable;
				2899	}
				2900
				2901	/* conversion loop */
				2902	while(source<sourceLimit) {
				2903	entry=stateTable[0][*source++];
				2904	/* MBCS_ENTRY_IS_FINAL(entry) */
				2905
				2906	/* write back the updated pointer early so that we can return directly */
				2907	pArgs->source=(const char *)source;
				2908
				2909	if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
				2910	/* output BMP code point */
				2911	return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2912	}
				2913
				2914	/*
				2915	* An if-else-if chain provides more reliable performance for
				2916	* the most common cases compared to a switch.
				2917	*/
				2918	action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
				2919	if( action==MBCS_STATE_VALID_DIRECT_20 \|\|
				2920	(action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
				2921	) {
				2922	/* output supplementary code point */
				2923	return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
				2924	} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
				2925	if(UCNV_TO_U_USE_FALLBACK(cnv)) {
				2926	/* output BMP code point */
				2927	return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				2928	}
				2929	} else if(action==MBCS_STATE_UNASSIGNED) {
				2930	/* just fall through */
				2931	} else if(action==MBCS_STATE_ILLEGAL) {
				2932	/* callback(illegal) */
				2933	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				2934	} else {
				2935	/* reserved, must never occur */
				2936	continue;
				2937	}
				2938
				2939	if(U_FAILURE(*pErrorCode)) {
				2940	/* callback(illegal) */
				2941	break;
				2942	} else /* unassigned sequence */ {
				2943	/* defer to the generic implementation */
				2944	pArgs->source=(const char *)source-1;
				2945	return UCNV_GET_NEXT_UCHAR_USE_TO_U;
				2946	}
				2947	}
				2948
				2949	/* no output because of empty input or only state changes */
				2950	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
				2951	return 0xffff;
				2952	}
				2953
				2954	/*
				2955	* Version of _MBCSToUnicodeWithOffsets() optimized for single-character
				2956	* conversion without offset handling.
				2957	*
				2958	* When a character does not have a mapping to Unicode, then we return to the
				2959	* generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
				2960	* handling.
				2961	* We also defer to the generic code in other complicated cases and have them
				2962	* ultimately handled by _MBCSToUnicodeWithOffsets() itself.
				2963	*
				2964	* All normal mappings and errors are handled here.
				2965	*/
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2966	static UChar32 U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	2967	ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
				2968	UErrorCode *pErrorCode) {
				2969	UConverter *cnv;
				2970	const uint8_t source, sourceLimit, *lastSource;
				2971
				2972	const int32_t (*stateTable)[256];
				2973	const uint16_t *unicodeCodeUnits;
				2974
				2975	uint32_t offset;
				2976	uint8_t state;
				2977
				2978	int32_t entry;
				2979	UChar32 c;
				2980	uint8_t action;
				2981
				2982	/* use optimized function if possible */
				2983	cnv=pArgs->converter;
				2984
				2985	if(cnv->preToULength>0) {
				2986	/* use the generic code in ucnv_getNextUChar() to continue with a partial match */
				2987	return UCNV_GET_NEXT_UCHAR_USE_TO_U;
				2988	}
				2989
				2990	if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
				2991	/*
				2992	* Using the generic ucnv_getNextUChar() code lets us deal correctly
				2993	* with the rare case of a codepage that maps single surrogates
				2994	* without adding the complexity to this already complicated function here.
				2995	*/
				2996	return UCNV_GET_NEXT_UCHAR_USE_TO_U;
				2997	} else if(cnv->sharedData->mbcs.countStates==1) {
				2998	return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);
				2999	}
				3000
				3001	/* set up the local pointers */
				3002	source=lastSource=(const uint8_t *)pArgs->source;
				3003	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
				3004
				3005	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
				3006	stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
				3007	} else {
				3008	stateTable=cnv->sharedData->mbcs.stateTable;
				3009	}
				3010	unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
				3011
				3012	/* get the converter state from UConverter */
				3013	offset=cnv->toUnicodeStatus;
				3014
				3015	/*
				3016	* if we are in the SBCS state for a DBCS-only converter,
				3017	* then load the DBCS state from the MBCS data
				3018	* (dbcsOnlyState==0 if it is not a DBCS-only converter)
				3019	*/
				3020	if((state=(uint8_t)(cnv->mode))==0) {
				3021	state=cnv->sharedData->mbcs.dbcsOnlyState;
				3022	}
				3023
				3024	/* conversion loop */
				3025	c=U_SENTINEL;
				3026	while(source<sourceLimit) {
				3027	entry=stateTable[state][*source++];
				3028	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
				3029	state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
				3030	offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
				3031
				3032	/* optimization for 1/2-byte input and BMP output */
				3033	if( source<sourceLimit &&
				3034	MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
				3035	MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
				3036	(c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
				3037	) {
				3038	++source;
				3039	state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
				3040	/* output BMP code point */
				3041	break;
				3042	}
				3043	} else {
				3044	/* save the previous state for proper extension mapping with SI/SO-stateful converters */
				3045	cnv->mode=state;
				3046
				3047	/* set the next state early so that we can reuse the entry variable */
				3048	state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
				3049
				3050	/*
				3051	* An if-else-if chain provides more reliable performance for
				3052	* the most common cases compared to a switch.
				3053	*/
				3054	action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
				3055	if(action==MBCS_STATE_VALID_DIRECT_16) {
				3056	/* output BMP code point */
				3057	c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				3058	break;
				3059	} else if(action==MBCS_STATE_VALID_16) {
				3060	offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
				3061	c=unicodeCodeUnits[offset];
				3062	if(c<0xfffe) {
				3063	/* output BMP code point */
				3064	break;
				3065	} else if(c==0xfffe) {
				3066	if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
				3067	break;
				3068	}
				3069	} else {
				3070	/* callback(illegal) */
				3071	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				3072	}
				3073	} else if(action==MBCS_STATE_VALID_16_PAIR) {
				3074	offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
				3075	c=unicodeCodeUnits[offset++];
				3076	if(c<0xd800) {
				3077	/* output BMP code point below 0xd800 */
				3078	break;
				3079	} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
				3080	/* output roundtrip or fallback supplementary code point */
				3081	c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
				3082	break;
				3083	} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
				3084	/* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
				3085	c=unicodeCodeUnits[offset];
				3086	break;
				3087	} else if(c==0xffff) {
				3088	/* callback(illegal) */
				3089	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				3090	}
				3091	} else if(action==MBCS_STATE_VALID_DIRECT_20 \|\|
				3092	(action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
				3093	) {
				3094	/* output supplementary code point */
				3095	c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
				3096	break;
				3097	} else if(action==MBCS_STATE_CHANGE_ONLY) {
				3098	/*
				3099	* This serves as a state change without any output.
				3100	* It is useful for reading simple stateful encodings,
				3101	* for example using just Shift-In/Shift-Out codes.
				3102	* The 21 unused bits may later be used for more sophisticated
				3103	* state transitions.
				3104	*/
				3105	if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {
				3106	/* SI/SO are illegal for DBCS-only conversion */
				3107	state=(uint8_t)(cnv->mode); /* restore the previous state */
				3108
				3109	/* callback(illegal) */
				3110	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				3111	}
				3112	} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
				3113	if(UCNV_TO_U_USE_FALLBACK(cnv)) {
				3114	/* output BMP code point */
				3115	c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				3116	break;
				3117	}
				3118	} else if(action==MBCS_STATE_UNASSIGNED) {
				3119	/* just fall through */
				3120	} else if(action==MBCS_STATE_ILLEGAL) {
				3121	/* callback(illegal) */
				3122	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				3123	} else {
				3124	/* reserved (must never occur), or only state change */
				3125	offset=0;
				3126	lastSource=source;
				3127	continue;
				3128	}
				3129
				3130	/* end of action codes: prepare for a new character */
				3131	offset=0;
				3132
				3133	if(U_FAILURE(*pErrorCode)) {
				3134	/* callback(illegal) */
				3135	break;
				3136	} else /* unassigned sequence */ {
				3137	/* defer to the generic implementation */
				3138	cnv->toUnicodeStatus=0;
				3139	cnv->mode=state;
				3140	pArgs->source=(const char *)lastSource;
				3141	return UCNV_GET_NEXT_UCHAR_USE_TO_U;
				3142	}
				3143	}
				3144	}
				3145
				3146	if(c<0) {
				3147	if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
				3148	/* incomplete character byte sequence */
				3149	uint8_t *bytes=cnv->toUBytes;
				3150	cnv->toULength=(int8_t)(source-lastSource);
				3151	do {
				3152	bytes++=lastSource++;
				3153	} while(lastSource<source);
				3154	*pErrorCode=U_TRUNCATED_CHAR_FOUND;
				3155	} else if(U_FAILURE(*pErrorCode)) {
				3156	/* callback(illegal) */
				3157	/*
				3158	* Ticket 5691: consistent illegal sequences:
				3159	* - We include at least the first byte in the illegal sequence.
				3160	* - If any of the non-initial bytes could be the start of a character,
				3161	* we stop the illegal sequence before the first one of those.
				3162	*/
				3163	UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
				3164	uint8_t *bytes=cnv->toUBytes;
				3165	bytes++=lastSource++; /* first byte */
				3166	if(lastSource==source) {
				3167	cnv->toULength=1;
				3168	} else /* lastSource<source: multi-byte character */ {
				3169	int8_t i;
				3170	for(i=1;
				3171	lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
				3172	++i
				3173	) {
				3174	bytes++=lastSource++;
				3175	}
				3176	cnv->toULength=i;
				3177	source=lastSource;
				3178	}
				3179	} else {
				3180	/* no output because of empty input or only state changes */
				3181	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
				3182	}
				3183	c=0xffff;
				3184	}
				3185
				3186	/* set the converter state back into UConverter, ready for a new character */
				3187	cnv->toUnicodeStatus=0;
				3188	cnv->mode=state;
				3189
				3190	/* write back the updated pointer */
				3191	pArgs->source=(const char *)source;
				3192	return c;
				3193	}
				3194
				3195	#if 0
				3196	/*
				3197	* Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
				3198	* Removal improves code coverage.
				3199	*/
				3200	/**
				3201	* This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
				3202	* It does not handle the EBCDIC swaplfnl option (set in UConverter).
				3203	* It does not handle conversion extensions (_extToU()).
				3204	*/
				3205	U_CFUNC UChar32
				3206	ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
				3207	uint8_t b, UBool useFallback) {
				3208	int32_t entry;
				3209	uint8_t action;
				3210
				3211	entry=sharedData->mbcs.stateTable[0][b];
				3212	/* MBCS_ENTRY_IS_FINAL(entry) */
				3213
				3214	if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
				3215	/* output BMP code point */
				3216	return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				3217	}
				3218
				3219	/*
				3220	* An if-else-if chain provides more reliable performance for
				3221	* the most common cases compared to a switch.
				3222	*/
				3223	action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
				3224	if(action==MBCS_STATE_VALID_DIRECT_20) {
				3225	/* output supplementary code point */
				3226	return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
				3227	} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
				3228	if(!TO_U_USE_FALLBACK(useFallback)) {
				3229	return 0xfffe;
				3230	}
				3231	/* output BMP code point */
				3232	return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				3233	} else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
				3234	if(!TO_U_USE_FALLBACK(useFallback)) {
				3235	return 0xfffe;
				3236	}
				3237	/* output supplementary code point */
				3238	return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
				3239	} else if(action==MBCS_STATE_UNASSIGNED) {
				3240	return 0xfffe;
				3241	} else if(action==MBCS_STATE_ILLEGAL) {
				3242	return 0xffff;
				3243	} else {
				3244	/* reserved, must never occur */
				3245	return 0xffff;
				3246	}
				3247	}
				3248	#endif
				3249
				3250	/*
				3251	* This is a simple version of _MBCSGetNextUChar() that is used
				3252	* by other converter implementations.
				3253	* It only returns an "assigned" result if it consumes the entire input.
				3254	* It does not use state from the converter, nor error codes.
				3255	* It does not handle the EBCDIC swaplfnl option (set in UConverter).
				3256	* It handles conversion extensions but not GB 18030.
				3257	*
				3258	* Return value:
				3259	* U+fffe unassigned
				3260	* U+ffff illegal
				3261	* otherwise the Unicode code point
				3262	*/
				3263	U_CFUNC UChar32
				3264	ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
				3265	const char *source, int32_t length,
				3266	UBool useFallback) {
				3267	const int32_t (*stateTable)[256];
				3268	const uint16_t *unicodeCodeUnits;
				3269
				3270	uint32_t offset;
				3271	uint8_t state, action;
				3272
				3273	UChar32 c;
				3274	int32_t i, entry;
				3275
				3276	if(length<=0) {
				3277	/* no input at all: "illegal" */
				3278	return 0xffff;
				3279	}
				3280
				3281	#if 0
				3282	/*
				3283	* Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
				3284	* TODO In future releases, verify that this function is never called for SBCS
				3285	* conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
				3286	* Removal improves code coverage.
				3287	*/
				3288	/* use optimized function if possible */
				3289	if(sharedData->mbcs.countStates==1) {
				3290	if(length==1) {
				3291	return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
				3292	} else {
				3293	return 0xffff; /* illegal: more than a single byte for an SBCS converter */
				3294	}
				3295	}
				3296	#endif
				3297
				3298	/* set up the local pointers */
				3299	stateTable=sharedData->mbcs.stateTable;
				3300	unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;
				3301
				3302	/* converter state */
				3303	offset=0;
				3304	state=sharedData->mbcs.dbcsOnlyState;
				3305
				3306	/* conversion loop */
				3307	for(i=0;;) {
				3308	entry=stateTable[state][(uint8_t)source[i++]];
				3309	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
				3310	state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
				3311	offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
				3312
				3313	if(i==length) {
				3314	return 0xffff; /* truncated character */
				3315	}
				3316	} else {
				3317	/*
				3318	* An if-else-if chain provides more reliable performance for
				3319	* the most common cases compared to a switch.
				3320	*/
				3321	action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
				3322	if(action==MBCS_STATE_VALID_16) {
				3323	offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
				3324	c=unicodeCodeUnits[offset];
				3325	if(c!=0xfffe) {
				3326	/* done */
				3327	} else if(UCNV_TO_U_USE_FALLBACK(cnv)) {
				3328	c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);
				3329	/* else done with 0xfffe */
				3330	}
				3331	break;
				3332	} else if(action==MBCS_STATE_VALID_DIRECT_16) {
				3333	/* output BMP code point */
				3334	c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				3335	break;
				3336	} else if(action==MBCS_STATE_VALID_16_PAIR) {
				3337	offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
				3338	c=unicodeCodeUnits[offset++];
				3339	if(c<0xd800) {
				3340	/* output BMP code point below 0xd800 */
				3341	} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
				3342	/* output roundtrip or fallback supplementary code point */
				3343	c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
				3344	} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
				3345	/* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
				3346	c=unicodeCodeUnits[offset];
				3347	} else if(c==0xffff) {
				3348	return 0xffff;
				3349	} else {
				3350	c=0xfffe;
				3351	}
				3352	break;
				3353	} else if(action==MBCS_STATE_VALID_DIRECT_20) {
				3354	/* output supplementary code point */
				3355	c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
				3356	break;
				3357	} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
				3358	if(!TO_U_USE_FALLBACK(useFallback)) {
				3359	c=0xfffe;
				3360	break;
				3361	}
				3362	/* output BMP code point */
				3363	c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
				3364	break;
				3365	} else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
				3366	if(!TO_U_USE_FALLBACK(useFallback)) {
				3367	c=0xfffe;
				3368	break;
				3369	}
				3370	/* output supplementary code point */
				3371	c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
				3372	break;
				3373	} else if(action==MBCS_STATE_UNASSIGNED) {
				3374	c=0xfffe;
				3375	break;
				3376	}
				3377
				3378	/*
				3379	* forbid MBCS_STATE_CHANGE_ONLY for this function,
				3380	* and MBCS_STATE_ILLEGAL and reserved action codes
				3381	*/
				3382	return 0xffff;
				3383	}
				3384	}
				3385
				3386	if(i!=length) {
				3387	/* illegal for this function: not all input consumed */
				3388	return 0xffff;
				3389	}
				3390
				3391	if(c==0xfffe) {
				3392	/* try an extension mapping */
				3393	const int32_t *cx=sharedData->mbcs.extIndexes;
				3394	if(cx!=NULL) {
				3395	return ucnv_extSimpleMatchToU(cx, source, length, useFallback);
				3396	}
				3397	}
				3398
				3399	return c;
				3400	}
				3401
				3402	/* MBCS-from-Unicode conversion functions ----------------------------------- */
				3403
				3404	/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
				3405	static void
				3406	ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
				3407	UErrorCode *pErrorCode) {
				3408	UConverter *cnv;
				3409	const UChar source, sourceLimit;
				3410	uint8_t *target;
				3411	int32_t targetCapacity;
				3412	int32_t *offsets;
				3413
				3414	const uint16_t *table;
				3415	const uint16_t *mbcsIndex;
				3416	const uint8_t *bytes;
				3417
				3418	UChar32 c;
				3419
				3420	int32_t sourceIndex, nextSourceIndex;
				3421
				3422	uint32_t stage2Entry;
				3423	uint32_t asciiRoundtrips;
				3424	uint32_t value;
				3425	uint8_t unicodeMask;
				3426
				3427	/* use optimized function if possible */
				3428	cnv=pArgs->converter;
				3429	unicodeMask=cnv->sharedData->mbcs.unicodeMask;
				3430
				3431	/* set up the local pointers */
				3432	source=pArgs->source;
				3433	sourceLimit=pArgs->sourceLimit;
				3434	target=(uint8_t *)pArgs->target;
				3435	targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
				3436	offsets=pArgs->offsets;
				3437
				3438	table=cnv->sharedData->mbcs.fromUnicodeTable;
				3439	mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
				3440	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
				3441	bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
				3442	} else {
				3443	bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
				3444	}
				3445	asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
				3446
				3447	/* get the converter state from UConverter */
				3448	c=cnv->fromUChar32;
				3449
				3450	/* sourceIndex=-1 if the current character began in the previous buffer */
				3451	sourceIndex= c==0 ? 0 : -1;
				3452	nextSourceIndex=0;
				3453
				3454	/* conversion loop */
				3455	if(c!=0 && targetCapacity>0) {
				3456	goto getTrail;
				3457	}
				3458
				3459	while(source<sourceLimit) {
				3460	/*
				3461	* This following test is to see if available input would overflow the output.
				3462	* It does not catch output of more than one byte that
				3463	* overflows as a result of a multi-byte character or callback output
				3464	* from the last source character.
				3465	* Therefore, those situations also test for overflows and will
				3466	* then break the loop, too.
				3467	*/
				3468	if(targetCapacity>0) {
				3469	/*
				3470	* Get a correct Unicode code point:
				3471	* a single UChar for a BMP code point or
				3472	* a matched surrogate pair for a "supplementary code point".
				3473	*/
				3474	c=*source++;
				3475	++nextSourceIndex;
				3476	if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
				3477	*target++=(uint8_t)c;
				3478	if(offsets!=NULL) {
				3479	*offsets++=sourceIndex;
				3480	sourceIndex=nextSourceIndex;
				3481	}
				3482	--targetCapacity;
				3483	c=0;
				3484	continue;
				3485	}
				3486	/*
				3487	* utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
				3488	* to avoid dealing with surrogates.
				3489	* MBCS_FAST_MAX must be >=0xd7ff.
				3490	*/
				3491	if(c<=0xd7ff) {
				3492	value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c);
				3493	/* There are only roundtrips (!=0) and no-mapping (==0) entries. */
				3494	if(value==0) {
				3495	goto unassigned;
				3496	}
				3497	/* output the value */
				3498	} else {
				3499	/*
				3500	* This also tests if the codepage maps single surrogates.
				3501	* If it does, then surrogates are not paired but mapped separately.
				3502	* Note that in this case unmatched surrogates are not detected.
				3503	*/
				3504	if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
				3505	if(U16_IS_SURROGATE_LEAD(c)) {
				3506	getTrail:
				3507	if(source<sourceLimit) {
				3508	/* test the following code unit */
				3509	UChar trail=*source;
				3510	if(U16_IS_TRAIL(trail)) {
				3511	++source;
				3512	++nextSourceIndex;
				3513	c=U16_GET_SUPPLEMENTARY(c, trail);
				3514	if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
				3515	/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
				3516	/* callback(unassigned) */
				3517	goto unassigned;
				3518	}
				3519	/* convert this supplementary code point */
				3520	/* exit this condition tree */
				3521	} else {
				3522	/* this is an unmatched lead code unit (1st surrogate) */
				3523	/* callback(illegal) */
				3524	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				3525	break;
				3526	}
				3527	} else {
				3528	/* no more input */
				3529	break;
				3530	}
				3531	} else {
				3532	/* this is an unmatched trail code unit (2nd surrogate) */
				3533	/* callback(illegal) */
				3534	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				3535	break;
				3536	}
				3537	}
				3538
				3539	/* convert the Unicode code point in c into codepage bytes */
				3540	stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
				3541
				3542	/* get the bytes and the length for the output */
				3543	/* MBCS_OUTPUT_2 */
				3544	value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
				3545
				3546	/* is this code point assigned, or do we use fallbacks? */
				3547	if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) \|\|
				3548	(UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
				3549	) {
				3550	/*
				3551	* We allow a 0 byte output if the "assigned" bit is set for this entry.
				3552	* There is no way with this data structure for fallback output
				3553	* to be a zero byte.
				3554	*/
				3555
				3556	unassigned:
				3557	/* try an extension mapping */
				3558	pArgs->source=source;
				3559	c=_extFromU(cnv, cnv->sharedData,
				3560	c, &source, sourceLimit,
				3561	&target, target+targetCapacity,
				3562	&offsets, sourceIndex,
				3563	pArgs->flush,
				3564	pErrorCode);
				3565	nextSourceIndex+=(int32_t)(source-pArgs->source);
				3566
				3567	if(U_FAILURE(*pErrorCode)) {
				3568	/* not mappable or buffer overflow */
				3569	break;
				3570	} else {
				3571	/* a mapping was written to the target, continue */
				3572
				3573	/* recalculate the targetCapacity after an extension mapping */
				3574	targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
				3575
				3576	/* normal end of conversion: prepare for a new character */
				3577	sourceIndex=nextSourceIndex;
				3578	continue;
				3579	}
				3580	}
				3581	}
				3582
				3583	/* write the output character bytes from value and length */
				3584	/* from the first if in the loop we know that targetCapacity>0 */
				3585	if(value<=0xff) {
				3586	/* this is easy because we know that there is enough space */
				3587	*target++=(uint8_t)value;
				3588	if(offsets!=NULL) {
				3589	*offsets++=sourceIndex;
				3590	}
				3591	--targetCapacity;
				3592	} else /* length==2 */ {
				3593	*target++=(uint8_t)(value>>8);
				3594	if(2<=targetCapacity) {
				3595	*target++=(uint8_t)value;
				3596	if(offsets!=NULL) {
				3597	*offsets++=sourceIndex;
				3598	*offsets++=sourceIndex;
				3599	}
				3600	targetCapacity-=2;
				3601	} else {
				3602	if(offsets!=NULL) {
				3603	*offsets++=sourceIndex;
				3604	}
				3605	cnv->charErrorBuffer[0]=(char)value;
				3606	cnv->charErrorBufferLength=1;
				3607
				3608	/* target overflow */
				3609	targetCapacity=0;
				3610	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				3611	c=0;
				3612	break;
				3613	}
				3614	}
				3615
				3616	/* normal end of conversion: prepare for a new character */
				3617	c=0;
				3618	sourceIndex=nextSourceIndex;
				3619	continue;
				3620	} else {
				3621	/* target is full */
				3622	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				3623	break;
				3624	}
				3625	}
				3626
				3627	/* set the converter state back into UConverter */
				3628	cnv->fromUChar32=c;
				3629
				3630	/* write back the updated pointers */
				3631	pArgs->source=source;
				3632	pArgs->target=(char *)target;
				3633	pArgs->offsets=offsets;
				3634	}
				3635
				3636	/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
				3637	static void
				3638	ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
				3639	UErrorCode *pErrorCode) {
				3640	UConverter *cnv;
				3641	const UChar source, sourceLimit;
				3642	uint8_t *target;
				3643	int32_t targetCapacity;
				3644	int32_t *offsets;
				3645
				3646	const uint16_t *table;
				3647	const uint16_t *results;
				3648
				3649	UChar32 c;
				3650
				3651	int32_t sourceIndex, nextSourceIndex;
				3652
				3653	uint16_t value, minValue;
				3654	UBool hasSupplementary;
				3655
				3656	/* set up the local pointers */
				3657	cnv=pArgs->converter;
				3658	source=pArgs->source;
				3659	sourceLimit=pArgs->sourceLimit;
				3660	target=(uint8_t *)pArgs->target;
				3661	targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
				3662	offsets=pArgs->offsets;
				3663
				3664	table=cnv->sharedData->mbcs.fromUnicodeTable;
				3665	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
				3666	results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
				3667	} else {
				3668	results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
				3669	}
				3670
				3671	if(cnv->useFallback) {
				3672	/* use all roundtrip and fallback results */
				3673	minValue=0x800;
				3674	} else {
				3675	/* use only roundtrips and fallbacks from private-use characters */
				3676	minValue=0xc00;
				3677	}
				3678	hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
				3679
				3680	/* get the converter state from UConverter */
				3681	c=cnv->fromUChar32;
				3682
				3683	/* sourceIndex=-1 if the current character began in the previous buffer */
				3684	sourceIndex= c==0 ? 0 : -1;
				3685	nextSourceIndex=0;
				3686
				3687	/* conversion loop */
				3688	if(c!=0 && targetCapacity>0) {
				3689	goto getTrail;
				3690	}
				3691
				3692	while(source<sourceLimit) {
				3693	/*
				3694	* This following test is to see if available input would overflow the output.
				3695	* It does not catch output of more than one byte that
				3696	* overflows as a result of a multi-byte character or callback output
				3697	* from the last source character.
				3698	* Therefore, those situations also test for overflows and will
				3699	* then break the loop, too.
				3700	*/
				3701	if(targetCapacity>0) {
				3702	/*
				3703	* Get a correct Unicode code point:
				3704	* a single UChar for a BMP code point or
				3705	* a matched surrogate pair for a "supplementary code point".
				3706	*/
				3707	c=*source++;
				3708	++nextSourceIndex;
				3709	if(U16_IS_SURROGATE(c)) {
				3710	if(U16_IS_SURROGATE_LEAD(c)) {
				3711	getTrail:
				3712	if(source<sourceLimit) {
				3713	/* test the following code unit */
				3714	UChar trail=*source;
				3715	if(U16_IS_TRAIL(trail)) {
				3716	++source;
				3717	++nextSourceIndex;
				3718	c=U16_GET_SUPPLEMENTARY(c, trail);
				3719	if(!hasSupplementary) {
				3720	/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
				3721	/* callback(unassigned) */
				3722	goto unassigned;
				3723	}
				3724	/* convert this supplementary code point */
				3725	/* exit this condition tree */
				3726	} else {
				3727	/* this is an unmatched lead code unit (1st surrogate) */
				3728	/* callback(illegal) */
				3729	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				3730	break;
				3731	}
				3732	} else {
				3733	/* no more input */
				3734	break;
				3735	}
				3736	} else {
				3737	/* this is an unmatched trail code unit (2nd surrogate) */
				3738	/* callback(illegal) */
				3739	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				3740	break;
				3741	}
				3742	}
				3743
				3744	/* convert the Unicode code point in c into codepage bytes */
				3745	value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
				3746
				3747	/* is this code point assigned, or do we use fallbacks? */
				3748	if(value>=minValue) {
				3749	/* assigned, write the output character bytes from value and length */
				3750	/* length==1 */
				3751	/* this is easy because we know that there is enough space */
				3752	*target++=(uint8_t)value;
				3753	if(offsets!=NULL) {
				3754	*offsets++=sourceIndex;
				3755	}
				3756	--targetCapacity;
				3757
				3758	/* normal end of conversion: prepare for a new character */
				3759	c=0;
				3760	sourceIndex=nextSourceIndex;
				3761	} else { /* unassigned */
				3762	unassigned:
				3763	/* try an extension mapping */
				3764	pArgs->source=source;
				3765	c=_extFromU(cnv, cnv->sharedData,
				3766	c, &source, sourceLimit,
				3767	&target, target+targetCapacity,
				3768	&offsets, sourceIndex,
				3769	pArgs->flush,
				3770	pErrorCode);
				3771	nextSourceIndex+=(int32_t)(source-pArgs->source);
				3772
				3773	if(U_FAILURE(*pErrorCode)) {
				3774	/* not mappable or buffer overflow */
				3775	break;
				3776	} else {
				3777	/* a mapping was written to the target, continue */
				3778
				3779	/* recalculate the targetCapacity after an extension mapping */
				3780	targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
				3781
				3782	/* normal end of conversion: prepare for a new character */
				3783	sourceIndex=nextSourceIndex;
				3784	}
				3785	}
				3786	} else {
				3787	/* target is full */
				3788	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				3789	break;
				3790	}
				3791	}
				3792
				3793	/* set the converter state back into UConverter */
				3794	cnv->fromUChar32=c;
				3795
				3796	/* write back the updated pointers */
				3797	pArgs->source=source;
				3798	pArgs->target=(char *)target;
				3799	pArgs->offsets=offsets;
				3800	}
				3801
				3802	/*
				3803	* This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
				3804	* that map only to and from the BMP.
				3805	* In addition to single-byte/state optimizations, the offset calculations
				3806	* become much easier.
				3807	* It would be possible to use the sbcsIndex for UTF-8-friendly tables,
				3808	* but measurements have shown that this diminishes performance
				3809	* in more cases than it improves it.
				3810	* See SVN revision 21013 (2007-feb-06) for the last version with #if switches
				3811	* for various MBCS and SBCS optimizations.
				3812	*/
				3813	static void
				3814	ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
				3815	UErrorCode *pErrorCode) {
				3816	UConverter *cnv;
				3817	const UChar source, sourceLimit, *lastSource;
				3818	uint8_t *target;
				3819	int32_t targetCapacity, length;
				3820	int32_t *offsets;
				3821
				3822	const uint16_t *table;
				3823	const uint16_t *results;
				3824
				3825	UChar32 c;
				3826
				3827	int32_t sourceIndex;
				3828
				3829	uint32_t asciiRoundtrips;
				3830	uint16_t value, minValue;
				3831
				3832	/* set up the local pointers */
				3833	cnv=pArgs->converter;
				3834	source=pArgs->source;
				3835	sourceLimit=pArgs->sourceLimit;
				3836	target=(uint8_t *)pArgs->target;
				3837	targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
				3838	offsets=pArgs->offsets;
				3839
				3840	table=cnv->sharedData->mbcs.fromUnicodeTable;
				3841	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
				3842	results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
				3843	} else {
				3844	results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
				3845	}
				3846	asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
				3847
				3848	if(cnv->useFallback) {
				3849	/* use all roundtrip and fallback results */
				3850	minValue=0x800;
				3851	} else {
				3852	/* use only roundtrips and fallbacks from private-use characters */
				3853	minValue=0xc00;
				3854	}
				3855
				3856	/* get the converter state from UConverter */
				3857	c=cnv->fromUChar32;
				3858
				3859	/* sourceIndex=-1 if the current character began in the previous buffer */
				3860	sourceIndex= c==0 ? 0 : -1;
				3861	lastSource=source;
				3862
				3863	/*
				3864	* since the conversion here is 1:1 UChar:uint8_t, we need only one counter
				3865	* for the minimum of the sourceLength and targetCapacity
				3866	*/
				3867	length=(int32_t)(sourceLimit-source);
				3868	if(length<targetCapacity) {
				3869	targetCapacity=length;
				3870	}
				3871
				3872	/* conversion loop */
				3873	if(c!=0 && targetCapacity>0) {
				3874	goto getTrail;
				3875	}
				3876
				3877	#if MBCS_UNROLL_SINGLE_FROM_BMP
				3878	/* unrolling makes it slower on Pentium III/Windows 2000?! */
				3879	/* unroll the loop with the most common case */
				3880	unrolled:
				3881	if(targetCapacity>=4) {
				3882	int32_t count, loops;
				3883	uint16_t andedValues;
				3884
				3885	loops=count=targetCapacity>>2;
				3886	do {
				3887	c=*source++;
				3888	andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
				3889	*target++=(uint8_t)value;
				3890	c=*source++;
				3891	andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
				3892	*target++=(uint8_t)value;
				3893	c=*source++;
				3894	andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
				3895	*target++=(uint8_t)value;
				3896	c=*source++;
				3897	andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
				3898	*target++=(uint8_t)value;
				3899
				3900	/* were all 4 entries really valid? */
				3901	if(andedValues<minValue) {
				3902	/* no, return to the first of these 4 */
				3903	source-=4;
				3904	target-=4;
				3905	break;
				3906	}
				3907	} while(--count>0);
				3908	count=loops-count;
				3909	targetCapacity-=4*count;
				3910
				3911	if(offsets!=NULL) {
				3912	lastSource+=4*count;
				3913	while(count>0) {
				3914	*offsets++=sourceIndex++;
				3915	*offsets++=sourceIndex++;
				3916	*offsets++=sourceIndex++;
				3917	*offsets++=sourceIndex++;
				3918	--count;
				3919	}
				3920	}
				3921
				3922	c=0;
				3923	}
				3924	#endif
				3925
				3926	while(targetCapacity>0) {
				3927	/*
				3928	* Get a correct Unicode code point:
				3929	* a single UChar for a BMP code point or
				3930	* a matched surrogate pair for a "supplementary code point".
				3931	*/
				3932	c=*source++;
				3933	/*
				3934	* Do not immediately check for single surrogates:
				3935	* Assume that they are unassigned and check for them in that case.
				3936	* This speeds up the conversion of assigned characters.
				3937	*/
				3938	/* convert the Unicode code point in c into codepage bytes */
				3939	if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
				3940	*target++=(uint8_t)c;
				3941	--targetCapacity;
				3942	c=0;
				3943	continue;
				3944	}
				3945	value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
				3946	/* is this code point assigned, or do we use fallbacks? */
				3947	if(value>=minValue) {
				3948	/* assigned, write the output character bytes from value and length */
				3949	/* length==1 */
				3950	/* this is easy because we know that there is enough space */
				3951	*target++=(uint8_t)value;
				3952	--targetCapacity;
				3953
				3954	/* normal end of conversion: prepare for a new character */
				3955	c=0;
				3956	continue;
				3957	} else if(!U16_IS_SURROGATE(c)) {
				3958	/* normal, unassigned BMP character */
				3959	} else if(U16_IS_SURROGATE_LEAD(c)) {
				3960	getTrail:
				3961	if(source<sourceLimit) {
				3962	/* test the following code unit */
				3963	UChar trail=*source;
				3964	if(U16_IS_TRAIL(trail)) {
				3965	++source;
				3966	c=U16_GET_SUPPLEMENTARY(c, trail);
				3967	/* this codepage does not map supplementary code points */
				3968	/* callback(unassigned) */
				3969	} else {
				3970	/* this is an unmatched lead code unit (1st surrogate) */
				3971	/* callback(illegal) */
				3972	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				3973	break;
				3974	}
				3975	} else {
				3976	/* no more input */
				3977	if (pArgs->flush) {
				3978	*pErrorCode=U_TRUNCATED_CHAR_FOUND;
				3979	}
				3980	break;
				3981	}
				3982	} else {
				3983	/* this is an unmatched trail code unit (2nd surrogate) */
				3984	/* callback(illegal) */
				3985	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				3986	break;
				3987	}
				3988
				3989	/* c does not have a mapping */
				3990
				3991	/* get the number of code units for c to correctly advance sourceIndex */
				3992	length=U16_LENGTH(c);
				3993
				3994	/* set offsets since the start or the last extension */
				3995	if(offsets!=NULL) {
				3996	int32_t count=(int32_t)(source-lastSource);
				3997
				3998	/* do not set the offset for this character */
				3999	count-=length;
				4000
				4001	while(count>0) {
				4002	*offsets++=sourceIndex++;
				4003	--count;
				4004	}
				4005	/* offsets and sourceIndex are now set for the current character */
				4006	}
				4007
				4008	/* try an extension mapping */
				4009	lastSource=source;
				4010	c=_extFromU(cnv, cnv->sharedData,
				4011	c, &source, sourceLimit,
				4012	&target, (const uint8_t *)(pArgs->targetLimit),
				4013	&offsets, sourceIndex,
				4014	pArgs->flush,
				4015	pErrorCode);
				4016	sourceIndex+=length+(int32_t)(source-lastSource);
				4017	lastSource=source;
				4018
				4019	if(U_FAILURE(*pErrorCode)) {
				4020	/* not mappable or buffer overflow */
				4021	break;
				4022	} else {
				4023	/* a mapping was written to the target, continue */
				4024
				4025	/* recalculate the targetCapacity after an extension mapping */
				4026	targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
				4027	length=(int32_t)(sourceLimit-source);
				4028	if(length<targetCapacity) {
				4029	targetCapacity=length;
				4030	}
				4031	}
				4032
				4033	#if MBCS_UNROLL_SINGLE_FROM_BMP
				4034	/* unrolling makes it slower on Pentium III/Windows 2000?! */
				4035	goto unrolled;
				4036	#endif
				4037	}
				4038
				4039	if(U_SUCCESS(pErrorCode) && source<sourceLimit && target>=(uint8_t )pArgs->targetLimit) {
				4040	/* target is full */
				4041	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				4042	}
				4043
				4044	/* set offsets since the start or the last callback */
				4045	if(offsets!=NULL) {
				4046	size_t count=source-lastSource;
				4047	if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
				4048	/*
				4049	Caller gave us a partial supplementary character,
				4050	which this function couldn't convert in any case.
				4051	The callback will handle the offset.
				4052	*/
				4053	count--;
				4054	}
				4055	while(count>0) {
				4056	*offsets++=sourceIndex++;
				4057	--count;
				4058	}
				4059	}
				4060
				4061	/* set the converter state back into UConverter */
				4062	cnv->fromUChar32=c;
				4063
				4064	/* write back the updated pointers */
				4065	pArgs->source=source;
				4066	pArgs->target=(char *)target;
				4067	pArgs->offsets=offsets;
				4068	}
				4069
				4070	U_CFUNC void
				4071	ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
				4072	UErrorCode *pErrorCode) {
				4073	UConverter *cnv;
				4074	const UChar source, sourceLimit;
				4075	uint8_t *target;
				4076	int32_t targetCapacity;
				4077	int32_t *offsets;
				4078
				4079	const uint16_t *table;
				4080	const uint16_t *mbcsIndex;
				4081	const uint8_t p, bytes;
				4082	uint8_t outputType;
				4083
				4084	UChar32 c;
				4085
				4086	int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
				4087
				4088	uint32_t stage2Entry;
				4089	uint32_t asciiRoundtrips;
				4090	uint32_t value;
				4091	/* Shift-In and Shift-Out byte sequences differ by encoding scheme. */
				4092	uint8_t siBytes[2] = {0, 0};
				4093	uint8_t soBytes[2] = {0, 0};
				4094	uint8_t siLength, soLength;
				4095	int32_t length = 0, prevLength;
				4096	uint8_t unicodeMask;
				4097
				4098	cnv=pArgs->converter;
				4099
				4100	if(cnv->preFromUFirstCP>=0) {
				4101	/*
				4102	* pass sourceIndex=-1 because we continue from an earlier buffer
				4103	* in the future, this may change with continuous offsets
				4104	*/
				4105	ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
				4106
				4107	if(U_FAILURE(*pErrorCode) \|\| cnv->preFromULength<0) {
				4108	return;
				4109	}
				4110	}
				4111
				4112	/* use optimized function if possible */
				4113	outputType=cnv->sharedData->mbcs.outputType;
				4114	unicodeMask=cnv->sharedData->mbcs.unicodeMask;
				4115	if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
				4116	if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
				4117	ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
				4118	} else {
				4119	ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
				4120	}
				4121	return;
				4122	} else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
				4123	ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
				4124	return;
				4125	}
				4126
				4127	/* set up the local pointers */
				4128	source=pArgs->source;
				4129	sourceLimit=pArgs->sourceLimit;
				4130	target=(uint8_t *)pArgs->target;
				4131	targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
				4132	offsets=pArgs->offsets;
				4133
				4134	table=cnv->sharedData->mbcs.fromUnicodeTable;
				4135	if(cnv->sharedData->mbcs.utf8Friendly) {
				4136	mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
				4137	} else {
				4138	mbcsIndex=NULL;
				4139	}
				4140	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
				4141	bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
				4142	} else {
				4143	bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
				4144	}
				4145	asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
				4146
				4147	/* get the converter state from UConverter */
				4148	c=cnv->fromUChar32;
				4149
				4150	if(outputType==MBCS_OUTPUT_2_SISO) {
				4151	prevLength=cnv->fromUnicodeStatus;
				4152	if(prevLength==0) {
				4153	/* set the real value */
				4154	prevLength=1;
				4155	}
				4156	} else {
				4157	/* prevent fromUnicodeStatus from being set to something non-0 */
				4158	prevLength=0;
				4159	}
				4160
				4161	/* sourceIndex=-1 if the current character began in the previous buffer */
				4162	prevSourceIndex=-1;
				4163	sourceIndex= c==0 ? 0 : -1;
				4164	nextSourceIndex=0;
				4165
				4166	/* Get the SI/SO character for the converter */
				4167	siLength = getSISOBytes(SI, cnv->options, siBytes);
				4168	soLength = getSISOBytes(SO, cnv->options, soBytes);
				4169
				4170	/* conversion loop */
				4171	/*
				4172	* This is another piece of ugly code:
				4173	* A goto into the loop if the converter state contains a first surrogate
				4174	* from the previous function call.
				4175	* It saves me to check in each loop iteration a check of if(c==0)
				4176	* and duplicating the trail-surrogate-handling code in the else
				4177	* branch of that check.
				4178	* I could not find any other way to get around this other than
				4179	* using a function call for the conversion and callback, which would
				4180	* be even more inefficient.
				4181	*
				4182	* Markus Scherer 2000-jul-19
				4183	*/
				4184	if(c!=0 && targetCapacity>0) {
				4185	goto getTrail;
				4186	}
				4187
				4188	while(source<sourceLimit) {
				4189	/*
				4190	* This following test is to see if available input would overflow the output.
				4191	* It does not catch output of more than one byte that
				4192	* overflows as a result of a multi-byte character or callback output
				4193	* from the last source character.
				4194	* Therefore, those situations also test for overflows and will
				4195	* then break the loop, too.
				4196	*/
				4197	if(targetCapacity>0) {
				4198	/*
				4199	* Get a correct Unicode code point:
				4200	* a single UChar for a BMP code point or
				4201	* a matched surrogate pair for a "supplementary code point".
				4202	*/
				4203	c=*source++;
				4204	++nextSourceIndex;
				4205	if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
				4206	*target++=(uint8_t)c;
				4207	if(offsets!=NULL) {
				4208	*offsets++=sourceIndex;
				4209	prevSourceIndex=sourceIndex;
				4210	sourceIndex=nextSourceIndex;
				4211	}
				4212	--targetCapacity;
				4213	c=0;
				4214	continue;
				4215	}
				4216	/*
				4217	* utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
				4218	* to avoid dealing with surrogates.
				4219	* MBCS_FAST_MAX must be >=0xd7ff.
				4220	*/
				4221	if(c<=0xd7ff && mbcsIndex!=NULL) {
				4222	value=mbcsIndex[c>>6];
				4223
				4224	/* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
				4225	/* There are only roundtrips (!=0) and no-mapping (==0) entries. */
				4226	switch(outputType) {
				4227	case MBCS_OUTPUT_2:
				4228	value=((const uint16_t *)bytes)[value +(c&0x3f)];
				4229	if(value<=0xff) {
				4230	if(value==0) {
				4231	goto unassigned;
				4232	} else {
				4233	length=1;
				4234	}
				4235	} else {
				4236	length=2;
				4237	}
				4238	break;
				4239	case MBCS_OUTPUT_2_SISO:
				4240	/* 1/2-byte stateful with Shift-In/Shift-Out */
				4241	/*
				4242	* Save the old state in the converter object
				4243	* right here, then change the local prevLength state variable if necessary.
				4244	* Then, if this character turns out to be unassigned or a fallback that
				4245	* is not taken, the callback code must not save the new state in the converter
				4246	* because the new state is for a character that is not output.
				4247	* However, the callback must still restore the state from the converter
				4248	* in case the callback function changed it for its output.
				4249	*/
				4250	cnv->fromUnicodeStatus=prevLength; /* save the old state */
				4251	value=((const uint16_t *)bytes)[value +(c&0x3f)];
				4252	if(value<=0xff) {
				4253	if(value==0) {
				4254	goto unassigned;
				4255	} else if(prevLength<=1) {
				4256	length=1;
				4257	} else {
				4258	/* change from double-byte mode to single-byte */
				4259	if (siLength == 1) {
				4260	value\|=(uint32_t)siBytes[0]<<8;
				4261	length = 2;
				4262	} else if (siLength == 2) {
				4263	value\|=(uint32_t)siBytes[1]<<8;
				4264	value\|=(uint32_t)siBytes[0]<<16;
				4265	length = 3;
				4266	}
				4267	prevLength=1;
				4268	}
				4269	} else {
				4270	if(prevLength==2) {
				4271	length=2;
				4272	} else {
				4273	/* change from single-byte mode to double-byte */
				4274	if (soLength == 1) {
				4275	value\|=(uint32_t)soBytes[0]<<16;
				4276	length = 3;
				4277	} else if (soLength == 2) {
				4278	value\|=(uint32_t)soBytes[1]<<16;
				4279	value\|=(uint32_t)soBytes[0]<<24;
				4280	length = 4;
				4281	}
				4282	prevLength=2;
				4283	}
				4284	}
				4285	break;
				4286	case MBCS_OUTPUT_DBCS_ONLY:
				4287	/* table with single-byte results, but only DBCS mappings used */
				4288	value=((const uint16_t *)bytes)[value +(c&0x3f)];
				4289	if(value<=0xff) {
				4290	/* no mapping or SBCS result, not taken for DBCS-only */
				4291	goto unassigned;
				4292	} else {
				4293	length=2;
				4294	}
				4295	break;
				4296	case MBCS_OUTPUT_3:
				4297	p=bytes+(value+(c&0x3f))*3;
				4298	value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];
				4299	if(value<=0xff) {
				4300	if(value==0) {
				4301	goto unassigned;
				4302	} else {
				4303	length=1;
				4304	}
				4305	} else if(value<=0xffff) {
				4306	length=2;
				4307	} else {
				4308	length=3;
				4309	}
				4310	break;
				4311	case MBCS_OUTPUT_4:
				4312	value=((const uint32_t *)bytes)[value +(c&0x3f)];
				4313	if(value<=0xff) {
				4314	if(value==0) {
				4315	goto unassigned;
				4316	} else {
				4317	length=1;
				4318	}
				4319	} else if(value<=0xffff) {
				4320	length=2;
				4321	} else if(value<=0xffffff) {
				4322	length=3;
				4323	} else {
				4324	length=4;
				4325	}
				4326	break;
				4327	case MBCS_OUTPUT_3_EUC:
				4328	value=((const uint16_t *)bytes)[value +(c&0x3f)];
				4329	/* EUC 16-bit fixed-length representation */
				4330	if(value<=0xff) {
				4331	if(value==0) {
				4332	goto unassigned;
				4333	} else {
				4334	length=1;
				4335	}
				4336	} else if((value&0x8000)==0) {
				4337	value\|=0x8e8000;
				4338	length=3;
				4339	} else if((value&0x80)==0) {
				4340	value\|=0x8f0080;
				4341	length=3;
				4342	} else {
				4343	length=2;
				4344	}
				4345	break;
				4346	case MBCS_OUTPUT_4_EUC:
				4347	p=bytes+(value+(c&0x3f))*3;
				4348	value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];
				4349	/* EUC 16-bit fixed-length representation applied to the first two bytes */
				4350	if(value<=0xff) {
				4351	if(value==0) {
				4352	goto unassigned;
				4353	} else {
				4354	length=1;
				4355	}
				4356	} else if(value<=0xffff) {
				4357	length=2;
				4358	} else if((value&0x800000)==0) {
				4359	value\|=0x8e800000;
				4360	length=4;
				4361	} else if((value&0x8000)==0) {
				4362	value\|=0x8f008000;
				4363	length=4;
				4364	} else {
				4365	length=3;
				4366	}
				4367	break;
				4368	default:
				4369	/* must not occur */
				4370	/*
				4371	* To avoid compiler warnings that value & length may be
				4372	* used without having been initialized, we set them here.
				4373	* In reality, this is unreachable code.
				4374	* Not having a default branch also causes warnings with
				4375	* some compilers.
				4376	*/
				4377	value=0;
				4378	length=0;
				4379	break;
				4380	}
				4381	/* output the value */
				4382	} else {
				4383	/*
				4384	* This also tests if the codepage maps single surrogates.
				4385	* If it does, then surrogates are not paired but mapped separately.
				4386	* Note that in this case unmatched surrogates are not detected.
				4387	*/
				4388	if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
				4389	if(U16_IS_SURROGATE_LEAD(c)) {
				4390	getTrail:
				4391	if(source<sourceLimit) {
				4392	/* test the following code unit */
				4393	UChar trail=*source;
				4394	if(U16_IS_TRAIL(trail)) {
				4395	++source;
				4396	++nextSourceIndex;
				4397	c=U16_GET_SUPPLEMENTARY(c, trail);
				4398	if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
				4399	/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
				4400	cnv->fromUnicodeStatus=prevLength; /* save the old state */
				4401	/* callback(unassigned) */
				4402	goto unassigned;
				4403	}
				4404	/* convert this supplementary code point */
				4405	/* exit this condition tree */
				4406	} else {
				4407	/* this is an unmatched lead code unit (1st surrogate) */
				4408	/* callback(illegal) */
				4409	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				4410	break;
				4411	}
				4412	} else {
				4413	/* no more input */
				4414	break;
				4415	}
				4416	} else {
				4417	/* this is an unmatched trail code unit (2nd surrogate) */
				4418	/* callback(illegal) */
				4419	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				4420	break;
				4421	}
				4422	}
				4423
				4424	/* convert the Unicode code point in c into codepage bytes */
				4425
				4426	/*
				4427	* The basic lookup is a triple-stage compact array (trie) lookup.
				4428	* For details see the beginning of this file.
				4429	*
				4430	* Single-byte codepages are handled with a different data structure
				4431	* by _MBCSSingle... functions.
				4432	*
				4433	* The result consists of a 32-bit value from stage 2 and
				4434	* a pointer to as many bytes as are stored per character.
				4435	* The pointer points to the character's bytes in stage 3.
				4436	* Bits 15..0 of the stage 2 entry contain the stage 3 index
				4437	* for that pointer, while bits 31..16 are flags for which of
				4438	* the 16 characters in the block are roundtrip-assigned.
				4439	*
				4440	* For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
				4441	* respectively as uint32_t, in the platform encoding.
				4442	* For 3-byte codepages, the bytes are always stored in big-endian order.
				4443	*
				4444	* For EUC encodings that use only either 0x8e or 0x8f as the first
				4445	* byte of their longest byte sequences, the first two bytes in
				4446	* this third stage indicate with their 7th bits whether these bytes
				4447	* are to be written directly or actually need to be preceded by
				4448	* one of the two Single-Shift codes. With this, the third stage
				4449	* stores one byte fewer per character than the actual maximum length of
				4450	* EUC byte sequences.
				4451	*
				4452	* Other than that, leading zero bytes are removed and the other
				4453	* bytes output. A single zero byte may be output if the "assigned"
				4454	* bit in stage 2 was on.
				4455	* The data structure does not support zero byte output as a fallback,
				4456	* and also does not allow output of leading zeros.
				4457	*/
				4458	stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
				4459
				4460	/* get the bytes and the length for the output */
				4461	switch(outputType) {
				4462	case MBCS_OUTPUT_2:
				4463	value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
				4464	if(value<=0xff) {
				4465	length=1;
				4466	} else {
				4467	length=2;
				4468	}
				4469	break;
				4470	case MBCS_OUTPUT_2_SISO:
				4471	/* 1/2-byte stateful with Shift-In/Shift-Out */
				4472	/*
				4473	* Save the old state in the converter object
				4474	* right here, then change the local prevLength state variable if necessary.
				4475	* Then, if this character turns out to be unassigned or a fallback that
				4476	* is not taken, the callback code must not save the new state in the converter
				4477	* because the new state is for a character that is not output.
				4478	* However, the callback must still restore the state from the converter
				4479	* in case the callback function changed it for its output.
				4480	*/
				4481	cnv->fromUnicodeStatus=prevLength; /* save the old state */
				4482	value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
				4483	if(value<=0xff) {
				4484	if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
				4485	/* no mapping, leave value==0 */
				4486	length=0;
				4487	} else if(prevLength<=1) {
				4488	length=1;
				4489	} else {
				4490	/* change from double-byte mode to single-byte */
				4491	if (siLength == 1) {
				4492	value\|=(uint32_t)siBytes[0]<<8;
				4493	length = 2;
				4494	} else if (siLength == 2) {
				4495	value\|=(uint32_t)siBytes[1]<<8;
				4496	value\|=(uint32_t)siBytes[0]<<16;
				4497	length = 3;
				4498	}
				4499	prevLength=1;
				4500	}
				4501	} else {
				4502	if(prevLength==2) {
				4503	length=2;
				4504	} else {
				4505	/* change from single-byte mode to double-byte */
				4506	if (soLength == 1) {
				4507	value\|=(uint32_t)soBytes[0]<<16;
				4508	length = 3;
				4509	} else if (soLength == 2) {
				4510	value\|=(uint32_t)soBytes[1]<<16;
				4511	value\|=(uint32_t)soBytes[0]<<24;
				4512	length = 4;
				4513	}
				4514	prevLength=2;
				4515	}
				4516	}
				4517	break;
				4518	case MBCS_OUTPUT_DBCS_ONLY:
				4519	/* table with single-byte results, but only DBCS mappings used */
				4520	value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
				4521	if(value<=0xff) {
				4522	/* no mapping or SBCS result, not taken for DBCS-only */
				4523	value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
				4524	length=0;
				4525	} else {
				4526	length=2;
				4527	}
				4528	break;
				4529	case MBCS_OUTPUT_3:
				4530	p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
				4531	value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];
				4532	if(value<=0xff) {
				4533	length=1;
				4534	} else if(value<=0xffff) {
				4535	length=2;
				4536	} else {
				4537	length=3;
				4538	}
				4539	break;
				4540	case MBCS_OUTPUT_4:
				4541	value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
				4542	if(value<=0xff) {
				4543	length=1;
				4544	} else if(value<=0xffff) {
				4545	length=2;
				4546	} else if(value<=0xffffff) {
				4547	length=3;
				4548	} else {
				4549	length=4;
				4550	}
				4551	break;
				4552	case MBCS_OUTPUT_3_EUC:
				4553	value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
				4554	/* EUC 16-bit fixed-length representation */
				4555	if(value<=0xff) {
				4556	length=1;
				4557	} else if((value&0x8000)==0) {
				4558	value\|=0x8e8000;
				4559	length=3;
				4560	} else if((value&0x80)==0) {
				4561	value\|=0x8f0080;
				4562	length=3;
				4563	} else {
				4564	length=2;
				4565	}
				4566	break;
				4567	case MBCS_OUTPUT_4_EUC:
				4568	p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
				4569	value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];
				4570	/* EUC 16-bit fixed-length representation applied to the first two bytes */
				4571	if(value<=0xff) {
				4572	length=1;
				4573	} else if(value<=0xffff) {
				4574	length=2;
				4575	} else if((value&0x800000)==0) {
				4576	value\|=0x8e800000;
				4577	length=4;
				4578	} else if((value&0x8000)==0) {
				4579	value\|=0x8f008000;
				4580	length=4;
				4581	} else {
				4582	length=3;
				4583	}
				4584	break;
				4585	default:
				4586	/* must not occur */
				4587	/*
				4588	* To avoid compiler warnings that value & length may be
				4589	* used without having been initialized, we set them here.
				4590	* In reality, this is unreachable code.
				4591	* Not having a default branch also causes warnings with
				4592	* some compilers.
				4593	*/
				4594	value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
				4595	length=0;
				4596	break;
				4597	}
				4598
				4599	/* is this code point assigned, or do we use fallbacks? */
				4600	if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 \|\|
				4601	(UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
				4602	) {
				4603	/*
				4604	* We allow a 0 byte output if the "assigned" bit is set for this entry.
				4605	* There is no way with this data structure for fallback output
				4606	* to be a zero byte.
				4607	*/
				4608
				4609	unassigned:
				4610	/* try an extension mapping */
				4611	pArgs->source=source;
				4612	c=_extFromU(cnv, cnv->sharedData,
				4613	c, &source, sourceLimit,
				4614	&target, target+targetCapacity,
				4615	&offsets, sourceIndex,
				4616	pArgs->flush,
				4617	pErrorCode);
				4618	nextSourceIndex+=(int32_t)(source-pArgs->source);
				4619	prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
				4620
				4621	if(U_FAILURE(*pErrorCode)) {
				4622	/* not mappable or buffer overflow */
				4623	break;
				4624	} else {
				4625	/* a mapping was written to the target, continue */
				4626
				4627	/* recalculate the targetCapacity after an extension mapping */
				4628	targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
				4629
				4630	/* normal end of conversion: prepare for a new character */
				4631	if(offsets!=NULL) {
				4632	prevSourceIndex=sourceIndex;
				4633	sourceIndex=nextSourceIndex;
				4634	}
				4635	continue;
				4636	}
				4637	}
				4638	}
				4639
				4640	/* write the output character bytes from value and length */
				4641	/* from the first if in the loop we know that targetCapacity>0 */
				4642	if(length<=targetCapacity) {
				4643	if(offsets==NULL) {
				4644	switch(length) {
				4645	/* each branch falls through to the next one */
				4646	case 4:
				4647	*target++=(uint8_t)(value>>24);
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	4648	U_FALLTHROUGH;
				4649	case 3:
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	4650	*target++=(uint8_t)(value>>16);
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	4651	U_FALLTHROUGH;
				4652	case 2:
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	4653	*target++=(uint8_t)(value>>8);
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	4654	U_FALLTHROUGH;
				4655	case 1:
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	4656	*target++=(uint8_t)value;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	4657	U_FALLTHROUGH;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	4658	default:
				4659	/* will never occur */
				4660	break;
				4661	}
				4662	} else {
				4663	switch(length) {
				4664	/* each branch falls through to the next one */
				4665	case 4:
				4666	*target++=(uint8_t)(value>>24);
				4667	*offsets++=sourceIndex;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	4668	U_FALLTHROUGH;
				4669	case 3:
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	4670	*target++=(uint8_t)(value>>16);
				4671	*offsets++=sourceIndex;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	4672	U_FALLTHROUGH;
				4673	case 2:
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	4674	*target++=(uint8_t)(value>>8);
				4675	*offsets++=sourceIndex;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	4676	U_FALLTHROUGH;
				4677	case 1:
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	4678	*target++=(uint8_t)value;
				4679	*offsets++=sourceIndex;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	4680	U_FALLTHROUGH;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	4681	default:
				4682	/* will never occur */
				4683	break;
				4684	}
				4685	}
				4686	targetCapacity-=length;
				4687	} else {
				4688	uint8_t *charErrorBuffer;
				4689
				4690	/*
				4691	* We actually do this backwards here:
				4692	* In order to save an intermediate variable, we output
				4693	* first to the overflow buffer what does not fit into the
				4694	* regular target.
				4695	*/
				4696	/* we know that 1<=targetCapacity<length<=4 */
				4697	length-=targetCapacity;
				4698	charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
				4699	switch(length) {
				4700	/* each branch falls through to the next one */
				4701	case 3:
				4702	*charErrorBuffer++=(uint8_t)(value>>16);
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	4703	U_FALLTHROUGH;
				4704	case 2:
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	4705	*charErrorBuffer++=(uint8_t)(value>>8);
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	4706	U_FALLTHROUGH;
				4707	case 1:
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	4708	*charErrorBuffer=(uint8_t)value;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	4709	U_FALLTHROUGH;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	4710	default:
				4711	/* will never occur */
				4712	break;
				4713	}
				4714	cnv->charErrorBufferLength=(int8_t)length;
				4715
				4716	/* now output what fits into the regular target */
				4717	value>>=8*length; /* length was reduced by targetCapacity */
				4718	switch(targetCapacity) {
				4719	/* each branch falls through to the next one */
				4720	case 3:
				4721	*target++=(uint8_t)(value>>16);
				4722	if(offsets!=NULL) {
				4723	*offsets++=sourceIndex;
				4724	}
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	4725	U_FALLTHROUGH;
				4726	case 2:
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	4727	*target++=(uint8_t)(value>>8);
				4728	if(offsets!=NULL) {
				4729	*offsets++=sourceIndex;
				4730	}
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	4731	U_FALLTHROUGH;
				4732	case 1:
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	4733	*target++=(uint8_t)value;
				4734	if(offsets!=NULL) {
				4735	*offsets++=sourceIndex;
				4736	}
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	4737	U_FALLTHROUGH;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	4738	default:
				4739	/* will never occur */
				4740	break;
				4741	}
				4742
				4743	/* target overflow */
				4744	targetCapacity=0;
				4745	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				4746	c=0;
				4747	break;
				4748	}
				4749
				4750	/* normal end of conversion: prepare for a new character */
				4751	c=0;
				4752	if(offsets!=NULL) {
				4753	prevSourceIndex=sourceIndex;
				4754	sourceIndex=nextSourceIndex;
				4755	}
				4756	continue;
				4757	} else {
				4758	/* target is full */
				4759	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				4760	break;
				4761	}
				4762	}
				4763
				4764	/*
				4765	* the end of the input stream and detection of truncated input
				4766	* are handled by the framework, but for EBCDIC_STATEFUL conversion
				4767	* we need to emit an SI at the very end
				4768	*
				4769	* conditions:
				4770	* successful
				4771	* EBCDIC_STATEFUL in DBCS mode
				4772	* end of input and no truncated input
				4773	*/
				4774	if( U_SUCCESS(*pErrorCode) &&
				4775	outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
				4776	pArgs->flush && source>=sourceLimit && c==0
				4777	) {
				4778	/* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
				4779	if(targetCapacity>0) {
				4780	*target++=(uint8_t)siBytes[0];
				4781	if (siLength == 2) {
				4782	if (targetCapacity<2) {
				4783	cnv->charErrorBuffer[0]=(uint8_t)siBytes[1];
				4784	cnv->charErrorBufferLength=1;
				4785	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				4786	} else {
				4787	*target++=(uint8_t)siBytes[1];
				4788	}
				4789	}
				4790	if(offsets!=NULL) {
				4791	/* set the last source character's index (sourceIndex points at sourceLimit now) */
				4792	*offsets++=prevSourceIndex;
				4793	}
				4794	} else {
				4795	/* target is full */
				4796	cnv->charErrorBuffer[0]=(uint8_t)siBytes[0];
				4797	if (siLength == 2) {
				4798	cnv->charErrorBuffer[1]=(uint8_t)siBytes[1];
				4799	}
				4800	cnv->charErrorBufferLength=siLength;
				4801	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				4802	}
				4803	prevLength=1; /* we switched into SBCS */
				4804	}
				4805
				4806	/* set the converter state back into UConverter */
				4807	cnv->fromUChar32=c;
				4808	cnv->fromUnicodeStatus=prevLength;
				4809
				4810	/* write back the updated pointers */
				4811	pArgs->source=source;
				4812	pArgs->target=(char *)target;
				4813	pArgs->offsets=offsets;
				4814	}
				4815
				4816	/*
				4817	* This is another simple conversion function for internal use by other
				4818	* conversion implementations.
				4819	* It does not use the converter state nor call callbacks.
				4820	* It does not handle the EBCDIC swaplfnl option (set in UConverter).
				4821	* It handles conversion extensions but not GB 18030.
				4822	*
				4823	* It converts one single Unicode code point into codepage bytes, encoded
				4824	* as one 32-bit value. The function returns the number of bytes in *pValue:
				4825	* 1..4 the number of bytes in *pValue
				4826	* 0 unassigned (*pValue undefined)
				4827	* -1 illegal (currently not used, *pValue undefined)
				4828	*
				4829	* *pValue will contain the resulting bytes with the last byte in bits 7..0,
				4830	* the second to last byte in bits 15..8, etc.
				4831	* Currently, the function assumes but does not check that 0<=c<=0x10ffff.
				4832	*/
				4833	U_CFUNC int32_t
				4834	ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,
				4835	UChar32 c, uint32_t *pValue,
				4836	UBool useFallback) {
				4837	const int32_t *cx;
				4838	const uint16_t *table;
				4839	#if 0
				4840	/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
				4841	const uint8_t *p;
				4842	#endif
				4843	uint32_t stage2Entry;
				4844	uint32_t value;
				4845	int32_t length;
				4846
				4847	/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
				4848	if(c<=0xffff \|\| (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
				4849	table=sharedData->mbcs.fromUnicodeTable;
				4850
				4851	/* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
				4852	if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {
				4853	value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
				4854	/* is this code point assigned, or do we use fallbacks? */
				4855	if(useFallback ? value>=0x800 : value>=0xc00) {
				4856	*pValue=value&0xff;
				4857	return 1;
				4858	}
				4859	} else /* outputType!=MBCS_OUTPUT_1 */ {
				4860	stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
				4861
				4862	/* get the bytes and the length for the output */
				4863	switch(sharedData->mbcs.outputType) {
				4864	case MBCS_OUTPUT_2:
				4865	value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
				4866	if(value<=0xff) {
				4867	length=1;
				4868	} else {
				4869	length=2;
				4870	}
				4871	break;
				4872	#if 0
				4873	/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
				4874	case MBCS_OUTPUT_DBCS_ONLY:
				4875	/* table with single-byte results, but only DBCS mappings used */
				4876	value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
				4877	if(value<=0xff) {
				4878	/* no mapping or SBCS result, not taken for DBCS-only */
				4879	value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
				4880	length=0;
				4881	} else {
				4882	length=2;
				4883	}
				4884	break;
				4885	case MBCS_OUTPUT_3:
				4886	p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
				4887	value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];
				4888	if(value<=0xff) {
				4889	length=1;
				4890	} else if(value<=0xffff) {
				4891	length=2;
				4892	} else {
				4893	length=3;
				4894	}
				4895	break;
				4896	case MBCS_OUTPUT_4:
				4897	value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
				4898	if(value<=0xff) {
				4899	length=1;
				4900	} else if(value<=0xffff) {
				4901	length=2;
				4902	} else if(value<=0xffffff) {
				4903	length=3;
				4904	} else {
				4905	length=4;
				4906	}
				4907	break;
				4908	case MBCS_OUTPUT_3_EUC:
				4909	value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
				4910	/* EUC 16-bit fixed-length representation */
				4911	if(value<=0xff) {
				4912	length=1;
				4913	} else if((value&0x8000)==0) {
				4914	value\|=0x8e8000;
				4915	length=3;
				4916	} else if((value&0x80)==0) {
				4917	value\|=0x8f0080;
				4918	length=3;
				4919	} else {
				4920	length=2;
				4921	}
				4922	break;
				4923	case MBCS_OUTPUT_4_EUC:
				4924	p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
				4925	value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];
				4926	/* EUC 16-bit fixed-length representation applied to the first two bytes */
				4927	if(value<=0xff) {
				4928	length=1;
				4929	} else if(value<=0xffff) {
				4930	length=2;
				4931	} else if((value&0x800000)==0) {
				4932	value\|=0x8e800000;
				4933	length=4;
				4934	} else if((value&0x8000)==0) {
				4935	value\|=0x8f008000;
				4936	length=4;
				4937	} else {
				4938	length=3;
				4939	}
				4940	break;
				4941	#endif
				4942	default:
				4943	/* must not occur */
				4944	return -1;
				4945	}
				4946
				4947	/* is this code point assigned, or do we use fallbacks? */
				4948	if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) \|\|
				4949	(FROM_U_USE_FALLBACK(useFallback, c) && value!=0)
				4950	) {
				4951	/*
				4952	* We allow a 0 byte output if the "assigned" bit is set for this entry.
				4953	* There is no way with this data structure for fallback output
				4954	* to be a zero byte.
				4955	*/
				4956	/* assigned */
				4957	*pValue=value;
				4958	return length;
				4959	}
				4960	}
				4961	}
				4962
				4963	cx=sharedData->mbcs.extIndexes;
				4964	if(cx!=NULL) {
				4965	length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
				4966	return length>=0 ? length : -length; /* return abs(length); */
				4967	}
				4968
				4969	/* unassigned */
				4970	return 0;
				4971	}
				4972
				4973
				4974	#if 0
				4975	/*
				4976	* This function has been moved to ucnv2022.c for inlining.
				4977	* This implementation is here only for documentation purposes
				4978	*/
				4979
				4980	/**
				4981	* This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages.
				4982	* It does not handle the EBCDIC swaplfnl option (set in UConverter).
				4983	* It does not handle conversion extensions (_extFromU()).
				4984	*
				4985	* It returns the codepage byte for the code point, or -1 if it is unassigned.
				4986	*/
				4987	U_CFUNC int32_t
				4988	ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
				4989	UChar32 c,
				4990	UBool useFallback) {
				4991	const uint16_t *table;
				4992	int32_t value;
				4993
				4994	/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
				4995	if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
				4996	return -1;
				4997	}
				4998
				4999	/* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
				5000	table=sharedData->mbcs.fromUnicodeTable;
				5001
				5002	/* get the byte for the output */
				5003	value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
				5004	/* is this code point assigned, or do we use fallbacks? */
				5005	if(useFallback ? value>=0x800 : value>=0xc00) {
				5006	return value&0xff;
				5007	} else {
				5008	return -1;
				5009	}
				5010	}
				5011	#endif
				5012
				5013	/* MBCS-from-UTF-8 conversion functions ------------------------------------- */
				5014
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5015	/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
				5016	static const UChar32
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5017	utf8_offsets[5]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5018
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	5019	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5020	ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
				5021	UConverterToUnicodeArgs *pToUArgs,
				5022	UErrorCode *pErrorCode) {
				5023	UConverter *utf8, *cnv;
				5024	const uint8_t *source, *sourceLimit;
				5025	uint8_t *target;
				5026	int32_t targetCapacity;
				5027
				5028	const uint16_t *table, *sbcsIndex;
				5029	const uint16_t *results;
				5030
				5031	int8_t oldToULength, toULength, toULimit;
				5032
				5033	UChar32 c;
				5034	uint8_t b, t1, t2;
				5035
				5036	uint32_t asciiRoundtrips;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5037	uint16_t value, minValue = 0;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5038	UBool hasSupplementary;
				5039
				5040	/* set up the local pointers */
				5041	utf8=pToUArgs->converter;
				5042	cnv=pFromUArgs->converter;
				5043	source=(uint8_t *)pToUArgs->source;
				5044	sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
				5045	target=(uint8_t *)pFromUArgs->target;
				5046	targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
				5047
				5048	table=cnv->sharedData->mbcs.fromUnicodeTable;
				5049	sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
				5050	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
				5051	results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
				5052	} else {
				5053	results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
				5054	}
				5055	asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
				5056
				5057	if(cnv->useFallback) {
				5058	/* use all roundtrip and fallback results */
				5059	minValue=0x800;
				5060	} else {
				5061	/* use only roundtrips and fallbacks from private-use characters */
				5062	minValue=0xc00;
				5063	}
				5064	hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
				5065
				5066	/* get the converter state from the UTF-8 UConverter */
				5067	c=(UChar32)utf8->toUnicodeStatus;
				5068	if(c!=0) {
				5069	toULength=oldToULength=utf8->toULength;
				5070	toULimit=(int8_t)utf8->mode;
				5071	} else {
				5072	toULength=oldToULength=toULimit=0;
				5073	}
				5074
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5075	// The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
				5076	// If the buffer ends with a truncated 2- or 3-byte sequence,
				5077	// then we reduce the sourceLimit to before that,
				5078	// and collect the remaining bytes after the conversion loop.
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5079	{
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5080	// Do not go back into the bytes that will be read for finishing a partial
				5081	// sequence from the previous buffer.
				5082	int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
				5083	if(length>0) {
				5084	uint8_t b1=*(sourceLimit-1);
				5085	if(U8_IS_SINGLE(b1)) {
				5086	// common ASCII character
				5087	} else if(U8_IS_TRAIL(b1) && length>=2) {
				5088	uint8_t b2=*(sourceLimit-2);
				5089	if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
				5090	// truncated 3-byte sequence
				5091	sourceLimit-=2;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5092	}
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5093	} else if(0xc2<=b1 && b1<0xf0) {
				5094	// truncated 2- or 3-byte sequence
				5095	--sourceLimit;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5096	}
				5097	}
				5098	}
				5099
				5100	if(c!=0 && targetCapacity>0) {
				5101	utf8->toUnicodeStatus=0;
				5102	utf8->toULength=0;
				5103	goto moreBytes;
				5104	/*
				5105	* Note: We could avoid the goto by duplicating some of the moreBytes
				5106	* code, but only up to the point of collecting a complete UTF-8
				5107	* sequence; then recurse for the toUBytes[toULength]
				5108	* and then continue with normal conversion.
				5109	*
				5110	* If so, move this code to just after initializing the minimum
				5111	* set of local variables for reading the UTF-8 input
				5112	* (utf8, source, target, limits but not cnv, table, minValue, etc.).
				5113	*
				5114	* Potential advantages:
				5115	* - avoid the goto
				5116	* - oldToULength could become a local variable in just those code blocks
				5117	* that deal with buffer boundaries
				5118	* - possibly faster if the goto prevents some compiler optimizations
				5119	* (this would need measuring to confirm)
				5120	* Disadvantage:
				5121	* - code duplication
				5122	*/
				5123	}
				5124
				5125	/* conversion loop */
				5126	while(source<sourceLimit) {
				5127	if(targetCapacity>0) {
				5128	b=*source++;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5129	if(U8_IS_SINGLE(b)) {
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5130	/* convert ASCII */
				5131	if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
				5132	*target++=(uint8_t)b;
				5133	--targetCapacity;
				5134	continue;
				5135	} else {
				5136	c=b;
				5137	value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
				5138	}
				5139	} else {
				5140	if(b<0xe0) {
				5141	if( /* handle U+0080..U+07FF inline */
				5142	b>=0xc2 &&
				5143	(t1=(uint8_t)(*source-0x80)) <= 0x3f
				5144	) {
				5145	c=b&0x1f;
				5146	++source;
				5147	value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
				5148	if(value>=minValue) {
				5149	*target++=(uint8_t)value;
				5150	--targetCapacity;
				5151	continue;
				5152	} else {
				5153	c=(c<<6)\|t1;
				5154	}
				5155	} else {
				5156	c=-1;
				5157	}
				5158	} else if(b==0xe0) {
				5159	if( /* handle U+0800..U+0FFF inline */
				5160	(t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&
				5161	(t2=(uint8_t)(source[1]-0x80)) <= 0x3f
				5162	) {
				5163	c=t1;
				5164	source+=2;
				5165	value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
				5166	if(value>=minValue) {
				5167	*target++=(uint8_t)value;
				5168	--targetCapacity;
				5169	continue;
				5170	} else {
				5171	c=(c<<6)\|t2;
				5172	}
				5173	} else {
				5174	c=-1;
				5175	}
				5176	} else {
				5177	c=-1;
				5178	}
				5179
				5180	if(c<0) {
				5181	/* handle "complicated" and error cases, and continuing partial characters */
				5182	oldToULength=0;
				5183	toULength=1;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5184	toULimit=U8_COUNT_BYTES_NON_ASCII(b);
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5185	c=b;
				5186	moreBytes:
				5187	while(toULength<toULimit) {
				5188	/*
				5189	* The sourceLimit may have been adjusted before the conversion loop
				5190	* to stop before a truncated sequence.
				5191	* Here we need to use the real limit in case we have two truncated
				5192	* sequences at the end.
				5193	* See ticket #7492.
				5194	*/
				5195	if(source<(uint8_t *)pToUArgs->sourceLimit) {
				5196	b=*source;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5197	if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5198	++source;
				5199	++toULength;
				5200	c=(c<<6)+b;
				5201	} else {
				5202	break; /* sequence too short, stop with toULength<toULimit */
				5203	}
				5204	} else {
				5205	/* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
				5206	source-=(toULength-oldToULength);
				5207	while(oldToULength<toULength) {
				5208	utf8->toUBytes[oldToULength++]=*source++;
				5209	}
				5210	utf8->toUnicodeStatus=c;
				5211	utf8->toULength=toULength;
				5212	utf8->mode=toULimit;
				5213	pToUArgs->source=(char *)source;
				5214	pFromUArgs->target=(char *)target;
				5215	return;
				5216	}
				5217	}
				5218
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5219	if(toULength==toULimit) {
				5220	c-=utf8_offsets[toULength];
				5221	if(toULength<=3) { /* BMP */
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5222	value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5223	} else {
				5224	/* supplementary code point */
				5225	if(!hasSupplementary) {
				5226	/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
				5227	value=0;
				5228	} else {
				5229	value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
				5230	}
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5231	}
				5232	} else {
				5233	/* error handling: illegal UTF-8 byte sequence */
				5234	source-=(toULength-oldToULength);
				5235	while(oldToULength<toULength) {
				5236	utf8->toUBytes[oldToULength++]=*source++;
				5237	}
				5238	utf8->toULength=toULength;
				5239	pToUArgs->source=(char *)source;
				5240	pFromUArgs->target=(char *)target;
				5241	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				5242	return;
				5243	}
				5244	}
				5245	}
				5246
				5247	if(value>=minValue) {
				5248	/* output the mapping for c */
				5249	*target++=(uint8_t)value;
				5250	--targetCapacity;
				5251	} else {
				5252	/* value<minValue means c is unassigned (unmappable) */
				5253	/*
				5254	* Try an extension mapping.
				5255	* Pass in no source because we don't have UTF-16 input.
				5256	* If we have a partial match on c, we will return and revert
				5257	* to UTF-8->UTF-16->charset conversion.
				5258	*/
				5259	static const UChar nul=0;
				5260	const UChar *noSource=&nul;
				5261	c=_extFromU(cnv, cnv->sharedData,
				5262	c, &noSource, noSource,
				5263	&target, target+targetCapacity,
				5264	NULL, -1,
				5265	pFromUArgs->flush,
				5266	pErrorCode);
				5267
				5268	if(U_FAILURE(*pErrorCode)) {
				5269	/* not mappable or buffer overflow */
				5270	cnv->fromUChar32=c;
				5271	break;
				5272	} else if(cnv->preFromUFirstCP>=0) {
				5273	/*
				5274	* Partial match, return and revert to pivoting.
				5275	* In normal from-UTF-16 conversion, we would just continue
				5276	* but then exit the loop because the extension match would
				5277	* have consumed the source.
				5278	*/
				5279	*pErrorCode=U_USING_DEFAULT_WARNING;
				5280	break;
				5281	} else {
				5282	/* a mapping was written to the target, continue */
				5283
				5284	/* recalculate the targetCapacity after an extension mapping */
				5285	targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
				5286	}
				5287	}
				5288	} else {
				5289	/* target is full */
				5290	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				5291	break;
				5292	}
				5293	}
				5294
				5295	/*
				5296	* The sourceLimit may have been adjusted before the conversion loop
				5297	* to stop before a truncated sequence.
				5298	* If so, then collect the truncated sequence now.
				5299	*/
				5300	if(U_SUCCESS(*pErrorCode) &&
				5301	cnv->preFromUFirstCP<0 &&
				5302	source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
				5303	c=utf8->toUBytes[0]=b=*source++;
				5304	toULength=1;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5305	toULimit=U8_COUNT_BYTES(b);
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5306	while(source<sourceLimit) {
				5307	utf8->toUBytes[toULength++]=b=*source++;
				5308	c=(c<<6)+b;
				5309	}
				5310	utf8->toUnicodeStatus=c;
				5311	utf8->toULength=toULength;
				5312	utf8->mode=toULimit;
				5313	}
				5314
				5315	/* write back the updated pointers */
				5316	pToUArgs->source=(char *)source;
				5317	pFromUArgs->target=(char *)target;
				5318	}
				5319
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	5320	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5321	ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
				5322	UConverterToUnicodeArgs *pToUArgs,
				5323	UErrorCode *pErrorCode) {
				5324	UConverter utf8, cnv;
				5325	const uint8_t source, sourceLimit;
				5326	uint8_t *target;
				5327	int32_t targetCapacity;
				5328
				5329	const uint16_t table, mbcsIndex;
				5330	const uint16_t *results;
				5331
				5332	int8_t oldToULength, toULength, toULimit;
				5333
				5334	UChar32 c;
				5335	uint8_t b, t1, t2;
				5336
				5337	uint32_t stage2Entry;
				5338	uint32_t asciiRoundtrips;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5339	uint16_t value = 0;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5340	UBool hasSupplementary;
				5341
				5342	/* set up the local pointers */
				5343	utf8=pToUArgs->converter;
				5344	cnv=pFromUArgs->converter;
				5345	source=(uint8_t *)pToUArgs->source;
				5346	sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
				5347	target=(uint8_t *)pFromUArgs->target;
				5348	targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
				5349
				5350	table=cnv->sharedData->mbcs.fromUnicodeTable;
				5351	mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
				5352	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
				5353	results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
				5354	} else {
				5355	results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
				5356	}
				5357	asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
				5358
				5359	hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
				5360
				5361	/* get the converter state from the UTF-8 UConverter */
				5362	c=(UChar32)utf8->toUnicodeStatus;
				5363	if(c!=0) {
				5364	toULength=oldToULength=utf8->toULength;
				5365	toULimit=(int8_t)utf8->mode;
				5366	} else {
				5367	toULength=oldToULength=toULimit=0;
				5368	}
				5369
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5370	// The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
				5371	// If the buffer ends with a truncated 2- or 3-byte sequence,
				5372	// then we reduce the sourceLimit to before that,
				5373	// and collect the remaining bytes after the conversion loop.
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5374	{
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5375	// Do not go back into the bytes that will be read for finishing a partial
				5376	// sequence from the previous buffer.
				5377	int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
				5378	if(length>0) {
				5379	uint8_t b1=*(sourceLimit-1);
				5380	if(U8_IS_SINGLE(b1)) {
				5381	// common ASCII character
				5382	} else if(U8_IS_TRAIL(b1) && length>=2) {
				5383	uint8_t b2=*(sourceLimit-2);
				5384	if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
				5385	// truncated 3-byte sequence
				5386	sourceLimit-=2;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5387	}
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5388	} else if(0xc2<=b1 && b1<0xf0) {
				5389	// truncated 2- or 3-byte sequence
				5390	--sourceLimit;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5391	}
				5392	}
				5393	}
				5394
				5395	if(c!=0 && targetCapacity>0) {
				5396	utf8->toUnicodeStatus=0;
				5397	utf8->toULength=0;
				5398	goto moreBytes;
				5399	/* See note in ucnv_SBCSFromUTF8() about this goto. */
				5400	}
				5401
				5402	/* conversion loop */
				5403	while(source<sourceLimit) {
				5404	if(targetCapacity>0) {
				5405	b=*source++;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5406	if(U8_IS_SINGLE(b)) {
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5407	/* convert ASCII */
				5408	if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
				5409	*target++=b;
				5410	--targetCapacity;
				5411	continue;
				5412	} else {
				5413	value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
				5414	if(value==0) {
				5415	c=b;
				5416	goto unassigned;
				5417	}
				5418	}
				5419	} else {
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5420	if(b>=0xe0) {
				5421	if( /* handle U+0800..U+D7FF inline */
				5422	b<=0xed && // do not assume maxFastUChar>0xd7ff
				5423	U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5424	(t2=(uint8_t)(source[1]-0x80)) <= 0x3f
				5425	) {
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5426	c=((b&0xf)<<6)\|(t1&0x3f);
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5427	source+=2;
				5428	value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
				5429	if(value==0) {
				5430	c=(c<<6)\|t2;
				5431	goto unassigned;
				5432	}
				5433	} else {
				5434	c=-1;
				5435	}
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5436	} else {
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5437	if( /* handle U+0080..U+07FF inline */
				5438	b>=0xc2 &&
				5439	(t1=(uint8_t)(*source-0x80)) <= 0x3f
				5440	) {
				5441	c=b&0x1f;
				5442	++source;
				5443	value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
				5444	if(value==0) {
				5445	c=(c<<6)\|t1;
				5446	goto unassigned;
				5447	}
				5448	} else {
				5449	c=-1;
				5450	}
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5451	}
				5452
				5453	if(c<0) {
				5454	/* handle "complicated" and error cases, and continuing partial characters */
				5455	oldToULength=0;
				5456	toULength=1;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5457	toULimit=U8_COUNT_BYTES_NON_ASCII(b);
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5458	c=b;
				5459	moreBytes:
				5460	while(toULength<toULimit) {
				5461	/*
				5462	* The sourceLimit may have been adjusted before the conversion loop
				5463	* to stop before a truncated sequence.
				5464	* Here we need to use the real limit in case we have two truncated
				5465	* sequences at the end.
				5466	* See ticket #7492.
				5467	*/
				5468	if(source<(uint8_t *)pToUArgs->sourceLimit) {
				5469	b=*source;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5470	if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5471	++source;
				5472	++toULength;
				5473	c=(c<<6)+b;
				5474	} else {
				5475	break; /* sequence too short, stop with toULength<toULimit */
				5476	}
				5477	} else {
				5478	/* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
				5479	source-=(toULength-oldToULength);
				5480	while(oldToULength<toULength) {
				5481	utf8->toUBytes[oldToULength++]=*source++;
				5482	}
				5483	utf8->toUnicodeStatus=c;
				5484	utf8->toULength=toULength;
				5485	utf8->mode=toULimit;
				5486	pToUArgs->source=(char *)source;
				5487	pFromUArgs->target=(char *)target;
				5488	return;
				5489	}
				5490	}
				5491
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5492	if(toULength==toULimit) {
				5493	c-=utf8_offsets[toULength];
				5494	if(toULength<=3) { /* BMP */
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5495	stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5496	} else {
				5497	/* supplementary code point */
				5498	if(!hasSupplementary) {
				5499	/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
				5500	stage2Entry=0;
				5501	} else {
				5502	stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
				5503	}
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5504	}
				5505	} else {
				5506	/* error handling: illegal UTF-8 byte sequence */
				5507	source-=(toULength-oldToULength);
				5508	while(oldToULength<toULength) {
				5509	utf8->toUBytes[oldToULength++]=*source++;
				5510	}
				5511	utf8->toULength=toULength;
				5512	pToUArgs->source=(char *)source;
				5513	pFromUArgs->target=(char *)target;
				5514	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				5515	return;
				5516	}
				5517
				5518	/* get the bytes and the length for the output */
				5519	/* MBCS_OUTPUT_2 */
				5520	value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);
				5521
				5522	/* is this code point assigned, or do we use fallbacks? */
				5523	if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) \|\|
				5524	(UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
				5525	) {
				5526	goto unassigned;
				5527	}
				5528	}
				5529	}
				5530
				5531	/* write the output character bytes from value and length */
				5532	/* from the first if in the loop we know that targetCapacity>0 */
				5533	if(value<=0xff) {
				5534	/* this is easy because we know that there is enough space */
				5535	*target++=(uint8_t)value;
				5536	--targetCapacity;
				5537	} else /* length==2 */ {
				5538	*target++=(uint8_t)(value>>8);
				5539	if(2<=targetCapacity) {
				5540	*target++=(uint8_t)value;
				5541	targetCapacity-=2;
				5542	} else {
				5543	cnv->charErrorBuffer[0]=(char)value;
				5544	cnv->charErrorBufferLength=1;
				5545
				5546	/* target overflow */
				5547	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				5548	break;
				5549	}
				5550	}
				5551	continue;
				5552
				5553	unassigned:
				5554	{
				5555	/*
				5556	* Try an extension mapping.
				5557	* Pass in no source because we don't have UTF-16 input.
				5558	* If we have a partial match on c, we will return and revert
				5559	* to UTF-8->UTF-16->charset conversion.
				5560	*/
				5561	static const UChar nul=0;
				5562	const UChar *noSource=&nul;
				5563	c=_extFromU(cnv, cnv->sharedData,
				5564	c, &noSource, noSource,
				5565	&target, target+targetCapacity,
				5566	NULL, -1,
				5567	pFromUArgs->flush,
				5568	pErrorCode);
				5569
				5570	if(U_FAILURE(*pErrorCode)) {
				5571	/* not mappable or buffer overflow */
				5572	cnv->fromUChar32=c;
				5573	break;
				5574	} else if(cnv->preFromUFirstCP>=0) {
				5575	/*
				5576	* Partial match, return and revert to pivoting.
				5577	* In normal from-UTF-16 conversion, we would just continue
				5578	* but then exit the loop because the extension match would
				5579	* have consumed the source.
				5580	*/
				5581	*pErrorCode=U_USING_DEFAULT_WARNING;
				5582	break;
				5583	} else {
				5584	/* a mapping was written to the target, continue */
				5585
				5586	/* recalculate the targetCapacity after an extension mapping */
				5587	targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
				5588	continue;
				5589	}
				5590	}
				5591	} else {
				5592	/* target is full */
				5593	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				5594	break;
				5595	}
				5596	}
				5597
				5598	/*
				5599	* The sourceLimit may have been adjusted before the conversion loop
				5600	* to stop before a truncated sequence.
				5601	* If so, then collect the truncated sequence now.
				5602	*/
				5603	if(U_SUCCESS(*pErrorCode) &&
				5604	cnv->preFromUFirstCP<0 &&
				5605	source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
				5606	c=utf8->toUBytes[0]=b=*source++;
				5607	toULength=1;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	5608	toULimit=U8_COUNT_BYTES(b);
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5609	while(source<sourceLimit) {
				5610	utf8->toUBytes[toULength++]=b=*source++;
				5611	c=(c<<6)+b;
				5612	}
				5613	utf8->toUnicodeStatus=c;
				5614	utf8->toULength=toULength;
				5615	utf8->mode=toULimit;
				5616	}
				5617
				5618	/* write back the updated pointers */
				5619	pToUArgs->source=(char *)source;
				5620	pFromUArgs->target=(char *)target;
				5621	}
				5622
				5623	/* miscellaneous ------------------------------------------------------------ */
				5624
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	5625	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5626	ucnv_MBCSGetStarters(const UConverter* cnv,
				5627	UBool starters[256],
				5628	UErrorCode *) {
				5629	const int32_t *state0;
				5630	int i;
				5631
				5632	state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
				5633	for(i=0; i<256; ++i) {
				5634	/* all bytes that cause a state transition from state 0 are lead bytes */
				5635	starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
				5636	}
				5637	}
				5638
				5639	/*
				5640	* This is an internal function that allows other converter implementations
				5641	* to check whether a byte is a lead byte.
				5642	*/
				5643	U_CFUNC UBool
				5644	ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
				5645	return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]);
				5646	}
				5647
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	5648	static void U_CALLCONV
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	5649	ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
				5650	int32_t offsetIndex,
				5651	UErrorCode *pErrorCode) {
				5652	UConverter *cnv=pArgs->converter;
				5653	char p, subchar;
				5654	char buffer[4];
				5655	int32_t length;
				5656
				5657	/* first, select between subChar and subChar1 */
				5658	if( cnv->subChar1!=0 &&
				5659	(cnv->sharedData->mbcs.extIndexes!=NULL ?
				5660	cnv->useSubChar1 :
				5661	(cnv->invalidUCharBuffer[0]<=0xff))
				5662	) {
				5663	/* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
				5664	subchar=(char *)&cnv->subChar1;
				5665	length=1;
				5666	} else {
				5667	/* select subChar in all other cases */
				5668	subchar=(char *)cnv->subChars;
				5669	length=cnv->subCharLen;
				5670	}
				5671
				5672	/* reset the selector for the next code point */
				5673	cnv->useSubChar1=FALSE;
				5674
				5675	if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) {
				5676	p=buffer;
				5677
				5678	/* fromUnicodeStatus contains prevLength */
				5679	switch(length) {
				5680	case 1:
				5681	if(cnv->fromUnicodeStatus==2) {
				5682	/* DBCS mode and SBCS sub char: change to SBCS */
				5683	cnv->fromUnicodeStatus=1;
				5684	*p++=UCNV_SI;
				5685	}
				5686	*p++=subchar[0];
				5687	break;
				5688	case 2:
				5689	if(cnv->fromUnicodeStatus<=1) {
				5690	/* SBCS mode and DBCS sub char: change to DBCS */
				5691	cnv->fromUnicodeStatus=2;
				5692	*p++=UCNV_SO;
				5693	}
				5694	*p++=subchar[0];
				5695	*p++=subchar[1];
				5696	break;
				5697	default:
				5698	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
				5699	return;
				5700	}
				5701	subchar=buffer;
				5702	length=(int32_t)(p-buffer);
				5703	}
				5704
				5705	ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode);
				5706	}
				5707
				5708	U_CFUNC UConverterType
				5709	ucnv_MBCSGetType(const UConverter* converter) {
				5710	/* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
				5711	if(converter->sharedData->mbcs.countStates==1) {
				5712	return (UConverterType)UCNV_SBCS;
				5713	} else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {
				5714	return (UConverterType)UCNV_EBCDIC_STATEFUL;
				5715	} else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) {
				5716	return (UConverterType)UCNV_DBCS;
				5717	}
				5718	return (UConverterType)UCNV_MBCS;
				5719	}
				5720
				5721	#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */