Blame - source/common/ucnv_u7.cpp - chromium.googlesource.com/chromium/deps/icu

blob: de9f3f42ec9724e007709b3399cc66f4b3f4640f [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	3	/*
				4	**********************************************************************
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	5	* Copyright (C) 2002-2016, International Business Machines
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	6	* Corporation and others. All Rights Reserved.
				7	**********************************************************************
				8	* file name: ucnv_u7.c
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	9	* encoding: UTF-8
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	10	* tab size: 8 (not used)
				11	* indentation:4
				12	*
				13	* created on: 2002jul01
				14	* created by: Markus W. Scherer
				15	*
				16	* UTF-7 converter implementation. Used to be in ucnv_utf.c.
				17	*/
				18
				19	#include "unicode/utypes.h"
				20
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame]	21	#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	22
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	23	#include "cmemory.h"
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	24	#include "unicode/ucnv.h"
				25	#include "ucnv_bld.h"
				26	#include "ucnv_cnv.h"
				27	#include "uassert.h"
				28
				29	/* UTF-7 -------------------------------------------------------------------- */
				30
/*
 * UTF-7 is a stateful encoding of Unicode.
 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
 * It was intended for use in Internet email systems, using in its bytewise
 * encoding only a subset of 7-bit US-ASCII.
 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but is still
 * occasionally used.
 *
 * For converting Unicode to UTF-7, the RFC allows some US-ASCII characters
 * to be encoded either directly or in base64. In particular, the characters
 * in set O as defined in the RFC (see below) may be encoded directly but are
 * not allowed in, e.g., email headers.
 * By default, the ICU UTF-7 converter encodes set O directly.
 * By choosing the option "version=1", set O will be escaped instead.
 * For example:
 *     utf7Converter=ucnv_open("UTF-7,version=1");
 *
 * For details about email headers see RFC 2047.
 */
				50
/*
 * Tests for US-ASCII characters belonging to character classes
 * defined in UTF-7.
 *
 * Set D (directly encoded characters) consists of the following
 * characters: the upper and lower case letters A through Z
 * and a through z, the 10 digits 0-9, and the following nine special
 * characters (note that "+" and "=" are omitted):
 *     '(),-./:?
 *
 * Set O (optional direct characters) consists of the following
 * characters (note that "\" and "~" are omitted):
 *     !"#$%&*;<=>@[]^_`{|}
 *
 * According to the rules in RFC 2152, the byte values for the following
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 * - all C0 control codes except for CR LF TAB
 * - BACKSLASH
 * - TILDE
 * - DEL
 * - all codes beyond US-ASCII, i.e. all >127
 */
				73	#define inSetD(c) \
				74	((uint8_t)((c)-97)<26 \|\| (uint8_t)((c)-65)<26 \|\| /* letters */ \
				75	(uint8_t)((c)-48)<10 \|\| /* digits */ \
				76	(uint8_t)((c)-39)<3 \|\| /* '() */ \
				77	(uint8_t)((c)-44)<4 \|\| /* ,-./ */ \
				78	(c)==58 \|\| (c)==63 /* :? */ \
				79	)
				80
				81	#define inSetO(c) \
				82	((uint8_t)((c)-33)<6 \|\| /* !"#$%& */ \
				83	(uint8_t)((c)-59)<4 \|\| /* ;<=> */ \
				84	(uint8_t)((c)-93)<4 \|\| /* ]^_` */ \
				85	(uint8_t)((c)-123)<3 \|\| /* {\|} */ \
				86	(c)==42 \|\| (c)==64 \|\| (c)==91 /* @[ / \
				87	)
				88
				89	#define isCRLFTAB(c) ((c)==13 \|\| (c)==10 \|\| (c)==9)
				90	#define isCRLFSPTAB(c) ((c)==32 \|\| (c)==13 \|\| (c)==10 \|\| (c)==9)
				91
				92	#define PLUS 43
				93	#define MINUS 45
				94	#define BACKSLASH 92
				95	#define TILDE 126
				96
				97	/* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */
				98	#define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) \|\| isCRLFTAB(c))
				99
				100	/* encode directly sets D and O and CR LF SP TAB */
				101	static const UBool encodeDirectlyMaximum[128]={
				102	/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
				103	0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
				104	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				105
				106	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
				107	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				108
				109	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				110	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
				111
				112	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				113	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
				114	};
				115
				116	/* encode directly set D and CR LF SP TAB but not set O */
				117	static const UBool encodeDirectlyRestricted[128]={
				118	/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
				119	0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
				120	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				121
				122	1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
				123	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
				124
				125	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				126	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
				127
				128	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				129	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
				130	};
				131
				132	static const uint8_t
				133	toBase64[64]={
				134	/* A-Z */
				135	65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
				136	78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
				137	/* a-z */
				138	97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
				139	110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
				140	/* 0-9 */
				141	48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
				142	/* +/ */
				143	43, 47
				144	};
				145
				146	static const int8_t
				147	fromBase64[128]={
				148	/* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
				149	-3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
				150	-3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
				151
				152	/* general punctuation with + and / and a special value (-2) for - */
				153	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
				154	/* digits */
				155	52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
				156
				157	/* A-Z */
				158	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
				159	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
				160
				161	/* a-z */
				162	-1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
				163	41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
				164	};
				165
				166	/*
				167	* converter status values:
				168	*
				169	* toUnicodeStatus:
				170	* 24 inDirectMode (boolean)
				171	* 23..16 base64Counter (-1..7)
				172	* 15..0 bits (up to 14 bits incoming base64)
				173	*
				174	* fromUnicodeStatus:
				175	* 31..28 version (0: set O direct 1: set O escaped)
				176	* 24 inDirectMode (boolean)
				177	* 23..16 base64Counter (0..2)
				178	* 7..0 bits (6 bits outgoing base64)
				179	*
				180	*/
				181
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	182	U_CDECL_BEGIN
				183	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	184	_UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
				185	if(choice<=UCNV_RESET_TO_UNICODE) {
				186	/* reset toUnicode */
				187	cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
				188	cnv->toULength=0;
				189	}
				190	if(choice!=UCNV_RESET_TO_UNICODE) {
				191	/* reset fromUnicode */
				192	cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)\|0x1000000; /* keep version, inDirectMode=TRUE */
				193	}
				194	}
				195
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	196	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	197	_UTF7Open(UConverter *cnv,
				198	UConverterLoadArgs *pArgs,
				199	UErrorCode *pErrorCode) {
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	200	(void)pArgs;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	201	if(UCNV_GET_VERSION(cnv)<=1) {
				202	/* TODO(markus): Should just use cnv->options rather than copying the version number. */
				203	cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
				204	_UTF7Reset(cnv, UCNV_RESET_BOTH);
				205	} else {
				206	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
				207	}
				208	}
				209
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	210	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	211	_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
				212	UErrorCode *pErrorCode) {
				213	UConverter *cnv;
				214	const uint8_t source, sourceLimit;
				215	UChar *target;
				216	const UChar *targetLimit;
				217	int32_t *offsets;
				218
				219	uint8_t *bytes;
				220	uint8_t byteIndex;
				221
				222	int32_t length, targetCapacity;
				223
				224	/* UTF-7 state */
				225	uint16_t bits;
				226	int8_t base64Counter;
				227	UBool inDirectMode;
				228
				229	int8_t base64Value;
				230
				231	int32_t sourceIndex, nextSourceIndex;
				232
				233	uint8_t b;
				234	/* set up the local pointers */
				235	cnv=pArgs->converter;
				236
				237	source=(const uint8_t *)pArgs->source;
				238	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
				239	target=pArgs->target;
				240	targetLimit=pArgs->targetLimit;
				241	offsets=pArgs->offsets;
				242	/* get the state machine state */
				243	{
				244	uint32_t status=cnv->toUnicodeStatus;
				245	inDirectMode=(UBool)((status>>24)&1);
				246	base64Counter=(int8_t)(status>>16);
				247	bits=(uint16_t)status;
				248	}
				249	bytes=cnv->toUBytes;
				250	byteIndex=cnv->toULength;
				251
				252	/* sourceIndex=-1 if the current character began in the previous buffer */
				253	sourceIndex=byteIndex==0 ? 0 : -1;
				254	nextSourceIndex=0;
				255
				256	if(inDirectMode) {
				257	directMode:
				258	/*
				259	* In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
				260	* with their US-ASCII byte values.
				261	* Backslash and Tilde and most control characters are not allowed in UTF-7.
				262	* A plus sign starts Unicode (or "escape") Mode.
				263	*
				264	* In Direct Mode, only the sourceIndex is used.
				265	*/
				266	byteIndex=0;
				267	length=(int32_t)(sourceLimit-source);
				268	targetCapacity=(int32_t)(targetLimit-target);
				269	if(length>targetCapacity) {
				270	length=targetCapacity;
				271	}
				272	while(length>0) {
				273	b=*source++;
				274	if(!isLegalUTF7(b)) {
				275	/* illegal */
				276	bytes[0]=b;
				277	byteIndex=1;
				278	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				279	break;
				280	} else if(b!=PLUS) {
				281	/* write directly encoded character */
				282	*target++=b;
				283	if(offsets!=NULL) {
				284	*offsets++=sourceIndex++;
				285	}
				286	} else /* PLUS */ {
				287	/* switch to Unicode mode */
				288	nextSourceIndex=++sourceIndex;
				289	inDirectMode=FALSE;
				290	byteIndex=0;
				291	bits=0;
				292	base64Counter=-1;
				293	goto unicodeMode;
				294	}
				295	--length;
				296	}
				297	if(source<sourceLimit && target>=targetLimit) {
				298	/* target is full */
				299	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				300	}
				301	} else {
				302	unicodeMode:
				303	/*
				304	* In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
				305	* The base64 sequence ends with any character that is not in the base64 alphabet.
				306	* A terminating minus sign is consumed.
				307	*
				308	* In Unicode Mode, the sourceIndex has the index to the start of the current
				309	* base64 bytes, while nextSourceIndex is precisely parallel to source,
				310	* keeping the index to the following byte.
				311	* Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
				312	*/
				313	while(source<sourceLimit) {
				314	if(target<targetLimit) {
				315	bytes[byteIndex++]=b=*source++;
				316	++nextSourceIndex;
				317	base64Value = -3; /* initialize as illegal */
				318	if(b>=126 \|\| (base64Value=fromBase64[b])==-3 \|\| base64Value==-1) {
				319	/* either
				320	* base64Value==-1 for any legal character except base64 and minus sign, or
				321	* base64Value==-3 for illegal characters:
				322	* 1. In either case, leave Unicode mode.
				323	* 2.1. If we ended with an incomplete UChar or none after the +, then
				324	* generate an error for the preceding erroneous sequence and deal with
				325	* the current (possibly illegal) character next time through.
				326	* 2.2. Else the current char comes after a complete UChar, which was already
				327	* pushed to the output buf, so:
				328	* 2.2.1. If the current char is legal, just save it for processing next time.
				329	* It may be for example, a plus which we need to deal with in direct mode.
				330	* 2.2.2. Else if the current char is illegal, we might as well deal with it here.
				331	*/
				332	inDirectMode=TRUE;
				333	if(base64Counter==-1) {
				334	/* illegal: + immediately followed by something other than base64 or minus sign */
				335	/* include the plus sign in the reported sequence, but not the subsequent char */
				336	--source;
				337	bytes[0]=PLUS;
				338	byteIndex=1;
				339	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				340	break;
				341	} else if(bits!=0) {
				342	/* bits are illegally left over, a UChar is incomplete */
				343	/* don't include current char (legal or illegal) in error seq */
				344	--source;
				345	--byteIndex;
				346	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				347	break;
				348	} else {
				349	/* previous UChar was complete */
				350	if(base64Value==-3) {
				351	/* current character is illegal, deal with it here */
				352	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				353	break;
				354	} else {
				355	/* un-read the current character in case it is a plus sign */
				356	--source;
				357	sourceIndex=nextSourceIndex-1;
				358	goto directMode;
				359	}
				360	}
				361	} else if(base64Value>=0) {
				362	/* collect base64 bytes into UChars */
				363	switch(base64Counter) {
				364	case -1: /* -1 is immediately after the + */
				365	case 0:
				366	bits=base64Value;
				367	base64Counter=1;
				368	break;
				369	case 1:
				370	case 3:
				371	case 4:
				372	case 6:
				373	bits=(uint16_t)((bits<<6)\|base64Value);
				374	++base64Counter;
				375	break;
				376	case 2:
				377	*target++=(UChar)((bits<<4)\|(base64Value>>2));
				378	if(offsets!=NULL) {
				379	*offsets++=sourceIndex;
				380	sourceIndex=nextSourceIndex-1;
				381	}
				382	bytes[0]=b; /* keep this byte in case an error occurs */
				383	byteIndex=1;
				384	bits=(uint16_t)(base64Value&3);
				385	base64Counter=3;
				386	break;
				387	case 5:
				388	*target++=(UChar)((bits<<2)\|(base64Value>>4));
				389	if(offsets!=NULL) {
				390	*offsets++=sourceIndex;
				391	sourceIndex=nextSourceIndex-1;
				392	}
				393	bytes[0]=b; /* keep this byte in case an error occurs */
				394	byteIndex=1;
				395	bits=(uint16_t)(base64Value&15);
				396	base64Counter=6;
				397	break;
				398	case 7:
				399	*target++=(UChar)((bits<<6)\|base64Value);
				400	if(offsets!=NULL) {
				401	*offsets++=sourceIndex;
				402	sourceIndex=nextSourceIndex;
				403	}
				404	byteIndex=0;
				405	bits=0;
				406	base64Counter=0;
				407	break;
				408	default:
				409	/* will never occur */
				410	break;
				411	}
				412	} else /base64Value==-2/ {
				413	/* minus sign terminates the base64 sequence */
				414	inDirectMode=TRUE;
				415	if(base64Counter==-1) {
				416	/* +- i.e. a minus immediately following a plus */
				417	*target++=PLUS;
				418	if(offsets!=NULL) {
				419	*offsets++=sourceIndex-1;
				420	}
				421	} else {
				422	/* absorb the minus and leave the Unicode Mode */
				423	if(bits!=0) {
				424	/* bits are illegally left over, a UChar is incomplete */
				425	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				426	break;
				427	}
				428	}
				429	sourceIndex=nextSourceIndex;
				430	goto directMode;
				431	}
				432	} else {
				433	/* target is full */
				434	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				435	break;
				436	}
				437	}
				438	}
				439
				440	if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
				441	/*
				442	* if we are in Unicode mode, then the byteIndex might not be 0,
				443	* but that is ok if bits==0
				444	* -> we set byteIndex=0 at the end of the stream to avoid a truncated error
				445	* (not true for IMAP-mailbox-name where we must end in direct mode)
				446	*/
				447	byteIndex=0;
				448	}
				449
				450	/* set the converter state back into UConverter */
				451	cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)\|((uint32_t)((uint8_t)base64Counter)<<16)\|(uint32_t)bits;
				452	cnv->toULength=byteIndex;
				453
				454	/* write back the updated pointers */
				455	pArgs->source=(const char *)source;
				456	pArgs->target=target;
				457	pArgs->offsets=offsets;
				458	return;
				459	}
				460
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	461	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	462	_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
				463	UErrorCode *pErrorCode) {
				464	UConverter *cnv;
				465	const UChar source, sourceLimit;
				466	uint8_t target, targetLimit;
				467	int32_t *offsets;
				468
				469	int32_t length, targetCapacity, sourceIndex;
				470	UChar c;
				471
				472	/* UTF-7 state */
				473	const UBool *encodeDirectly;
				474	uint8_t bits;
				475	int8_t base64Counter;
				476	UBool inDirectMode;
				477
				478	/* set up the local pointers */
				479	cnv=pArgs->converter;
				480
				481	/* set up the local pointers */
				482	source=pArgs->source;
				483	sourceLimit=pArgs->sourceLimit;
				484	target=(uint8_t *)pArgs->target;
				485	targetLimit=(uint8_t *)pArgs->targetLimit;
				486	offsets=pArgs->offsets;
				487
				488	/* get the state machine state */
				489	{
				490	uint32_t status=cnv->fromUnicodeStatus;
				491	encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
				492	inDirectMode=(UBool)((status>>24)&1);
				493	base64Counter=(int8_t)(status>>16);
				494	bits=(uint8_t)status;
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	495	U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	496	}
				497
				498	/* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
				499	sourceIndex=0;
				500
				501	if(inDirectMode) {
				502	directMode:
				503	length=(int32_t)(sourceLimit-source);
				504	targetCapacity=(int32_t)(targetLimit-target);
				505	if(length>targetCapacity) {
				506	length=targetCapacity;
				507	}
				508	while(length>0) {
				509	c=*source++;
				510	/* currently always encode CR LF SP TAB directly */
				511	if(c<=127 && encodeDirectly[c]) {
				512	/* encode directly */
				513	*target++=(uint8_t)c;
				514	if(offsets!=NULL) {
				515	*offsets++=sourceIndex++;
				516	}
				517	} else if(c==PLUS) {
				518	/* output +- for + */
				519	*target++=PLUS;
				520	if(target<targetLimit) {
				521	*target++=MINUS;
				522	if(offsets!=NULL) {
				523	*offsets++=sourceIndex;
				524	*offsets++=sourceIndex++;
				525	}
				526	/* realign length and targetCapacity */
				527	goto directMode;
				528	} else {
				529	if(offsets!=NULL) {
				530	*offsets++=sourceIndex++;
				531	}
				532	cnv->charErrorBuffer[0]=MINUS;
				533	cnv->charErrorBufferLength=1;
				534	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				535	break;
				536	}
				537	} else {
				538	/* un-read this character and switch to Unicode Mode */
				539	--source;
				540	*target++=PLUS;
				541	if(offsets!=NULL) {
				542	*offsets++=sourceIndex;
				543	}
				544	inDirectMode=FALSE;
				545	base64Counter=0;
				546	goto unicodeMode;
				547	}
				548	--length;
				549	}
				550	if(source<sourceLimit && target>=targetLimit) {
				551	/* target is full */
				552	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				553	}
				554	} else {
				555	unicodeMode:
				556	while(source<sourceLimit) {
				557	if(target<targetLimit) {
				558	c=*source++;
				559	if(c<=127 && encodeDirectly[c]) {
				560	/* encode directly */
				561	inDirectMode=TRUE;
				562
				563	/* trick: back out this character to make this easier */
				564	--source;
				565
				566	/* terminate the base64 sequence */
				567	if(base64Counter!=0) {
				568	/* write remaining bits for the previous character */
				569	*target++=toBase64[bits];
				570	if(offsets!=NULL) {
				571	*offsets++=sourceIndex-1;
				572	}
				573	}
				574	if(fromBase64[c]!=-1) {
				575	/* need to terminate with a minus */
				576	if(target<targetLimit) {
				577	*target++=MINUS;
				578	if(offsets!=NULL) {
				579	*offsets++=sourceIndex-1;
				580	}
				581	} else {
				582	cnv->charErrorBuffer[0]=MINUS;
				583	cnv->charErrorBufferLength=1;
				584	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				585	break;
				586	}
				587	}
				588	goto directMode;
				589	} else {
				590	/*
				591	* base64 this character:
				592	* Output 2 or 3 base64 bytes for the remaining bits of the previous character
				593	* and the bits of this character, each implicitly in UTF-16BE.
				594	*
				595	* Here, bits is an 8-bit variable because only 6 bits need to be kept from one
				596	* character to the next. The actual 2 or 4 bits are shifted to the left edge
				597	* of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
				598	*/
				599	switch(base64Counter) {
				600	case 0:
				601	*target++=toBase64[c>>10];
				602	if(target<targetLimit) {
				603	*target++=toBase64[(c>>4)&0x3f];
				604	if(offsets!=NULL) {
				605	*offsets++=sourceIndex;
				606	*offsets++=sourceIndex++;
				607	}
				608	} else {
				609	if(offsets!=NULL) {
				610	*offsets++=sourceIndex++;
				611	}
				612	cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
				613	cnv->charErrorBufferLength=1;
				614	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				615	}
				616	bits=(uint8_t)((c&15)<<2);
				617	base64Counter=1;
				618	break;
				619	case 1:
				620	*target++=toBase64[bits\|(c>>14)];
				621	if(target<targetLimit) {
				622	*target++=toBase64[(c>>8)&0x3f];
				623	if(target<targetLimit) {
				624	*target++=toBase64[(c>>2)&0x3f];
				625	if(offsets!=NULL) {
				626	*offsets++=sourceIndex;
				627	*offsets++=sourceIndex;
				628	*offsets++=sourceIndex++;
				629	}
				630	} else {
				631	if(offsets!=NULL) {
				632	*offsets++=sourceIndex;
				633	*offsets++=sourceIndex++;
				634	}
				635	cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
				636	cnv->charErrorBufferLength=1;
				637	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				638	}
				639	} else {
				640	if(offsets!=NULL) {
				641	*offsets++=sourceIndex++;
				642	}
				643	cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
				644	cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
				645	cnv->charErrorBufferLength=2;
				646	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				647	}
				648	bits=(uint8_t)((c&3)<<4);
				649	base64Counter=2;
				650	break;
				651	case 2:
				652	*target++=toBase64[bits\|(c>>12)];
				653	if(target<targetLimit) {
				654	*target++=toBase64[(c>>6)&0x3f];
				655	if(target<targetLimit) {
				656	*target++=toBase64[c&0x3f];
				657	if(offsets!=NULL) {
				658	*offsets++=sourceIndex;
				659	*offsets++=sourceIndex;
				660	*offsets++=sourceIndex++;
				661	}
				662	} else {
				663	if(offsets!=NULL) {
				664	*offsets++=sourceIndex;
				665	*offsets++=sourceIndex++;
				666	}
				667	cnv->charErrorBuffer[0]=toBase64[c&0x3f];
				668	cnv->charErrorBufferLength=1;
				669	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				670	}
				671	} else {
				672	if(offsets!=NULL) {
				673	*offsets++=sourceIndex++;
				674	}
				675	cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
				676	cnv->charErrorBuffer[1]=toBase64[c&0x3f];
				677	cnv->charErrorBufferLength=2;
				678	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				679	}
				680	bits=0;
				681	base64Counter=0;
				682	break;
				683	default:
				684	/* will never occur */
				685	break;
				686	}
				687	}
				688	} else {
				689	/* target is full */
				690	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				691	break;
				692	}
				693	}
				694	}
				695
				696	if(pArgs->flush && source>=sourceLimit) {
				697	/* flush remaining bits to the target */
				698	if(!inDirectMode) {
				699	if (base64Counter!=0) {
				700	if(target<targetLimit) {
				701	*target++=toBase64[bits];
				702	if(offsets!=NULL) {
				703	*offsets++=sourceIndex-1;
				704	}
				705	} else {
				706	cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
				707	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				708	}
				709	}
				710	/* Add final MINUS to terminate unicodeMode */
				711	if(target<targetLimit) {
				712	*target++=MINUS;
				713	if(offsets!=NULL) {
				714	*offsets++=sourceIndex-1;
				715	}
				716	} else {
				717	cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
				718	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				719	}
				720	}
				721	/* reset the state for the next conversion */
				722	cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)\|0x1000000; /* keep version, inDirectMode=TRUE */
				723	} else {
				724	/* set the converter state back into UConverter */
				725	cnv->fromUnicodeStatus=
				726	(cnv->fromUnicodeStatus&0xf0000000)\| /* keep version*/
				727	((uint32_t)inDirectMode<<24)\|((uint32_t)base64Counter<<16)\|(uint32_t)bits;
				728	}
				729
				730	/* write back the updated pointers */
				731	pArgs->source=source;
				732	pArgs->target=(char *)target;
				733	pArgs->offsets=offsets;
				734	return;
				735	}
				736
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	737	static const char * U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	738	_UTF7GetName(const UConverter *cnv) {
				739	switch(cnv->fromUnicodeStatus>>28) {
				740	case 1:
				741	return "UTF-7,version=1";
				742	default:
				743	return "UTF-7";
				744	}
				745	}
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	746	U_CDECL_END
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	747
				748	static const UConverterImpl _UTF7Impl={
				749	UCNV_UTF7,
				750
				751	NULL,
				752	NULL,
				753
				754	_UTF7Open,
				755	NULL,
				756	_UTF7Reset,
				757
				758	_UTF7ToUnicodeWithOffsets,
				759	_UTF7ToUnicodeWithOffsets,
				760	_UTF7FromUnicodeWithOffsets,
				761	_UTF7FromUnicodeWithOffsets,
				762	NULL,
				763
				764	NULL,
				765	_UTF7GetName,
				766	NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
				767	NULL,
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	768	ucnv_getCompleteUnicodeSet,
				769
				770	NULL,
				771	NULL
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	772	};
				773
				774	static const UConverterStaticData _UTF7StaticData={
				775	sizeof(UConverterStaticData),
				776	"UTF-7",
				777	0, /* TODO CCSID for UTF-7 */
				778	UCNV_IBM, UCNV_UTF7,
				779	1, 4,
				780	{ 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
				781	FALSE, FALSE,
				782	0,
				783	0,
				784	{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
				785	};
				786
Jungshik Shin	a05f412	2015-06-09 15:33:54 -0700	[diff] [blame]	787	const UConverterSharedData _UTF7Data=
				788	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	789
				790	/* IMAP mailbox name encoding ----------------------------------------------- */
				791
				792	/*
				793	* RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
				794	* http://www.ietf.org/rfc/rfc2060.txt
				795	*
				796	* 5.1.3. Mailbox International Naming Convention
				797	*
				798	* By convention, international mailbox names are specified using a
				799	* modified version of the UTF-7 encoding described in [UTF-7]. The
				800	* purpose of these modifications is to correct the following problems
				801	* with UTF-7:
				802	*
				803	* 1) UTF-7 uses the "+" character for shifting; this conflicts with
				804	* the common use of "+" in mailbox names, in particular USENET
				805	* newsgroup names.
				806	*
				807	* 2) UTF-7's encoding is BASE64 which uses the "/" character; this
				808	* conflicts with the use of "/" as a popular hierarchy delimiter.
				809	*
				810	* 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
				811	* the use of "\" as a popular hierarchy delimiter.
				812	*
				813	* 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
				814	* the use of "~" in some servers as a home directory indicator.
				815	*
				816	* 5) UTF-7 permits multiple alternate forms to represent the same
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame^]	817	* string; in particular, printable US-ASCII characters can be
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	818	* represented in encoded form.
				819	*
				820	* In modified UTF-7, printable US-ASCII characters except for "&"
				821	* represent themselves; that is, characters with octet values 0x20-0x25
				822	* and 0x27-0x7e. The character "&" (0x26) is represented by the two-
				823	* octet sequence "&-".
				824	*
				825	* All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
				826	* Unicode 16-bit octets) are represented in modified BASE64, with a
				827	* further modification from [UTF-7] that "," is used instead of "/".
				828	* Modified BASE64 MUST NOT be used to represent any printing US-ASCII
				829	* character which can represent itself.
				830	*
				831	* "&" is used to shift to modified BASE64 and "-" to shift back to US-
				832	* ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
				833	* is, a name that ends with a Unicode 16-bit octet MUST end with a "-
				834	* ").
				835	*
				836	* For example, here is a mailbox name which mixes English, Japanese,
				837	* and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
				838	*/
				839
/*
 * Tests for US-ASCII characters belonging to character classes
 * defined in UTF-7.
 *
 * Set D (directly encoded characters) consists of the following
 * characters: the upper and lower case letters A through Z
 * and a through z, the 10 digits 0-9, and the following nine special
 * characters (note that "+" and "=" are omitted):
 *     '(),-./:?
 *
 * Set O (optional direct characters) consists of the following
 * characters (note that "\" and "~" are omitted):
 *     !"#$%&*;<=>@[]^_`{|}
 *
 * According to the rules in RFC 2152, the byte values for the following
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 * - all C0 control codes except for CR LF TAB
 * - BACKSLASH
 * - TILDE
 * - DEL
 * - all codes beyond US-ASCII, i.e. all >127
 */
				862
				863	/* uses '&' not '+' to start a base64 sequence */
				864	#define AMPERSAND 0x26
				865	#define COMMA 0x2c
				866	#define SLASH 0x2f
				867
				868	/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
				869	#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)
				870
				871	/* direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26 */
				872	#define inSetDIMAP(c) (isLegalIMAP(c) && c!=AMPERSAND)
				873
				874	#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
				875	#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
				876
				877	/*
				878	* converter status values:
				879	*
				880	* toUnicodeStatus:
				881	* 24 inDirectMode (boolean)
				882	* 23..16 base64Counter (-1..7)
				883	* 15..0 bits (up to 14 bits incoming base64)
				884	*
				885	* fromUnicodeStatus:
				886	* 24 inDirectMode (boolean)
				887	* 23..16 base64Counter (0..2)
				888	* 7..0 bits (6 bits outgoing base64)
				889	*
				890	* ignore bits 31..25
				891	*/
				892
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	893	U_CDECL_BEGIN
				894	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	895	_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
				896	UErrorCode *pErrorCode) {
				897	UConverter *cnv;
				898	const uint8_t source, sourceLimit;
				899	UChar *target;
				900	const UChar *targetLimit;
				901	int32_t *offsets;
				902
				903	uint8_t *bytes;
				904	uint8_t byteIndex;
				905
				906	int32_t length, targetCapacity;
				907
				908	/* UTF-7 state */
				909	uint16_t bits;
				910	int8_t base64Counter;
				911	UBool inDirectMode;
				912
				913	int8_t base64Value;
				914
				915	int32_t sourceIndex, nextSourceIndex;
				916
				917	UChar c;
				918	uint8_t b;
				919
				920	/* set up the local pointers */
				921	cnv=pArgs->converter;
				922
				923	source=(const uint8_t *)pArgs->source;
				924	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
				925	target=pArgs->target;
				926	targetLimit=pArgs->targetLimit;
				927	offsets=pArgs->offsets;
				928	/* get the state machine state */
				929	{
				930	uint32_t status=cnv->toUnicodeStatus;
				931	inDirectMode=(UBool)((status>>24)&1);
				932	base64Counter=(int8_t)(status>>16);
				933	bits=(uint16_t)status;
				934	}
				935	bytes=cnv->toUBytes;
				936	byteIndex=cnv->toULength;
				937
				938	/* sourceIndex=-1 if the current character began in the previous buffer */
				939	sourceIndex=byteIndex==0 ? 0 : -1;
				940	nextSourceIndex=0;
				941
				942	if(inDirectMode) {
				943	directMode:
				944	/*
				945	* In Direct Mode, US-ASCII characters are encoded directly, i.e.,
				946	* with their US-ASCII byte values.
				947	* An ampersand starts Unicode (or "escape") Mode.
				948	*
				949	* In Direct Mode, only the sourceIndex is used.
				950	*/
				951	byteIndex=0;
				952	length=(int32_t)(sourceLimit-source);
				953	targetCapacity=(int32_t)(targetLimit-target);
				954	if(length>targetCapacity) {
				955	length=targetCapacity;
				956	}
				957	while(length>0) {
				958	b=*source++;
				959	if(!isLegalIMAP(b)) {
				960	/* illegal */
				961	bytes[0]=b;
				962	byteIndex=1;
				963	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				964	break;
				965	} else if(b!=AMPERSAND) {
				966	/* write directly encoded character */
				967	*target++=b;
				968	if(offsets!=NULL) {
				969	*offsets++=sourceIndex++;
				970	}
				971	} else /* AMPERSAND */ {
				972	/* switch to Unicode mode */
				973	nextSourceIndex=++sourceIndex;
				974	inDirectMode=FALSE;
				975	byteIndex=0;
				976	bits=0;
				977	base64Counter=-1;
				978	goto unicodeMode;
				979	}
				980	--length;
				981	}
				982	if(source<sourceLimit && target>=targetLimit) {
				983	/* target is full */
				984	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				985	}
				986	} else {
				987	unicodeMode:
				988	/*
				989	* In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
				990	* The base64 sequence ends with any character that is not in the base64 alphabet.
				991	* A terminating minus sign is consumed.
				992	* US-ASCII must not be base64-ed.
				993	*
				994	* In Unicode Mode, the sourceIndex has the index to the start of the current
				995	* base64 bytes, while nextSourceIndex is precisely parallel to source,
				996	* keeping the index to the following byte.
				997	* Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
				998	*/
				999	while(source<sourceLimit) {
				1000	if(target<targetLimit) {
				1001	bytes[byteIndex++]=b=*source++;
				1002	++nextSourceIndex;
				1003	if(b>0x7e) {
				1004	/* illegal - test other illegal US-ASCII values by base64Value==-3 */
				1005	inDirectMode=TRUE;
				1006	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1007	break;
				1008	} else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
				1009	/* collect base64 bytes into UChars */
				1010	switch(base64Counter) {
				1011	case -1: /* -1 is immediately after the & */
				1012	case 0:
				1013	bits=base64Value;
				1014	base64Counter=1;
				1015	break;
				1016	case 1:
				1017	case 3:
				1018	case 4:
				1019	case 6:
				1020	bits=(uint16_t)((bits<<6)\|base64Value);
				1021	++base64Counter;
				1022	break;
				1023	case 2:
				1024	c=(UChar)((bits<<4)\|(base64Value>>2));
				1025	if(isLegalIMAP(c)) {
				1026	/* illegal */
				1027	inDirectMode=TRUE;
				1028	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1029	goto endloop;
				1030	}
				1031	*target++=c;
				1032	if(offsets!=NULL) {
				1033	*offsets++=sourceIndex;
				1034	sourceIndex=nextSourceIndex-1;
				1035	}
				1036	bytes[0]=b; /* keep this byte in case an error occurs */
				1037	byteIndex=1;
				1038	bits=(uint16_t)(base64Value&3);
				1039	base64Counter=3;
				1040	break;
				1041	case 5:
				1042	c=(UChar)((bits<<2)\|(base64Value>>4));
				1043	if(isLegalIMAP(c)) {
				1044	/* illegal */
				1045	inDirectMode=TRUE;
				1046	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1047	goto endloop;
				1048	}
				1049	*target++=c;
				1050	if(offsets!=NULL) {
				1051	*offsets++=sourceIndex;
				1052	sourceIndex=nextSourceIndex-1;
				1053	}
				1054	bytes[0]=b; /* keep this byte in case an error occurs */
				1055	byteIndex=1;
				1056	bits=(uint16_t)(base64Value&15);
				1057	base64Counter=6;
				1058	break;
				1059	case 7:
				1060	c=(UChar)((bits<<6)\|base64Value);
				1061	if(isLegalIMAP(c)) {
				1062	/* illegal */
				1063	inDirectMode=TRUE;
				1064	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1065	goto endloop;
				1066	}
				1067	*target++=c;
				1068	if(offsets!=NULL) {
				1069	*offsets++=sourceIndex;
				1070	sourceIndex=nextSourceIndex;
				1071	}
				1072	byteIndex=0;
				1073	bits=0;
				1074	base64Counter=0;
				1075	break;
				1076	default:
				1077	/* will never occur */
				1078	break;
				1079	}
				1080	} else if(base64Value==-2) {
				1081	/* minus sign terminates the base64 sequence */
				1082	inDirectMode=TRUE;
				1083	if(base64Counter==-1) {
				1084	/* &- i.e. a minus immediately following an ampersand */
				1085	*target++=AMPERSAND;
				1086	if(offsets!=NULL) {
				1087	*offsets++=sourceIndex-1;
				1088	}
				1089	} else {
				1090	/* absorb the minus and leave the Unicode Mode */
				1091	if(bits!=0 \|\| (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
				1092	/* bits are illegally left over, a UChar is incomplete */
				1093	/* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
				1094	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1095	break;
				1096	}
				1097	}
				1098	sourceIndex=nextSourceIndex;
				1099	goto directMode;
				1100	} else {
				1101	if(base64Counter==-1) {
				1102	/* illegal: & immediately followed by something other than base64 or minus sign */
				1103	/* include the ampersand in the reported sequence */
				1104	--sourceIndex;
				1105	bytes[0]=AMPERSAND;
				1106	bytes[1]=b;
				1107	byteIndex=2;
				1108	}
				1109	/* base64Value==-1 for characters that are illegal only in Unicode mode */
				1110	/* base64Value==-3 for illegal characters */
				1111	/* illegal */
				1112	inDirectMode=TRUE;
				1113	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1114	break;
				1115	}
				1116	} else {
				1117	/* target is full */
				1118	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1119	break;
				1120	}
				1121	}
				1122	}
				1123	endloop:
				1124
				1125	/*
				1126	* the end of the input stream and detection of truncated input
				1127	* are handled by the framework, but here we must check if we are in Unicode
				1128	* mode and byteIndex==0 because we must end in direct mode
				1129	*
				1130	* conditions:
				1131	* successful
				1132	* in Unicode mode and byteIndex==0
				1133	* end of input and no truncated input
				1134	*/
				1135	if( U_SUCCESS(*pErrorCode) &&
				1136	!inDirectMode && byteIndex==0 &&
				1137	pArgs->flush && source>=sourceLimit
				1138	) {
				1139	if(base64Counter==-1) {
				1140	/* & at the very end of the input */
				1141	/* make the ampersand the reported sequence */
				1142	bytes[0]=AMPERSAND;
				1143	byteIndex=1;
				1144	}
				1145	/* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
				1146
				1147	inDirectMode=TRUE; /* avoid looping */
				1148	*pErrorCode=U_TRUNCATED_CHAR_FOUND;
				1149	}
				1150
				1151	/* set the converter state back into UConverter */
				1152	cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)\|((uint32_t)((uint8_t)base64Counter)<<16)\|(uint32_t)bits;
				1153	cnv->toULength=byteIndex;
				1154
				1155	/* write back the updated pointers */
				1156	pArgs->source=(const char *)source;
				1157	pArgs->target=target;
				1158	pArgs->offsets=offsets;
				1159	return;
				1160	}
				1161
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1162	static void U_CALLCONV
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1163	_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
				1164	UErrorCode *pErrorCode) {
				1165	UConverter *cnv;
				1166	const UChar source, sourceLimit;
				1167	uint8_t target, targetLimit;
				1168	int32_t *offsets;
				1169
				1170	int32_t length, targetCapacity, sourceIndex;
				1171	UChar c;
				1172	uint8_t b;
				1173
				1174	/* UTF-7 state */
				1175	uint8_t bits;
				1176	int8_t base64Counter;
				1177	UBool inDirectMode;
				1178
				1179	/* set up the local pointers */
				1180	cnv=pArgs->converter;
				1181
				1182	/* set up the local pointers */
				1183	source=pArgs->source;
				1184	sourceLimit=pArgs->sourceLimit;
				1185	target=(uint8_t *)pArgs->target;
				1186	targetLimit=(uint8_t *)pArgs->targetLimit;
				1187	offsets=pArgs->offsets;
				1188
				1189	/* get the state machine state */
				1190	{
				1191	uint32_t status=cnv->fromUnicodeStatus;
				1192	inDirectMode=(UBool)((status>>24)&1);
				1193	base64Counter=(int8_t)(status>>16);
				1194	bits=(uint8_t)status;
				1195	}
				1196
				1197	/* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
				1198	sourceIndex=0;
				1199
				1200	if(inDirectMode) {
				1201	directMode:
				1202	length=(int32_t)(sourceLimit-source);
				1203	targetCapacity=(int32_t)(targetLimit-target);
				1204	if(length>targetCapacity) {
				1205	length=targetCapacity;
				1206	}
				1207	while(length>0) {
				1208	c=*source++;
				1209	/* encode 0x20..0x7e except '&' directly */
				1210	if(inSetDIMAP(c)) {
				1211	/* encode directly */
				1212	*target++=(uint8_t)c;
				1213	if(offsets!=NULL) {
				1214	*offsets++=sourceIndex++;
				1215	}
				1216	} else if(c==AMPERSAND) {
				1217	/* output &- for & */
				1218	*target++=AMPERSAND;
				1219	if(target<targetLimit) {
				1220	*target++=MINUS;
				1221	if(offsets!=NULL) {
				1222	*offsets++=sourceIndex;
				1223	*offsets++=sourceIndex++;
				1224	}
				1225	/* realign length and targetCapacity */
				1226	goto directMode;
				1227	} else {
				1228	if(offsets!=NULL) {
				1229	*offsets++=sourceIndex++;
				1230	}
				1231	cnv->charErrorBuffer[0]=MINUS;
				1232	cnv->charErrorBufferLength=1;
				1233	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1234	break;
				1235	}
				1236	} else {
				1237	/* un-read this character and switch to Unicode Mode */
				1238	--source;
				1239	*target++=AMPERSAND;
				1240	if(offsets!=NULL) {
				1241	*offsets++=sourceIndex;
				1242	}
				1243	inDirectMode=FALSE;
				1244	base64Counter=0;
				1245	goto unicodeMode;
				1246	}
				1247	--length;
				1248	}
				1249	if(source<sourceLimit && target>=targetLimit) {
				1250	/* target is full */
				1251	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1252	}
				1253	} else {
				1254	unicodeMode:
				1255	while(source<sourceLimit) {
				1256	if(target<targetLimit) {
				1257	c=*source++;
				1258	if(isLegalIMAP(c)) {
				1259	/* encode directly */
				1260	inDirectMode=TRUE;
				1261
				1262	/* trick: back out this character to make this easier */
				1263	--source;
				1264
				1265	/* terminate the base64 sequence */
				1266	if(base64Counter!=0) {
				1267	/* write remaining bits for the previous character */
				1268	*target++=TO_BASE64_IMAP(bits);
				1269	if(offsets!=NULL) {
				1270	*offsets++=sourceIndex-1;
				1271	}
				1272	}
				1273	/* need to terminate with a minus */
				1274	if(target<targetLimit) {
				1275	*target++=MINUS;
				1276	if(offsets!=NULL) {
				1277	*offsets++=sourceIndex-1;
				1278	}
				1279	} else {
				1280	cnv->charErrorBuffer[0]=MINUS;
				1281	cnv->charErrorBufferLength=1;
				1282	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1283	break;
				1284	}
				1285	goto directMode;
				1286	} else {
				1287	/*
				1288	* base64 this character:
				1289	* Output 2 or 3 base64 bytes for the remaining bits of the previous character
				1290	* and the bits of this character, each implicitly in UTF-16BE.
				1291	*
				1292	* Here, bits is an 8-bit variable because only 6 bits need to be kept from one
				1293	* character to the next. The actual 2 or 4 bits are shifted to the left edge
				1294	* of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
				1295	*/
				1296	switch(base64Counter) {
				1297	case 0:
				1298	b=(uint8_t)(c>>10);
				1299	*target++=TO_BASE64_IMAP(b);
				1300	if(target<targetLimit) {
				1301	b=(uint8_t)((c>>4)&0x3f);
				1302	*target++=TO_BASE64_IMAP(b);
				1303	if(offsets!=NULL) {
				1304	*offsets++=sourceIndex;
				1305	*offsets++=sourceIndex++;
				1306	}
				1307	} else {
				1308	if(offsets!=NULL) {
				1309	*offsets++=sourceIndex++;
				1310	}
				1311	b=(uint8_t)((c>>4)&0x3f);
				1312	cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
				1313	cnv->charErrorBufferLength=1;
				1314	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1315	}
				1316	bits=(uint8_t)((c&15)<<2);
				1317	base64Counter=1;
				1318	break;
				1319	case 1:
				1320	b=(uint8_t)(bits\|(c>>14));
				1321	*target++=TO_BASE64_IMAP(b);
				1322	if(target<targetLimit) {
				1323	b=(uint8_t)((c>>8)&0x3f);
				1324	*target++=TO_BASE64_IMAP(b);
				1325	if(target<targetLimit) {
				1326	b=(uint8_t)((c>>2)&0x3f);
				1327	*target++=TO_BASE64_IMAP(b);
				1328	if(offsets!=NULL) {
				1329	*offsets++=sourceIndex;
				1330	*offsets++=sourceIndex;
				1331	*offsets++=sourceIndex++;
				1332	}
				1333	} else {
				1334	if(offsets!=NULL) {
				1335	*offsets++=sourceIndex;
				1336	*offsets++=sourceIndex++;
				1337	}
				1338	b=(uint8_t)((c>>2)&0x3f);
				1339	cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
				1340	cnv->charErrorBufferLength=1;
				1341	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1342	}
				1343	} else {
				1344	if(offsets!=NULL) {
				1345	*offsets++=sourceIndex++;
				1346	}
				1347	b=(uint8_t)((c>>8)&0x3f);
				1348	cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
				1349	b=(uint8_t)((c>>2)&0x3f);
				1350	cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
				1351	cnv->charErrorBufferLength=2;
				1352	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1353	}
				1354	bits=(uint8_t)((c&3)<<4);
				1355	base64Counter=2;
				1356	break;
				1357	case 2:
				1358	b=(uint8_t)(bits\|(c>>12));
				1359	*target++=TO_BASE64_IMAP(b);
				1360	if(target<targetLimit) {
				1361	b=(uint8_t)((c>>6)&0x3f);
				1362	*target++=TO_BASE64_IMAP(b);
				1363	if(target<targetLimit) {
				1364	b=(uint8_t)(c&0x3f);
				1365	*target++=TO_BASE64_IMAP(b);
				1366	if(offsets!=NULL) {
				1367	*offsets++=sourceIndex;
				1368	*offsets++=sourceIndex;
				1369	*offsets++=sourceIndex++;
				1370	}
				1371	} else {
				1372	if(offsets!=NULL) {
				1373	*offsets++=sourceIndex;
				1374	*offsets++=sourceIndex++;
				1375	}
				1376	b=(uint8_t)(c&0x3f);
				1377	cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
				1378	cnv->charErrorBufferLength=1;
				1379	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1380	}
				1381	} else {
				1382	if(offsets!=NULL) {
				1383	*offsets++=sourceIndex++;
				1384	}
				1385	b=(uint8_t)((c>>6)&0x3f);
				1386	cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
				1387	b=(uint8_t)(c&0x3f);
				1388	cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
				1389	cnv->charErrorBufferLength=2;
				1390	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1391	}
				1392	bits=0;
				1393	base64Counter=0;
				1394	break;
				1395	default:
				1396	/* will never occur */
				1397	break;
				1398	}
				1399	}
				1400	} else {
				1401	/* target is full */
				1402	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1403	break;
				1404	}
				1405	}
				1406	}
				1407
				1408	if(pArgs->flush && source>=sourceLimit) {
				1409	/* flush remaining bits to the target */
				1410	if(!inDirectMode) {
				1411	if(base64Counter!=0) {
				1412	if(target<targetLimit) {
				1413	*target++=TO_BASE64_IMAP(bits);
				1414	if(offsets!=NULL) {
				1415	*offsets++=sourceIndex-1;
				1416	}
				1417	} else {
				1418	cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
				1419	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1420	}
				1421	}
				1422	/* need to terminate with a minus */
				1423	if(target<targetLimit) {
				1424	*target++=MINUS;
				1425	if(offsets!=NULL) {
				1426	*offsets++=sourceIndex-1;
				1427	}
				1428	} else {
				1429	cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
				1430	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1431	}
				1432	}
				1433	/* reset the state for the next conversion */
				1434	cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)\|0x1000000; /* keep version, inDirectMode=TRUE */
				1435	} else {
				1436	/* set the converter state back into UConverter */
				1437	cnv->fromUnicodeStatus=
				1438	(cnv->fromUnicodeStatus&0xf0000000)\| /* keep version*/
				1439	((uint32_t)inDirectMode<<24)\|((uint32_t)base64Counter<<16)\|(uint32_t)bits;
				1440	}
				1441
				1442	/* write back the updated pointers */
				1443	pArgs->source=source;
				1444	pArgs->target=(char *)target;
				1445	pArgs->offsets=offsets;
				1446	return;
				1447	}
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1448	U_CDECL_END
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1449
				1450	static const UConverterImpl _IMAPImpl={
				1451	UCNV_IMAP_MAILBOX,
				1452
				1453	NULL,
				1454	NULL,
				1455
				1456	_UTF7Open,
				1457	NULL,
				1458	_UTF7Reset,
				1459
				1460	_IMAPToUnicodeWithOffsets,
				1461	_IMAPToUnicodeWithOffsets,
				1462	_IMAPFromUnicodeWithOffsets,
				1463	_IMAPFromUnicodeWithOffsets,
				1464	NULL,
				1465
				1466	NULL,
				1467	NULL,
				1468	NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
				1469	NULL,
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1470	ucnv_getCompleteUnicodeSet,
				1471	NULL,
				1472	NULL
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1473	};
				1474
				1475	static const UConverterStaticData _IMAPStaticData={
				1476	sizeof(UConverterStaticData),
				1477	"IMAP-mailbox-name",
				1478	0, /* TODO CCSID for IMAP-mailbox-name */
				1479	UCNV_IBM, UCNV_IMAP_MAILBOX,
				1480	1, 4,
				1481	{ 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
				1482	FALSE, FALSE,
				1483	0,
				1484	0,
				1485	{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
				1486	};
				1487
Jungshik Shin	a05f412	2015-06-09 15:33:54 -0700	[diff] [blame]	1488	const UConverterSharedData _IMAPData=
				1489	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1490
				1491	#endif