Blame - source/common/ucnv_u7.c - chromium.googlesource.com/chromium/deps/icu

blob: d35bae2160607c1a6847070002f8837100b83437 [file] [log] [blame]

jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1	/*
				2	**********************************************************************
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	3	* Copyright (C) 2002-2015, International Business Machines
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	4	* Corporation and others. All Rights Reserved.
				5	**********************************************************************
				6	* file name: ucnv_u7.c
				7	* encoding: US-ASCII
				8	* tab size: 8 (not used)
				9	* indentation:4
				10	*
				11	* created on: 2002jul01
				12	* created by: Markus W. Scherer
				13	*
				14	* UTF-7 converter implementation. Used to be in ucnv_utf.c.
				15	*/
				16
				17	#include "unicode/utypes.h"
				18
Jungshik Shin	70f8250	2016-01-29 00:32:36 -0800	[diff] [blame^]	19	#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	20
				21	#include "unicode/ucnv.h"
				22	#include "ucnv_bld.h"
				23	#include "ucnv_cnv.h"
				24	#include "uassert.h"
				25
				26	/* UTF-7 -------------------------------------------------------------------- */
				27
				28	/*
				29	* UTF-7 is a stateful encoding of Unicode.
				30	* It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
				31	* It was intended for use in Internet email systems, using in its bytewise
				32	* encoding only a subset of 7-bit US-ASCII.
				33	* UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
				34	* occasionally used.
				35	*
				36	* For converting Unicode to UTF-7, the RFC allows to encode some US-ASCII
				37	* characters directly or in base64. Especially, the characters in set O
				38	* as defined in the RFC (see below) may be encoded directly but are not
				39	* allowed in, e.g., email headers.
				40	* By default, the ICU UTF-7 converter encodes set O directly.
				41	* By choosing the option "version=1", set O will be escaped instead.
				42	* For example:
				43	* utf7Converter=ucnv_open("UTF-7,version=1");
				44	*
				45	* For details about email headers see RFC 2047.
				46	*/
				47
				48	/*
				49	* Tests for US-ASCII characters belonging to character classes
				50	* defined in UTF-7.
				51	*
				52	* Set D (directly encoded characters) consists of the following
				53	* characters: the upper and lower case letters A through Z
				54	* and a through z, the 10 digits 0-9, and the following nine special
				55	* characters (note that "+" and "=" are omitted):
				56	* '(),-./:?
				57	*
				58	* Set O (optional direct characters) consists of the following
				59	* characters (note that "\" and "~" are omitted):
				60	* !"#$%&*;<=>@[]^_`{|}
				61	*
				62	* According to the rules in RFC 2152, the byte values for the following
				63	* US-ASCII characters are not used in UTF-7 and are therefore illegal:
				64	* - all C0 control codes except for CR LF TAB
				65	* - BACKSLASH
				66	* - TILDE
				67	* - DEL
				68	* - all codes beyond US-ASCII, i.e. all >127
				69	*/
				70	#define inSetD(c) \
				71	((uint8_t)((c)-97)<26 \|\| (uint8_t)((c)-65)<26 \|\| /* letters */ \
				72	(uint8_t)((c)-48)<10 \|\| /* digits */ \
				73	(uint8_t)((c)-39)<3 \|\| /* '() */ \
				74	(uint8_t)((c)-44)<4 \|\| /* ,-./ */ \
				75	(c)==58 \|\| (c)==63 /* :? */ \
				76	)
				77
				78	#define inSetO(c) \
				79	((uint8_t)((c)-33)<6 \|\| /* !"#$%& */ \
				80	(uint8_t)((c)-59)<4 \|\| /* ;<=> */ \
				81	(uint8_t)((c)-93)<4 \|\| /* ]^_` */ \
				82	(uint8_t)((c)-123)<3 \|\| /* {\|} */ \
				83	(c)==42 \|\| (c)==64 \|\| (c)==91 /* @[ / \
				84	)
				85
				86	#define isCRLFTAB(c) ((c)==13 \|\| (c)==10 \|\| (c)==9)
				87	#define isCRLFSPTAB(c) ((c)==32 \|\| (c)==13 \|\| (c)==10 \|\| (c)==9)
				88
				89	#define PLUS 43
				90	#define MINUS 45
				91	#define BACKSLASH 92
				92	#define TILDE 126
				93
				94	/* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */
				95	#define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) \|\| isCRLFTAB(c))
				96
				97	/* encode directly sets D and O and CR LF SP TAB */
				98	static const UBool encodeDirectlyMaximum[128]={
				99	/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
				100	0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
				101	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				102
				103	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
				104	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				105
				106	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				107	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
				108
				109	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				110	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
				111	};
				112
				113	/* encode directly set D and CR LF SP TAB but not set O */
				114	static const UBool encodeDirectlyRestricted[128]={
				115	/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
				116	0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
				117	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				118
				119	1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
				120	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
				121
				122	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				123	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
				124
				125	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				126	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
				127	};
				128
				129	static const uint8_t
				130	toBase64[64]={
				131	/* A-Z */
				132	65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
				133	78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
				134	/* a-z */
				135	97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
				136	110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
				137	/* 0-9 */
				138	48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
				139	/* +/ */
				140	43, 47
				141	};
				142
				143	static const int8_t
				144	fromBase64[128]={
				145	/* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
				146	-3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
				147	-3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
				148
				149	/* general punctuation with + and / and a special value (-2) for - */
				150	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
				151	/* digits */
				152	52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
				153
				154	/* A-Z */
				155	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
				156	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
				157
				158	/* a-z */
				159	-1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
				160	41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
				161	};
				162
				163	/*
				164	* converter status values:
				165	*
				166	* toUnicodeStatus:
				167	* 24 inDirectMode (boolean)
				168	* 23..16 base64Counter (-1..7)
				169	* 15..0 bits (up to 14 bits incoming base64)
				170	*
				171	* fromUnicodeStatus:
				172	* 31..28 version (0: set O direct 1: set O escaped)
				173	* 24 inDirectMode (boolean)
				174	* 23..16 base64Counter (0..2)
				175	* 7..0 bits (6 bits outgoing base64)
				176	*
				177	*/
				178
				179	static void
				180	_UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
				181	if(choice<=UCNV_RESET_TO_UNICODE) {
				182	/* reset toUnicode */
				183	cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
				184	cnv->toULength=0;
				185	}
				186	if(choice!=UCNV_RESET_TO_UNICODE) {
				187	/* reset fromUnicode */
				188	cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)\|0x1000000; /* keep version, inDirectMode=TRUE */
				189	}
				190	}
				191
				192	static void
				193	_UTF7Open(UConverter *cnv,
				194	UConverterLoadArgs *pArgs,
				195	UErrorCode *pErrorCode) {
				196	if(UCNV_GET_VERSION(cnv)<=1) {
				197	/* TODO(markus): Should just use cnv->options rather than copying the version number. */
				198	cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
				199	_UTF7Reset(cnv, UCNV_RESET_BOTH);
				200	} else {
				201	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
				202	}
				203	}
				204
				205	static void
				206	_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
				207	UErrorCode *pErrorCode) {
				208	UConverter *cnv;
				209	const uint8_t source, sourceLimit;
				210	UChar *target;
				211	const UChar *targetLimit;
				212	int32_t *offsets;
				213
				214	uint8_t *bytes;
				215	uint8_t byteIndex;
				216
				217	int32_t length, targetCapacity;
				218
				219	/* UTF-7 state */
				220	uint16_t bits;
				221	int8_t base64Counter;
				222	UBool inDirectMode;
				223
				224	int8_t base64Value;
				225
				226	int32_t sourceIndex, nextSourceIndex;
				227
				228	uint8_t b;
				229	/* set up the local pointers */
				230	cnv=pArgs->converter;
				231
				232	source=(const uint8_t *)pArgs->source;
				233	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
				234	target=pArgs->target;
				235	targetLimit=pArgs->targetLimit;
				236	offsets=pArgs->offsets;
				237	/* get the state machine state */
				238	{
				239	uint32_t status=cnv->toUnicodeStatus;
				240	inDirectMode=(UBool)((status>>24)&1);
				241	base64Counter=(int8_t)(status>>16);
				242	bits=(uint16_t)status;
				243	}
				244	bytes=cnv->toUBytes;
				245	byteIndex=cnv->toULength;
				246
				247	/* sourceIndex=-1 if the current character began in the previous buffer */
				248	sourceIndex=byteIndex==0 ? 0 : -1;
				249	nextSourceIndex=0;
				250
				251	if(inDirectMode) {
				252	directMode:
				253	/*
				254	* In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
				255	* with their US-ASCII byte values.
				256	* Backslash and Tilde and most control characters are not allowed in UTF-7.
				257	* A plus sign starts Unicode (or "escape") Mode.
				258	*
				259	* In Direct Mode, only the sourceIndex is used.
				260	*/
				261	byteIndex=0;
				262	length=(int32_t)(sourceLimit-source);
				263	targetCapacity=(int32_t)(targetLimit-target);
				264	if(length>targetCapacity) {
				265	length=targetCapacity;
				266	}
				267	while(length>0) {
				268	b=*source++;
				269	if(!isLegalUTF7(b)) {
				270	/* illegal */
				271	bytes[0]=b;
				272	byteIndex=1;
				273	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				274	break;
				275	} else if(b!=PLUS) {
				276	/* write directly encoded character */
				277	*target++=b;
				278	if(offsets!=NULL) {
				279	*offsets++=sourceIndex++;
				280	}
				281	} else /* PLUS */ {
				282	/* switch to Unicode mode */
				283	nextSourceIndex=++sourceIndex;
				284	inDirectMode=FALSE;
				285	byteIndex=0;
				286	bits=0;
				287	base64Counter=-1;
				288	goto unicodeMode;
				289	}
				290	--length;
				291	}
				292	if(source<sourceLimit && target>=targetLimit) {
				293	/* target is full */
				294	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				295	}
				296	} else {
				297	unicodeMode:
				298	/*
				299	* In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
				300	* The base64 sequence ends with any character that is not in the base64 alphabet.
				301	* A terminating minus sign is consumed.
				302	*
				303	* In Unicode Mode, the sourceIndex has the index to the start of the current
				304	* base64 bytes, while nextSourceIndex is precisely parallel to source,
				305	* keeping the index to the following byte.
				306	* Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
				307	*/
				308	while(source<sourceLimit) {
				309	if(target<targetLimit) {
				310	bytes[byteIndex++]=b=*source++;
				311	++nextSourceIndex;
				312	base64Value = -3; /* initialize as illegal */
				313	if(b>=126 \|\| (base64Value=fromBase64[b])==-3 \|\| base64Value==-1) {
				314	/* either
				315	* base64Value==-1 for any legal character except base64 and minus sign, or
				316	* base64Value==-3 for illegal characters:
				317	* 1. In either case, leave Unicode mode.
				318	* 2.1. If we ended with an incomplete UChar or none after the +, then
				319	* generate an error for the preceding erroneous sequence and deal with
				320	* the current (possibly illegal) character next time through.
				321	* 2.2. Else the current char comes after a complete UChar, which was already
				322	* pushed to the output buf, so:
				323	* 2.2.1. If the current char is legal, just save it for processing next time.
				324	* It may be for example, a plus which we need to deal with in direct mode.
				325	* 2.2.2. Else if the current char is illegal, we might as well deal with it here.
				326	*/
				327	inDirectMode=TRUE;
				328	if(base64Counter==-1) {
				329	/* illegal: + immediately followed by something other than base64 or minus sign */
				330	/* include the plus sign in the reported sequence, but not the subsequent char */
				331	--source;
				332	bytes[0]=PLUS;
				333	byteIndex=1;
				334	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				335	break;
				336	} else if(bits!=0) {
				337	/* bits are illegally left over, a UChar is incomplete */
				338	/* don't include current char (legal or illegal) in error seq */
				339	--source;
				340	--byteIndex;
				341	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				342	break;
				343	} else {
				344	/* previous UChar was complete */
				345	if(base64Value==-3) {
				346	/* current character is illegal, deal with it here */
				347	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				348	break;
				349	} else {
				350	/* un-read the current character in case it is a plus sign */
				351	--source;
				352	sourceIndex=nextSourceIndex-1;
				353	goto directMode;
				354	}
				355	}
				356	} else if(base64Value>=0) {
				357	/* collect base64 bytes into UChars */
				358	switch(base64Counter) {
				359	case -1: /* -1 is immediately after the + */
				360	case 0:
				361	bits=base64Value;
				362	base64Counter=1;
				363	break;
				364	case 1:
				365	case 3:
				366	case 4:
				367	case 6:
				368	bits=(uint16_t)((bits<<6)\|base64Value);
				369	++base64Counter;
				370	break;
				371	case 2:
				372	*target++=(UChar)((bits<<4)\|(base64Value>>2));
				373	if(offsets!=NULL) {
				374	*offsets++=sourceIndex;
				375	sourceIndex=nextSourceIndex-1;
				376	}
				377	bytes[0]=b; /* keep this byte in case an error occurs */
				378	byteIndex=1;
				379	bits=(uint16_t)(base64Value&3);
				380	base64Counter=3;
				381	break;
				382	case 5:
				383	*target++=(UChar)((bits<<2)\|(base64Value>>4));
				384	if(offsets!=NULL) {
				385	*offsets++=sourceIndex;
				386	sourceIndex=nextSourceIndex-1;
				387	}
				388	bytes[0]=b; /* keep this byte in case an error occurs */
				389	byteIndex=1;
				390	bits=(uint16_t)(base64Value&15);
				391	base64Counter=6;
				392	break;
				393	case 7:
				394	*target++=(UChar)((bits<<6)\|base64Value);
				395	if(offsets!=NULL) {
				396	*offsets++=sourceIndex;
				397	sourceIndex=nextSourceIndex;
				398	}
				399	byteIndex=0;
				400	bits=0;
				401	base64Counter=0;
				402	break;
				403	default:
				404	/* will never occur */
				405	break;
				406	}
				407	} else /base64Value==-2/ {
				408	/* minus sign terminates the base64 sequence */
				409	inDirectMode=TRUE;
				410	if(base64Counter==-1) {
				411	/* +- i.e. a minus immediately following a plus */
				412	*target++=PLUS;
				413	if(offsets!=NULL) {
				414	*offsets++=sourceIndex-1;
				415	}
				416	} else {
				417	/* absorb the minus and leave the Unicode Mode */
				418	if(bits!=0) {
				419	/* bits are illegally left over, a UChar is incomplete */
				420	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				421	break;
				422	}
				423	}
				424	sourceIndex=nextSourceIndex;
				425	goto directMode;
				426	}
				427	} else {
				428	/* target is full */
				429	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				430	break;
				431	}
				432	}
				433	}
				434
				435	if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
				436	/*
				437	* if we are in Unicode mode, then the byteIndex might not be 0,
				438	* but that is ok if bits==0
				439	* -> we set byteIndex=0 at the end of the stream to avoid a truncated error
				440	* (not true for IMAP-mailbox-name where we must end in direct mode)
				441	*/
				442	byteIndex=0;
				443	}
				444
				445	/* set the converter state back into UConverter */
				446	cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)\|((uint32_t)((uint8_t)base64Counter)<<16)\|(uint32_t)bits;
				447	cnv->toULength=byteIndex;
				448
				449	/* write back the updated pointers */
				450	pArgs->source=(const char *)source;
				451	pArgs->target=target;
				452	pArgs->offsets=offsets;
				453	return;
				454	}
				455
				456	static void
				457	_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
				458	UErrorCode *pErrorCode) {
				459	UConverter *cnv;
				460	const UChar source, sourceLimit;
				461	uint8_t target, targetLimit;
				462	int32_t *offsets;
				463
				464	int32_t length, targetCapacity, sourceIndex;
				465	UChar c;
				466
				467	/* UTF-7 state */
				468	const UBool *encodeDirectly;
				469	uint8_t bits;
				470	int8_t base64Counter;
				471	UBool inDirectMode;
				472
				473	/* set up the local pointers */
				474	cnv=pArgs->converter;
				475
				476	/* set up the local pointers */
				477	source=pArgs->source;
				478	sourceLimit=pArgs->sourceLimit;
				479	target=(uint8_t *)pArgs->target;
				480	targetLimit=(uint8_t *)pArgs->targetLimit;
				481	offsets=pArgs->offsets;
				482
				483	/* get the state machine state */
				484	{
				485	uint32_t status=cnv->fromUnicodeStatus;
				486	encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
				487	inDirectMode=(UBool)((status>>24)&1);
				488	base64Counter=(int8_t)(status>>16);
				489	bits=(uint8_t)status;
				490	U_ASSERT(bits<=sizeof(toBase64)/sizeof(toBase64[0]));
				491	}
				492
				493	/* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
				494	sourceIndex=0;
				495
				496	if(inDirectMode) {
				497	directMode:
				498	length=(int32_t)(sourceLimit-source);
				499	targetCapacity=(int32_t)(targetLimit-target);
				500	if(length>targetCapacity) {
				501	length=targetCapacity;
				502	}
				503	while(length>0) {
				504	c=*source++;
				505	/* currently always encode CR LF SP TAB directly */
				506	if(c<=127 && encodeDirectly[c]) {
				507	/* encode directly */
				508	*target++=(uint8_t)c;
				509	if(offsets!=NULL) {
				510	*offsets++=sourceIndex++;
				511	}
				512	} else if(c==PLUS) {
				513	/* output +- for + */
				514	*target++=PLUS;
				515	if(target<targetLimit) {
				516	*target++=MINUS;
				517	if(offsets!=NULL) {
				518	*offsets++=sourceIndex;
				519	*offsets++=sourceIndex++;
				520	}
				521	/* realign length and targetCapacity */
				522	goto directMode;
				523	} else {
				524	if(offsets!=NULL) {
				525	*offsets++=sourceIndex++;
				526	}
				527	cnv->charErrorBuffer[0]=MINUS;
				528	cnv->charErrorBufferLength=1;
				529	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				530	break;
				531	}
				532	} else {
				533	/* un-read this character and switch to Unicode Mode */
				534	--source;
				535	*target++=PLUS;
				536	if(offsets!=NULL) {
				537	*offsets++=sourceIndex;
				538	}
				539	inDirectMode=FALSE;
				540	base64Counter=0;
				541	goto unicodeMode;
				542	}
				543	--length;
				544	}
				545	if(source<sourceLimit && target>=targetLimit) {
				546	/* target is full */
				547	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				548	}
				549	} else {
				550	unicodeMode:
				551	while(source<sourceLimit) {
				552	if(target<targetLimit) {
				553	c=*source++;
				554	if(c<=127 && encodeDirectly[c]) {
				555	/* encode directly */
				556	inDirectMode=TRUE;
				557
				558	/* trick: back out this character to make this easier */
				559	--source;
				560
				561	/* terminate the base64 sequence */
				562	if(base64Counter!=0) {
				563	/* write remaining bits for the previous character */
				564	*target++=toBase64[bits];
				565	if(offsets!=NULL) {
				566	*offsets++=sourceIndex-1;
				567	}
				568	}
				569	if(fromBase64[c]!=-1) {
				570	/* need to terminate with a minus */
				571	if(target<targetLimit) {
				572	*target++=MINUS;
				573	if(offsets!=NULL) {
				574	*offsets++=sourceIndex-1;
				575	}
				576	} else {
				577	cnv->charErrorBuffer[0]=MINUS;
				578	cnv->charErrorBufferLength=1;
				579	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				580	break;
				581	}
				582	}
				583	goto directMode;
				584	} else {
				585	/*
				586	* base64 this character:
				587	* Output 2 or 3 base64 bytes for the remaining bits of the previous character
				588	* and the bits of this character, each implicitly in UTF-16BE.
				589	*
				590	* Here, bits is an 8-bit variable because only 6 bits need to be kept from one
				591	* character to the next. The actual 2 or 4 bits are shifted to the left edge
				592	* of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
				593	*/
				594	switch(base64Counter) {
				595	case 0:
				596	*target++=toBase64[c>>10];
				597	if(target<targetLimit) {
				598	*target++=toBase64[(c>>4)&0x3f];
				599	if(offsets!=NULL) {
				600	*offsets++=sourceIndex;
				601	*offsets++=sourceIndex++;
				602	}
				603	} else {
				604	if(offsets!=NULL) {
				605	*offsets++=sourceIndex++;
				606	}
				607	cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
				608	cnv->charErrorBufferLength=1;
				609	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				610	}
				611	bits=(uint8_t)((c&15)<<2);
				612	base64Counter=1;
				613	break;
				614	case 1:
				615	*target++=toBase64[bits\|(c>>14)];
				616	if(target<targetLimit) {
				617	*target++=toBase64[(c>>8)&0x3f];
				618	if(target<targetLimit) {
				619	*target++=toBase64[(c>>2)&0x3f];
				620	if(offsets!=NULL) {
				621	*offsets++=sourceIndex;
				622	*offsets++=sourceIndex;
				623	*offsets++=sourceIndex++;
				624	}
				625	} else {
				626	if(offsets!=NULL) {
				627	*offsets++=sourceIndex;
				628	*offsets++=sourceIndex++;
				629	}
				630	cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
				631	cnv->charErrorBufferLength=1;
				632	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				633	}
				634	} else {
				635	if(offsets!=NULL) {
				636	*offsets++=sourceIndex++;
				637	}
				638	cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
				639	cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
				640	cnv->charErrorBufferLength=2;
				641	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				642	}
				643	bits=(uint8_t)((c&3)<<4);
				644	base64Counter=2;
				645	break;
				646	case 2:
				647	*target++=toBase64[bits\|(c>>12)];
				648	if(target<targetLimit) {
				649	*target++=toBase64[(c>>6)&0x3f];
				650	if(target<targetLimit) {
				651	*target++=toBase64[c&0x3f];
				652	if(offsets!=NULL) {
				653	*offsets++=sourceIndex;
				654	*offsets++=sourceIndex;
				655	*offsets++=sourceIndex++;
				656	}
				657	} else {
				658	if(offsets!=NULL) {
				659	*offsets++=sourceIndex;
				660	*offsets++=sourceIndex++;
				661	}
				662	cnv->charErrorBuffer[0]=toBase64[c&0x3f];
				663	cnv->charErrorBufferLength=1;
				664	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				665	}
				666	} else {
				667	if(offsets!=NULL) {
				668	*offsets++=sourceIndex++;
				669	}
				670	cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
				671	cnv->charErrorBuffer[1]=toBase64[c&0x3f];
				672	cnv->charErrorBufferLength=2;
				673	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				674	}
				675	bits=0;
				676	base64Counter=0;
				677	break;
				678	default:
				679	/* will never occur */
				680	break;
				681	}
				682	}
				683	} else {
				684	/* target is full */
				685	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				686	break;
				687	}
				688	}
				689	}
				690
				691	if(pArgs->flush && source>=sourceLimit) {
				692	/* flush remaining bits to the target */
				693	if(!inDirectMode) {
				694	if (base64Counter!=0) {
				695	if(target<targetLimit) {
				696	*target++=toBase64[bits];
				697	if(offsets!=NULL) {
				698	*offsets++=sourceIndex-1;
				699	}
				700	} else {
				701	cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
				702	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				703	}
				704	}
				705	/* Add final MINUS to terminate unicodeMode */
				706	if(target<targetLimit) {
				707	*target++=MINUS;
				708	if(offsets!=NULL) {
				709	*offsets++=sourceIndex-1;
				710	}
				711	} else {
				712	cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
				713	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				714	}
				715	}
				716	/* reset the state for the next conversion */
				717	cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)\|0x1000000; /* keep version, inDirectMode=TRUE */
				718	} else {
				719	/* set the converter state back into UConverter */
				720	cnv->fromUnicodeStatus=
				721	(cnv->fromUnicodeStatus&0xf0000000)\| /* keep version*/
				722	((uint32_t)inDirectMode<<24)\|((uint32_t)base64Counter<<16)\|(uint32_t)bits;
				723	}
				724
				725	/* write back the updated pointers */
				726	pArgs->source=source;
				727	pArgs->target=(char *)target;
				728	pArgs->offsets=offsets;
				729	return;
				730	}
				731
				732	static const char *
				733	_UTF7GetName(const UConverter *cnv) {
				734	switch(cnv->fromUnicodeStatus>>28) {
				735	case 1:
				736	return "UTF-7,version=1";
				737	default:
				738	return "UTF-7";
				739	}
				740	}
				741
				742	static const UConverterImpl _UTF7Impl={
				743	UCNV_UTF7,
				744
				745	NULL,
				746	NULL,
				747
				748	_UTF7Open,
				749	NULL,
				750	_UTF7Reset,
				751
				752	_UTF7ToUnicodeWithOffsets,
				753	_UTF7ToUnicodeWithOffsets,
				754	_UTF7FromUnicodeWithOffsets,
				755	_UTF7FromUnicodeWithOffsets,
				756	NULL,
				757
				758	NULL,
				759	_UTF7GetName,
				760	NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
				761	NULL,
				762	ucnv_getCompleteUnicodeSet
				763	};
				764
				765	static const UConverterStaticData _UTF7StaticData={
				766	sizeof(UConverterStaticData),
				767	"UTF-7",
				768	0, /* TODO CCSID for UTF-7 */
				769	UCNV_IBM, UCNV_UTF7,
				770	1, 4,
				771	{ 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
				772	FALSE, FALSE,
				773	0,
				774	0,
				775	{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
				776	};
				777
Jungshik Shin	a05f412	2015-06-09 15:33:54 -0700	[diff] [blame]	778	const UConverterSharedData _UTF7Data=
				779	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	780
				781	/* IMAP mailbox name encoding ----------------------------------------------- */
				782
				783	/*
				784	* RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
				785	* http://www.ietf.org/rfc/rfc2060.txt
				786	*
				787	* 5.1.3. Mailbox International Naming Convention
				788	*
				789	* By convention, international mailbox names are specified using a
				790	* modified version of the UTF-7 encoding described in [UTF-7]. The
				791	* purpose of these modifications is to correct the following problems
				792	* with UTF-7:
				793	*
				794	* 1) UTF-7 uses the "+" character for shifting; this conflicts with
				795	* the common use of "+" in mailbox names, in particular USENET
				796	* newsgroup names.
				797	*
				798	* 2) UTF-7's encoding is BASE64 which uses the "/" character; this
				799	* conflicts with the use of "/" as a popular hierarchy delimiter.
				800	*
				801	* 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
				802	* the use of "\" as a popular hierarchy delimiter.
				803	*
				804	* 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
				805	* the use of "~" in some servers as a home directory indicator.
				806	*
				807	* 5) UTF-7 permits multiple alternate forms to represent the same
				808	* string; in particular, printable US-ASCII chararacters can be
				809	* represented in encoded form.
				810	*
				811	* In modified UTF-7, printable US-ASCII characters except for "&"
				812	* represent themselves; that is, characters with octet values 0x20-0x25
				813	* and 0x27-0x7e. The character "&" (0x26) is represented by the two-
				814	* octet sequence "&-".
				815	*
				816	* All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
				817	* Unicode 16-bit octets) are represented in modified BASE64, with a
				818	* further modification from [UTF-7] that "," is used instead of "/".
				819	* Modified BASE64 MUST NOT be used to represent any printing US-ASCII
				820	* character which can represent itself.
				821	*
				822	* "&" is used to shift to modified BASE64 and "-" to shift back to US-
				823	* ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
				824	* is, a name that ends with a Unicode 16-bit octet MUST end with a "-
				825	* ").
				826	*
				827	* For example, here is a mailbox name which mixes English, Japanese,
				828	* and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
				829	*/
				830
				831	/*
				832	* Tests for US-ASCII characters belonging to character classes
				833	* defined in UTF-7.
				834	*
				835	* Set D (directly encoded characters) consists of the following
				836	* characters: the upper and lower case letters A through Z
				837	* and a through z, the 10 digits 0-9, and the following nine special
				838	* characters (note that "+" and "=" are omitted):
				839	* '(),-./:?
				840	*
				841	* Set O (optional direct characters) consists of the following
				842	* characters (note that "\" and "~" are omitted):
				843	* !"#$%&*;<=>@[]^_`{|}
				844	*
				845	* According to the rules in RFC 2152, the byte values for the following
				846	* US-ASCII characters are not used in UTF-7 and are therefore illegal:
				847	* - all C0 control codes except for CR LF TAB
				848	* - BACKSLASH
				849	* - TILDE
				850	* - DEL
				851	* - all codes beyond US-ASCII, i.e. all >127
				852	*/
				853
				854	/* uses '&' not '+' to start a base64 sequence */
				855	#define AMPERSAND 0x26
				856	#define COMMA 0x2c
				857	#define SLASH 0x2f
				858
				859	/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
				860	#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)
				861
				862	/* direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26 */
				863	#define inSetDIMAP(c) (isLegalIMAP(c) && c!=AMPERSAND)
				864
				865	#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
				866	#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
				867
				868	/*
				869	* converter status values:
				870	*
				871	* toUnicodeStatus:
				872	* 24 inDirectMode (boolean)
				873	* 23..16 base64Counter (-1..7)
				874	* 15..0 bits (up to 14 bits incoming base64)
				875	*
				876	* fromUnicodeStatus:
				877	* 24 inDirectMode (boolean)
				878	* 23..16 base64Counter (0..2)
				879	* 7..0 bits (6 bits outgoing base64)
				880	*
				881	* ignore bits 31..25
				882	*/
				883
				884	static void
				885	_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
				886	UErrorCode *pErrorCode) {
				887	UConverter *cnv;
				888	const uint8_t source, sourceLimit;
				889	UChar *target;
				890	const UChar *targetLimit;
				891	int32_t *offsets;
				892
				893	uint8_t *bytes;
				894	uint8_t byteIndex;
				895
				896	int32_t length, targetCapacity;
				897
				898	/* UTF-7 state */
				899	uint16_t bits;
				900	int8_t base64Counter;
				901	UBool inDirectMode;
				902
				903	int8_t base64Value;
				904
				905	int32_t sourceIndex, nextSourceIndex;
				906
				907	UChar c;
				908	uint8_t b;
				909
				910	/* set up the local pointers */
				911	cnv=pArgs->converter;
				912
				913	source=(const uint8_t *)pArgs->source;
				914	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
				915	target=pArgs->target;
				916	targetLimit=pArgs->targetLimit;
				917	offsets=pArgs->offsets;
				918	/* get the state machine state */
				919	{
				920	uint32_t status=cnv->toUnicodeStatus;
				921	inDirectMode=(UBool)((status>>24)&1);
				922	base64Counter=(int8_t)(status>>16);
				923	bits=(uint16_t)status;
				924	}
				925	bytes=cnv->toUBytes;
				926	byteIndex=cnv->toULength;
				927
				928	/* sourceIndex=-1 if the current character began in the previous buffer */
				929	sourceIndex=byteIndex==0 ? 0 : -1;
				930	nextSourceIndex=0;
				931
				932	if(inDirectMode) {
				933	directMode:
				934	/*
				935	* In Direct Mode, US-ASCII characters are encoded directly, i.e.,
				936	* with their US-ASCII byte values.
				937	* An ampersand starts Unicode (or "escape") Mode.
				938	*
				939	* In Direct Mode, only the sourceIndex is used.
				940	*/
				941	byteIndex=0;
				942	length=(int32_t)(sourceLimit-source);
				943	targetCapacity=(int32_t)(targetLimit-target);
				944	if(length>targetCapacity) {
				945	length=targetCapacity;
				946	}
				947	while(length>0) {
				948	b=*source++;
				949	if(!isLegalIMAP(b)) {
				950	/* illegal */
				951	bytes[0]=b;
				952	byteIndex=1;
				953	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				954	break;
				955	} else if(b!=AMPERSAND) {
				956	/* write directly encoded character */
				957	*target++=b;
				958	if(offsets!=NULL) {
				959	*offsets++=sourceIndex++;
				960	}
				961	} else /* AMPERSAND */ {
				962	/* switch to Unicode mode */
				963	nextSourceIndex=++sourceIndex;
				964	inDirectMode=FALSE;
				965	byteIndex=0;
				966	bits=0;
				967	base64Counter=-1;
				968	goto unicodeMode;
				969	}
				970	--length;
				971	}
				972	if(source<sourceLimit && target>=targetLimit) {
				973	/* target is full */
				974	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				975	}
				976	} else {
				977	unicodeMode:
				978	/*
				979	* In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
				980	* The base64 sequence ends with any character that is not in the base64 alphabet.
				981	* A terminating minus sign is consumed.
				982	* US-ASCII must not be base64-ed.
				983	*
				984	* In Unicode Mode, the sourceIndex has the index to the start of the current
				985	* base64 bytes, while nextSourceIndex is precisely parallel to source,
				986	* keeping the index to the following byte.
				987	* Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
				988	*/
				989	while(source<sourceLimit) {
				990	if(target<targetLimit) {
				991	bytes[byteIndex++]=b=*source++;
				992	++nextSourceIndex;
				993	if(b>0x7e) {
				994	/* illegal - test other illegal US-ASCII values by base64Value==-3 */
				995	inDirectMode=TRUE;
				996	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				997	break;
				998	} else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
				999	/* collect base64 bytes into UChars */
				1000	switch(base64Counter) {
				1001	case -1: /* -1 is immediately after the & */
				1002	case 0:
				1003	bits=base64Value;
				1004	base64Counter=1;
				1005	break;
				1006	case 1:
				1007	case 3:
				1008	case 4:
				1009	case 6:
				1010	bits=(uint16_t)((bits<<6)\|base64Value);
				1011	++base64Counter;
				1012	break;
				1013	case 2:
				1014	c=(UChar)((bits<<4)\|(base64Value>>2));
				1015	if(isLegalIMAP(c)) {
				1016	/* illegal */
				1017	inDirectMode=TRUE;
				1018	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1019	goto endloop;
				1020	}
				1021	*target++=c;
				1022	if(offsets!=NULL) {
				1023	*offsets++=sourceIndex;
				1024	sourceIndex=nextSourceIndex-1;
				1025	}
				1026	bytes[0]=b; /* keep this byte in case an error occurs */
				1027	byteIndex=1;
				1028	bits=(uint16_t)(base64Value&3);
				1029	base64Counter=3;
				1030	break;
				1031	case 5:
				1032	c=(UChar)((bits<<2)\|(base64Value>>4));
				1033	if(isLegalIMAP(c)) {
				1034	/* illegal */
				1035	inDirectMode=TRUE;
				1036	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1037	goto endloop;
				1038	}
				1039	*target++=c;
				1040	if(offsets!=NULL) {
				1041	*offsets++=sourceIndex;
				1042	sourceIndex=nextSourceIndex-1;
				1043	}
				1044	bytes[0]=b; /* keep this byte in case an error occurs */
				1045	byteIndex=1;
				1046	bits=(uint16_t)(base64Value&15);
				1047	base64Counter=6;
				1048	break;
				1049	case 7:
				1050	c=(UChar)((bits<<6)\|base64Value);
				1051	if(isLegalIMAP(c)) {
				1052	/* illegal */
				1053	inDirectMode=TRUE;
				1054	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1055	goto endloop;
				1056	}
				1057	*target++=c;
				1058	if(offsets!=NULL) {
				1059	*offsets++=sourceIndex;
				1060	sourceIndex=nextSourceIndex;
				1061	}
				1062	byteIndex=0;
				1063	bits=0;
				1064	base64Counter=0;
				1065	break;
				1066	default:
				1067	/* will never occur */
				1068	break;
				1069	}
				1070	} else if(base64Value==-2) {
				1071	/* minus sign terminates the base64 sequence */
				1072	inDirectMode=TRUE;
				1073	if(base64Counter==-1) {
				1074	/* &- i.e. a minus immediately following an ampersand */
				1075	*target++=AMPERSAND;
				1076	if(offsets!=NULL) {
				1077	*offsets++=sourceIndex-1;
				1078	}
				1079	} else {
				1080	/* absorb the minus and leave the Unicode Mode */
				1081	if(bits!=0 \|\| (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
				1082	/* bits are illegally left over, a UChar is incomplete */
				1083	/* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
				1084	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1085	break;
				1086	}
				1087	}
				1088	sourceIndex=nextSourceIndex;
				1089	goto directMode;
				1090	} else {
				1091	if(base64Counter==-1) {
				1092	/* illegal: & immediately followed by something other than base64 or minus sign */
				1093	/* include the ampersand in the reported sequence */
				1094	--sourceIndex;
				1095	bytes[0]=AMPERSAND;
				1096	bytes[1]=b;
				1097	byteIndex=2;
				1098	}
				1099	/* base64Value==-1 for characters that are illegal only in Unicode mode */
				1100	/* base64Value==-3 for illegal characters */
				1101	/* illegal */
				1102	inDirectMode=TRUE;
				1103	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
				1104	break;
				1105	}
				1106	} else {
				1107	/* target is full */
				1108	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1109	break;
				1110	}
				1111	}
				1112	}
				1113	endloop:
				1114
				1115	/*
				1116	* the end of the input stream and detection of truncated input
				1117	* are handled by the framework, but here we must check if we are in Unicode
				1118	* mode and byteIndex==0 because we must end in direct mode
				1119	*
				1120	* conditions:
				1121	* successful
				1122	* in Unicode mode and byteIndex==0
				1123	* end of input and no truncated input
				1124	*/
				1125	if( U_SUCCESS(*pErrorCode) &&
				1126	!inDirectMode && byteIndex==0 &&
				1127	pArgs->flush && source>=sourceLimit
				1128	) {
				1129	if(base64Counter==-1) {
				1130	/* & at the very end of the input */
				1131	/* make the ampersand the reported sequence */
				1132	bytes[0]=AMPERSAND;
				1133	byteIndex=1;
				1134	}
				1135	/* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
				1136
				1137	inDirectMode=TRUE; /* avoid looping */
				1138	*pErrorCode=U_TRUNCATED_CHAR_FOUND;
				1139	}
				1140
				1141	/* set the converter state back into UConverter */
				1142	cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)\|((uint32_t)((uint8_t)base64Counter)<<16)\|(uint32_t)bits;
				1143	cnv->toULength=byteIndex;
				1144
				1145	/* write back the updated pointers */
				1146	pArgs->source=(const char *)source;
				1147	pArgs->target=target;
				1148	pArgs->offsets=offsets;
				1149	return;
				1150	}
				1151
				1152	static void
				1153	_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
				1154	UErrorCode *pErrorCode) {
				1155	UConverter *cnv;
				1156	const UChar source, sourceLimit;
				1157	uint8_t target, targetLimit;
				1158	int32_t *offsets;
				1159
				1160	int32_t length, targetCapacity, sourceIndex;
				1161	UChar c;
				1162	uint8_t b;
				1163
				1164	/* UTF-7 state */
				1165	uint8_t bits;
				1166	int8_t base64Counter;
				1167	UBool inDirectMode;
				1168
				1169	/* set up the local pointers */
				1170	cnv=pArgs->converter;
				1171
				1172	/* set up the local pointers */
				1173	source=pArgs->source;
				1174	sourceLimit=pArgs->sourceLimit;
				1175	target=(uint8_t *)pArgs->target;
				1176	targetLimit=(uint8_t *)pArgs->targetLimit;
				1177	offsets=pArgs->offsets;
				1178
				1179	/* get the state machine state */
				1180	{
				1181	uint32_t status=cnv->fromUnicodeStatus;
				1182	inDirectMode=(UBool)((status>>24)&1);
				1183	base64Counter=(int8_t)(status>>16);
				1184	bits=(uint8_t)status;
				1185	}
				1186
				1187	/* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
				1188	sourceIndex=0;
				1189
				1190	if(inDirectMode) {
				1191	directMode:
				1192	length=(int32_t)(sourceLimit-source);
				1193	targetCapacity=(int32_t)(targetLimit-target);
				1194	if(length>targetCapacity) {
				1195	length=targetCapacity;
				1196	}
				1197	while(length>0) {
				1198	c=*source++;
				1199	/* encode 0x20..0x7e except '&' directly */
				1200	if(inSetDIMAP(c)) {
				1201	/* encode directly */
				1202	*target++=(uint8_t)c;
				1203	if(offsets!=NULL) {
				1204	*offsets++=sourceIndex++;
				1205	}
				1206	} else if(c==AMPERSAND) {
				1207	/* output &- for & */
				1208	*target++=AMPERSAND;
				1209	if(target<targetLimit) {
				1210	*target++=MINUS;
				1211	if(offsets!=NULL) {
				1212	*offsets++=sourceIndex;
				1213	*offsets++=sourceIndex++;
				1214	}
				1215	/* realign length and targetCapacity */
				1216	goto directMode;
				1217	} else {
				1218	if(offsets!=NULL) {
				1219	*offsets++=sourceIndex++;
				1220	}
				1221	cnv->charErrorBuffer[0]=MINUS;
				1222	cnv->charErrorBufferLength=1;
				1223	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1224	break;
				1225	}
				1226	} else {
				1227	/* un-read this character and switch to Unicode Mode */
				1228	--source;
				1229	*target++=AMPERSAND;
				1230	if(offsets!=NULL) {
				1231	*offsets++=sourceIndex;
				1232	}
				1233	inDirectMode=FALSE;
				1234	base64Counter=0;
				1235	goto unicodeMode;
				1236	}
				1237	--length;
				1238	}
				1239	if(source<sourceLimit && target>=targetLimit) {
				1240	/* target is full */
				1241	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1242	}
				1243	} else {
				1244	unicodeMode:
				1245	while(source<sourceLimit) {
				1246	if(target<targetLimit) {
				1247	c=*source++;
				1248	if(isLegalIMAP(c)) {
				1249	/* encode directly */
				1250	inDirectMode=TRUE;
				1251
				1252	/* trick: back out this character to make this easier */
				1253	--source;
				1254
				1255	/* terminate the base64 sequence */
				1256	if(base64Counter!=0) {
				1257	/* write remaining bits for the previous character */
				1258	*target++=TO_BASE64_IMAP(bits);
				1259	if(offsets!=NULL) {
				1260	*offsets++=sourceIndex-1;
				1261	}
				1262	}
				1263	/* need to terminate with a minus */
				1264	if(target<targetLimit) {
				1265	*target++=MINUS;
				1266	if(offsets!=NULL) {
				1267	*offsets++=sourceIndex-1;
				1268	}
				1269	} else {
				1270	cnv->charErrorBuffer[0]=MINUS;
				1271	cnv->charErrorBufferLength=1;
				1272	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1273	break;
				1274	}
				1275	goto directMode;
				1276	} else {
				1277	/*
				1278	* base64 this character:
				1279	* Output 2 or 3 base64 bytes for the remaining bits of the previous character
				1280	* and the bits of this character, each implicitly in UTF-16BE.
				1281	*
				1282	* Here, bits is an 8-bit variable because only 6 bits need to be kept from one
				1283	* character to the next. The actual 2 or 4 bits are shifted to the left edge
				1284	* of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
				1285	*/
				1286	switch(base64Counter) {
				1287	case 0:
				1288	b=(uint8_t)(c>>10);
				1289	*target++=TO_BASE64_IMAP(b);
				1290	if(target<targetLimit) {
				1291	b=(uint8_t)((c>>4)&0x3f);
				1292	*target++=TO_BASE64_IMAP(b);
				1293	if(offsets!=NULL) {
				1294	*offsets++=sourceIndex;
				1295	*offsets++=sourceIndex++;
				1296	}
				1297	} else {
				1298	if(offsets!=NULL) {
				1299	*offsets++=sourceIndex++;
				1300	}
				1301	b=(uint8_t)((c>>4)&0x3f);
				1302	cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
				1303	cnv->charErrorBufferLength=1;
				1304	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1305	}
				1306	bits=(uint8_t)((c&15)<<2);
				1307	base64Counter=1;
				1308	break;
				1309	case 1:
				1310	b=(uint8_t)(bits\|(c>>14));
				1311	*target++=TO_BASE64_IMAP(b);
				1312	if(target<targetLimit) {
				1313	b=(uint8_t)((c>>8)&0x3f);
				1314	*target++=TO_BASE64_IMAP(b);
				1315	if(target<targetLimit) {
				1316	b=(uint8_t)((c>>2)&0x3f);
				1317	*target++=TO_BASE64_IMAP(b);
				1318	if(offsets!=NULL) {
				1319	*offsets++=sourceIndex;
				1320	*offsets++=sourceIndex;
				1321	*offsets++=sourceIndex++;
				1322	}
				1323	} else {
				1324	if(offsets!=NULL) {
				1325	*offsets++=sourceIndex;
				1326	*offsets++=sourceIndex++;
				1327	}
				1328	b=(uint8_t)((c>>2)&0x3f);
				1329	cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
				1330	cnv->charErrorBufferLength=1;
				1331	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1332	}
				1333	} else {
				1334	if(offsets!=NULL) {
				1335	*offsets++=sourceIndex++;
				1336	}
				1337	b=(uint8_t)((c>>8)&0x3f);
				1338	cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
				1339	b=(uint8_t)((c>>2)&0x3f);
				1340	cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
				1341	cnv->charErrorBufferLength=2;
				1342	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1343	}
				1344	bits=(uint8_t)((c&3)<<4);
				1345	base64Counter=2;
				1346	break;
				1347	case 2:
				1348	b=(uint8_t)(bits\|(c>>12));
				1349	*target++=TO_BASE64_IMAP(b);
				1350	if(target<targetLimit) {
				1351	b=(uint8_t)((c>>6)&0x3f);
				1352	*target++=TO_BASE64_IMAP(b);
				1353	if(target<targetLimit) {
				1354	b=(uint8_t)(c&0x3f);
				1355	*target++=TO_BASE64_IMAP(b);
				1356	if(offsets!=NULL) {
				1357	*offsets++=sourceIndex;
				1358	*offsets++=sourceIndex;
				1359	*offsets++=sourceIndex++;
				1360	}
				1361	} else {
				1362	if(offsets!=NULL) {
				1363	*offsets++=sourceIndex;
				1364	*offsets++=sourceIndex++;
				1365	}
				1366	b=(uint8_t)(c&0x3f);
				1367	cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
				1368	cnv->charErrorBufferLength=1;
				1369	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1370	}
				1371	} else {
				1372	if(offsets!=NULL) {
				1373	*offsets++=sourceIndex++;
				1374	}
				1375	b=(uint8_t)((c>>6)&0x3f);
				1376	cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
				1377	b=(uint8_t)(c&0x3f);
				1378	cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
				1379	cnv->charErrorBufferLength=2;
				1380	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1381	}
				1382	bits=0;
				1383	base64Counter=0;
				1384	break;
				1385	default:
				1386	/* will never occur */
				1387	break;
				1388	}
				1389	}
				1390	} else {
				1391	/* target is full */
				1392	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1393	break;
				1394	}
				1395	}
				1396	}
				1397
				1398	if(pArgs->flush && source>=sourceLimit) {
				1399	/* flush remaining bits to the target */
				1400	if(!inDirectMode) {
				1401	if(base64Counter!=0) {
				1402	if(target<targetLimit) {
				1403	*target++=TO_BASE64_IMAP(bits);
				1404	if(offsets!=NULL) {
				1405	*offsets++=sourceIndex-1;
				1406	}
				1407	} else {
				1408	cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
				1409	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1410	}
				1411	}
				1412	/* need to terminate with a minus */
				1413	if(target<targetLimit) {
				1414	*target++=MINUS;
				1415	if(offsets!=NULL) {
				1416	*offsets++=sourceIndex-1;
				1417	}
				1418	} else {
				1419	cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
				1420	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
				1421	}
				1422	}
				1423	/* reset the state for the next conversion */
				1424	cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)\|0x1000000; /* keep version, inDirectMode=TRUE */
				1425	} else {
				1426	/* set the converter state back into UConverter */
				1427	cnv->fromUnicodeStatus=
				1428	(cnv->fromUnicodeStatus&0xf0000000)\| /* keep version*/
				1429	((uint32_t)inDirectMode<<24)\|((uint32_t)base64Counter<<16)\|(uint32_t)bits;
				1430	}
				1431
				1432	/* write back the updated pointers */
				1433	pArgs->source=source;
				1434	pArgs->target=(char *)target;
				1435	pArgs->offsets=offsets;
				1436	return;
				1437	}
				1438
				1439	static const UConverterImpl _IMAPImpl={
				1440	UCNV_IMAP_MAILBOX,
				1441
				1442	NULL,
				1443	NULL,
				1444
				1445	_UTF7Open,
				1446	NULL,
				1447	_UTF7Reset,
				1448
				1449	_IMAPToUnicodeWithOffsets,
				1450	_IMAPToUnicodeWithOffsets,
				1451	_IMAPFromUnicodeWithOffsets,
				1452	_IMAPFromUnicodeWithOffsets,
				1453	NULL,
				1454
				1455	NULL,
				1456	NULL,
				1457	NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
				1458	NULL,
				1459	ucnv_getCompleteUnicodeSet
				1460	};
				1461
				1462	static const UConverterStaticData _IMAPStaticData={
				1463	sizeof(UConverterStaticData),
				1464	"IMAP-mailbox-name",
				1465	0, /* TODO CCSID for IMAP-mailbox-name */
				1466	UCNV_IBM, UCNV_IMAP_MAILBOX,
				1467	1, 4,
				1468	{ 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
				1469	FALSE, FALSE,
				1470	0,
				1471	0,
				1472	{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
				1473	};
				1474
Jungshik Shin	a05f412	2015-06-09 15:33:54 -0700	[diff] [blame]	1475	const UConverterSharedData _IMAPData=
				1476	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1477
				1478	#endif