Blame - src/utf.c - chromium.googlesource.com/chromium/deps/sqlite

blob: 72944c8120f8a94b049ad6f353ec201cd8f4a9d2 [file] [log] [blame]

drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	1	/*
				2	** 2004 April 13
				3	**
				4	** The author disclaims copyright to this source code. In place of
				5	** a legal notice, here is a blessing:
				6	**
				7	** May you do good and not evil.
				8	** May you find forgiveness for yourself and forgive others.
				9	** May you share freely, never taking more than you give.
				10	**
				11	*************************************************************************
				12	** This file contains routines used to translate between UTF-8,
				13	** UTF-16, UTF-16BE, and UTF-16LE.
				14	**
danielk1977	e7d00f5	2004-05-29 02:44:02 +0000	[diff] [blame^]	15	** $Id: utf.c,v 1.14 2004/05/29 02:44:02 danielk1977 Exp $
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	16	**
				17	** Notes on UTF-8:
				18	**
				19	**     Byte-0    Byte-1    Byte-2    Byte-3     Value
				20	**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
				21	**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
				22	**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
				23	**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
				24	**
				25	**
				26	** Notes on UTF-16: (with wwww+1==uuuuu)
				27	**
drh	51846b5	2004-05-28 16:00:21 +0000	[diff] [blame]	28	**      Word-0            Word-1             Value
				29	**  110110ww wwzzzzyy  110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
				30	**  zzzzyyyy yyxxxxxx                       00000000 zzzzyyyy yyxxxxxx
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	31	**
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	32	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	33	** BOM or Byte Order Mark:
				34	** 0xff 0xfe little-endian utf-16 follows
				35	** 0xfe 0xff big-endian utf-16 follows
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	36	**
				37	**
				38	** Handling of malformed strings:
				39	**
				40	** SQLite accepts and processes malformed strings without an error wherever
				41	** possible. However this is not possible when converting between UTF-8 and
				42	** UTF-16.
				43	**
				44	** When converting malformed UTF-8 strings to UTF-16, one instance of the
				45	** replacement character U+FFFD is emitted for each byte that cannot be
				46	** interpreted as part of a valid unicode character.
				47	**
				48	** When converting malformed UTF-16 strings to UTF-8, one instance of the
				49	** replacement character U+FFFD is emitted for each pair of bytes that
				50	** cannot be interpreted as part of a valid unicode character.
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	51	*/
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	52	#include <assert.h>
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	53	#include "sqliteInt.h"
				54
				55	typedef struct UtfString UtfString;
				56	struct UtfString {
				57	unsigned char pZ; / Raw string data */
				58	int n; /* Allocated length of pZ in bytes */
				59	int c; /* Number of pZ bytes already read or written */
				60	};
				61
				62	/* TODO: Implement this macro in os.h. It should be 1 on big-endian
				63	** machines, and 0 on little-endian.
				64	*/
				65	#define SQLITE3_NATIVE_BIGENDIAN 0
				66
				67	#if SQLITE3_NATIVE_BIGENDIAN == 1
				68	#define BOM_BIGENDIAN 0x0000FFFE
				69	#define BOM_LITTLEENDIAN 0x0000FEFF
				70	#else
				71	#define BOM_BIGENDIAN 0x0000FEFF
				72	#define BOM_LITTLEENDIAN 0x0000FFFE
				73	#endif
				74
				75	/*
				76	** These two macros are used to interpret the first two bytes of the
				77	** unsigned char array pZ as a 16-bit unsigned int. BE16() for a big-endian
				78	** interpretation, LE16() for little-endian.
				79	*/
				80	#define BE16(pZ) (((u16)((pZ)[0])<<8) + (u16)((pZ)[1]))
				81	#define LE16(pZ) (((u16)((pZ)[1])<<8) + (u16)((pZ)[0]))
				82
				83	/*
				84	** READ_16 interprets the first two bytes of the unsigned char array pZ
				85	** as a 16-bit unsigned int. If big_endian is non-zero the intepretation
				86	** is big-endian, otherwise little-endian.
				87	*/
				88	#define READ_16(pZ,big_endian) (big_endian?BE16(pZ):LE16(pZ))
				89
				90	/*
				91	** Read the BOM from the start of *pStr, if one is present. Return zero
				92	** for little-endian, non-zero for big-endian. If no BOM is present, return
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	93	** the value of the parameter "big_endian".
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	94	**
				95	** Return values:
				96	** 1 -> big-endian string
				97	** 0 -> little-endian string
				98	*/
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	99	static int readUtf16Bom(UtfString *pStr, int big_endian){
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	100	/* The BOM must be the first thing read from the string */
				101	assert( pStr->c==0 );
				102
				103	/* If the string data consists of 1 byte or less, the BOM will make no
				104	** difference anyway. In this case just fall through to the default case
				105	** and return the native byte-order for this machine.
				106	**
				107	** Otherwise, check the first 2 bytes of the string to see if a BOM is
				108	** present.
				109	*/
				110	if( pStr->n>1 ){
				111	u32 bom = BE16(pStr->pZ);
				112	if( bom==BOM_BIGENDIAN ){
				113	pStr->c = 2;
				114	return 1;
				115	}
				116	if( bom==BOM_LITTLEENDIAN ){
				117	pStr->c = 2;
				118	return 0;
				119	}
				120	}
				121
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	122	return big_endian;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	123	}
				124
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	125	/*
				126	** zData is a UTF-16 encoded string, nData bytes in length. This routine
				127	** checks if there is a byte-order mark at the start of zData. If no
				128	** byte order mark is found 0 is returned. Otherwise TEXT_Utf16be or
				129	** TEXT_Utf16le is returned, depending on whether The BOM indicates that
				130	** the text is big-endian or little-endian.
				131	*/
				132	u8 sqlite3UtfReadBom(const void *zData, int nData){
				133	if( nData<0 \|\| nData>1 ){
				134	u8 b1 = (u8 )zData;
				135	u8 b2 = (((u8 )zData) + 1);
				136	if( b1==0xFE && b2==0xFF ){
				137	return TEXT_Utf16be;
				138	}
				139	if( b1==0xFF && b2==0xFE ){
				140	return TEXT_Utf16le;
				141	}
				142	}
				143	return 0;
				144	}
				145
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	146
				147	/*
				148	** Read a single unicode character from the UTF-8 encoded string *pStr. The
				149	** value returned is a unicode scalar value. In the case of malformed
				150	** strings, the unicode replacement character U+FFFD may be returned.
				151	*/
				152	static u32 readUtf8(UtfString *pStr){
				153	struct Utf8TblRow {
				154	u8 b1_mask;
				155	u8 b1_masked_val;
				156	u8 b1_value_mask;
				157	int trailing_bytes;
				158	};
				159	static const struct Utf8TblRow utf8tbl[] = {
				160	{ 0x80, 0x00, 0x7F, 0 },
				161	{ 0xE0, 0xC0, 0x1F, 1 },
				162	{ 0xF0, 0xE0, 0x0F, 2 },
				163	{ 0xF8, 0xF0, 0x0E, 3 },
				164	{ 0, 0, 0, 0}
				165	};
				166
				167	u8 b1; /* First byte of the potentially multi-byte utf-8 character */
				168	u32 ret = 0; /* Return value */
				169	int ii;
				170	struct Utf8TblRow const *pRow;
				171
				172	pRow = &(utf8tbl[0]);
				173
				174	b1 = pStr->pZ[pStr->c];
				175	pStr->c++;
				176	while( pRow->b1_mask && (b1&pRow->b1_mask)!=pRow->b1_masked_val ){
				177	pRow++;
				178	}
				179	if( !pRow->b1_mask ){
				180	return 0xFFFD;
				181	}
				182
				183	ret = (u32)(b1&pRow->b1_value_mask);
				184	for( ii=0; ii<pRow->trailing_bytes; ii++ ){
				185	u8 b = pStr->pZ[pStr->c+ii];
				186	if( (b&0xC0)!=0x80 ){
				187	return 0xFFFD;
				188	}
				189	ret = (ret<<6) + (u32)(b&0x3F);
				190	}
				191
				192	pStr->c += pRow->trailing_bytes;
				193	return ret;
				194	}
				195
				196	/*
				197	** Write the unicode character 'code' to the string pStr using UTF-8
				198	** encoding. SQLITE_NOMEM may be returned if sqlite3Malloc() fails.
				199	*/
				200	static int writeUtf8(UtfString *pStr, u32 code){
				201	struct Utf8WriteTblRow {
				202	u32 max_code;
				203	int trailing_bytes;
				204	u8 b1_and_mask;
				205	u8 b1_or_mask;
				206	};
				207	static const struct Utf8WriteTblRow utf8tbl[] = {
				208	{0x0000007F, 0, 0x7F, 0x00},
				209	{0x000007FF, 1, 0xDF, 0xC0},
				210	{0x0000FFFF, 2, 0xEF, 0xE0},
				211	{0x0010FFFF, 3, 0xF7, 0xF0},
				212	{0x00000000, 0, 0x00, 0x00}
				213	};
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	214	const struct Utf8WriteTblRow *pRow = &utf8tbl[0];
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	215
danielk1977	295ba55	2004-05-19 10:34:51 +0000	[diff] [blame]	216	while( code>pRow->max_code ){
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	217	assert( pRow->max_code );
				218	pRow++;
				219	}
				220
				221	/* Ensure there is enough room left in the output buffer to write
				222	** this UTF-8 character.
				223	*/
				224	assert( (pStr->n-pStr->c)>=(pRow->trailing_bytes+1) );
				225
				226	/* Write the UTF-8 encoded character to pStr. All cases below are
				227	** intentionally fall-through.
				228	*/
				229	switch( pRow->trailing_bytes ){
				230	case 3:
				231	pStr->pZ[pStr->c+3] = (((u8)code)&0x3F)\|0x80;
				232	code = code>>6;
				233	case 2:
				234	pStr->pZ[pStr->c+2] = (((u8)code)&0x3F)\|0x80;
				235	code = code>>6;
				236	case 1:
				237	pStr->pZ[pStr->c+1] = (((u8)code)&0x3F)\|0x80;
				238	code = code>>6;
				239	case 0:
				240	pStr->pZ[pStr->c] = (((u8)code)&(pRow->b1_and_mask))\|(pRow->b1_or_mask);
				241	}
				242	pStr->c += (pRow->trailing_bytes + 1);
				243
				244	return 0;
				245	}
				246
				247	/*
				248	** Read a single unicode character from the UTF-16 encoded string *pStr. The
				249	** value returned is a unicode scalar value. In the case of malformed
				250	** strings, the unicode replacement character U+FFFD may be returned.
				251	**
				252	** If big_endian is true, the string is assumed to be UTF-16BE encoded.
				253	** Otherwise, it is UTF-16LE encoded.
				254	*/
				255	static u32 readUtf16(UtfString *pStr, int big_endian){
				256	u32 code_point; /* the first code-point in the character */
				257
				258	/* If there is only one byte of data left in the string, return the
				259	** replacement character.
				260	*/
				261	if( (pStr->n-pStr->c)==1 ){
				262	pStr->c++;
				263	return (int)0xFFFD;
				264	}
				265
				266	code_point = READ_16(&(pStr->pZ[pStr->c]), big_endian);
				267	pStr->c += 2;
				268
				269	/* If this is a non-surrogate code-point, just cast it to an int and
				270	** return the code-point value.
				271	*/
				272	if( code_point<0xD800 \|\| code_point>0xE000 ){
				273	return code_point;
				274	}
				275
				276	/* If this is a trailing surrogate code-point, then the string is
				277	** malformed; return the replacement character.
				278	*/
				279	if( code_point>0xDBFF ){
				280	return 0xFFFD;
				281	}
				282
				283	/* The code-point just read is a leading surrogate code-point. If their
				284	** is not enough data left or the next code-point is not a trailing
				285	** surrogate, return the replacement character.
				286	*/
				287	if( (pStr->n-pStr->c)>1 ){
				288	u32 code_point2 = READ_16(&pStr->pZ[pStr->c], big_endian);
				289	if( code_point2<0xDC00 \|\| code_point>0xDFFF ){
				290	return 0xFFFD;
				291	}
				292	pStr->c += 2;
				293
				294	return (
				295	(((code_point&0x03C0)+0x0040)<<16) + /* uuuuu */
				296	((code_point&0x003F)<<10) + /* xxxxxx */
				297	(code_point2&0x03FF) /* yy yyyyyyyy */
				298	);
				299
				300	}else{
				301	return (int)0xFFFD;
				302	}
				303
				304	/* not reached */
				305	}
				306
				307	static int writeUtf16(UtfString *pStr, int code, int big_endian){
				308	int bytes;
				309	unsigned char *hi_byte;
				310	unsigned char *lo_byte;
				311
				312	bytes = (code>0x0000FFFF?4:2);
				313
				314	/* Ensure there is enough room left in the output buffer to write
				315	** this UTF-8 character.
				316	*/
				317	assert( (pStr->n-pStr->c)>=bytes );
				318
				319	/* Initialise hi_byte and lo_byte to point at the locations into which
				320	** the MSB and LSB of the (first) 16-bit unicode code-point written for
				321	** this character.
				322	*/
				323	hi_byte = (big_endian?&pStr->pZ[pStr->c]:&pStr->pZ[pStr->c+1]);
				324	lo_byte = (big_endian?&pStr->pZ[pStr->c+1]:&pStr->pZ[pStr->c]);
				325
				326	if( bytes==2 ){
				327	*hi_byte = (u8)((code&0x0000FF00)>>8);
				328	*lo_byte = (u8)(code&0x000000FF);
				329	}else{
				330	u32 wrd;
				331	wrd = ((((code&0x001F0000)-0x00010000)+(code&0x0000FC00))>>10)\|0x0000D800;
				332	*hi_byte = (u8)((wrd&0x0000FF00)>>8);
				333	*lo_byte = (u8)(wrd&0x000000FF);
				334
				335	wrd = (code&0x000003FF)\|0x0000DC00;
				336	*(hi_byte+2) = (u8)((wrd&0x0000FF00)>>8);
				337	*(lo_byte+2) = (u8)(wrd&0x000000FF);
				338	}
				339
				340	pStr->c += bytes;
				341
				342	return 0;
				343	}
				344
				345	/*
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	346	** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
				347	** return the number of unicode characters in pZ up to (but not including)
				348	** the first 0x00 byte. If nByte is not less than zero, return the
				349	** number of unicode characters in the first nByte of pZ (or up to
				350	** the first 0x00, whichever comes first).
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	351	*/
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	352	int sqlite3utf8CharLen(const char *pZ, int nByte){
				353	UtfString str;
				354	int ret = 0;
				355	u32 code = 1;
				356
				357	str.pZ = (char *)pZ;
				358	str.n = nByte;
				359	str.c = 0;
				360
				361	while( (nByte<0 \|\| str.c<str.n) && code!=0 ){
				362	code = readUtf8(&str);
				363	ret++;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	364	}
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	365	if( code==0 ) ret--;
				366
				367	return ret;
				368	}
				369
				370	/*
				371	** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
				372	** return the number of bytes up to (but not including), the first pair
				373	** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
				374	** then return the number of bytes in the first nChar unicode characters
				375	** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
				376	*/
				377	int sqlite3utf16ByteLen(const void *pZ, int nChar){
				378	if( nChar<0 ){
danielk1977	e7d00f5	2004-05-29 02:44:02 +0000	[diff] [blame^]	379	const unsigned char pC1 = (unsigned char )pZ;
				380	const unsigned char pC2 = (unsigned char )pZ+1;
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	381	while( pC1 \|\| pC2 ){
				382	pC1 += 2;
				383	pC2 += 2;
				384	}
				385	return pC1-(unsigned char *)pZ;
				386	}else{
				387	UtfString str;
				388	u32 code = 1;
				389	int big_endian;
				390	int nRead = 0;
				391	int ret;
				392
				393	str.pZ = (char *)pZ;
				394	str.c = 0;
				395	str.n = -1;
				396
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	397	/* Check for a BOM. We just ignore it if there is one, it's only read
				398	** so that it is not counted as a character.
				399	*/
				400	big_endian = readUtf16Bom(&str, 0);
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	401	ret = 0-str.c;
				402
				403	while( code!=0 && nRead<nChar ){
				404	code = readUtf16(&str, big_endian);
				405	nRead++;
				406	}
				407	if( code==0 ){
				408	ret -= 2;
				409	}
				410	return str.c + ret;
				411	}
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	412	}
				413
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	414	/*
				415	** Convert a string in UTF-16 native byte (or with a Byte-order-mark or
				416	** "BOM") into a UTF-8 string. The UTF-8 string is written into space
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	417	** obtained from sqlite3Malloc() and must be released by the calling function.
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	418	**
				419	** The parameter N is the number of bytes in the UTF-16 string. If N is
				420	** negative, the entire string up to the first \u0000 character is translated.
				421	**
				422	** The returned UTF-8 string is always \000 terminated.
				423	*/
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	424	unsigned char sqlite3utf16to8(const void pData, int N, int big_endian){
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	425	UtfString in;
				426	UtfString out;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	427
				428	out.pZ = 0;
				429
				430	in.pZ = (unsigned char *)pData;
				431	in.n = N;
				432	in.c = 0;
				433
				434	if( in.n<0 ){
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	435	in.n = sqlite3utf16ByteLen(in.pZ, -1);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	436	}
				437
				438	/* A UTF-8 encoding of a unicode string can require at most 1.5 times as
				439	** much space to store as the same string encoded using UTF-16. Allocate
				440	** this now.
				441	*/
				442	out.n = (in.n*1.5) + 1;
danielk1977	295ba55	2004-05-19 10:34:51 +0000	[diff] [blame]	443	out.pZ = sqliteMalloc(out.n);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	444	if( !out.pZ ){
				445	return 0;
				446	}
				447	out.c = 0;
				448
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	449	big_endian = readUtf16Bom(&in, big_endian);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	450	while( in.c<in.n ){
				451	writeUtf8(&out, readUtf16(&in, big_endian));
				452	}
				453
				454	/* Add the NULL-terminator character */
				455	assert( out.c<out.n );
				456	out.pZ[out.c] = 0x00;
				457
				458	return out.pZ;
				459	}
				460
				461	static void utf8toUtf16(const unsigned char pIn, int N, int big_endian){
				462	UtfString in;
				463	UtfString out;
				464
				465	in.pZ = (unsigned char *)pIn;
				466	in.n = N;
				467	in.c = 0;
				468
				469	if( in.n<0 ){
				470	in.n = strlen(in.pZ);
				471	}
				472
				473	/* A UTF-16 encoding of a unicode string can require at most twice as
				474	** much space to store as the same string encoded using UTF-8. Allocate
				475	** this now.
				476	*/
				477	out.n = (in.n*2) + 2;
danielk1977	295ba55	2004-05-19 10:34:51 +0000	[diff] [blame]	478	out.pZ = sqliteMalloc(out.n);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	479	if( !out.pZ ){
				480	return 0;
				481	}
				482	out.c = 0;
				483
				484	while( in.c<in.n ){
				485	writeUtf16(&out, readUtf8(&in), big_endian);
				486	}
				487
				488	/* Add the NULL-terminator character */
				489	assert( (out.c+1)<out.n );
				490	out.pZ[out.c] = 0x00;
				491	out.pZ[out.c+1] = 0x00;
				492
				493	return out.pZ;
				494	}
				495
				496	/*
				497	** Translate UTF-8 to UTF-16BE or UTF-16LE
				498	*/
				499	void sqlite3utf8to16be(const unsigned char pIn, int N){
				500	return utf8toUtf16(pIn, N, 1);
				501	}
				502
				503	void sqlite3utf8to16le(const unsigned char pIn, int N){
				504	return utf8toUtf16(pIn, N, 0);
				505	}
				506
				507	/*
				508	** This routine does the work for sqlite3utf16to16le() and
				509	** sqlite3utf16to16be(). If big_endian is 1 the input string is
				510	** transformed in place to UTF-16BE encoding. If big_endian is 0 then
				511	** the input is transformed to UTF-16LE.
				512	**
				513	** Unless the first two bytes of the input string is a BOM, the input is
				514	** assumed to be UTF-16 encoded using the machines native byte ordering.
				515	*/
				516	static void utf16to16(void *pData, int N, int big_endian){
				517	UtfString inout;
				518	inout.pZ = (unsigned char *)pData;
				519	inout.c = 0;
				520	inout.n = N;
				521
				522	if( inout.n<0 ){
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	523	inout.n = sqlite3utf16ByteLen(inout.pZ, -1);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	524	}
				525
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	526	if( readUtf16Bom(&inout, SQLITE3_BIGENDIAN)!=big_endian ){
danielk1977	295ba55	2004-05-19 10:34:51 +0000	[diff] [blame]	527	/* swab(&inout.pZ[inout.c], inout.pZ, inout.n-inout.c); */
				528	int i;
				529	for(i=0; i<(inout.n-inout.c); i += 2){
				530	char c1 = inout.pZ[i+inout.c];
				531	char c2 = inout.pZ[i+inout.c+1];
				532	inout.pZ[i] = c2;
				533	inout.pZ[i+1] = c1;
				534	}
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	535	}else if( inout.c ){
				536	memmove(inout.pZ, &inout.pZ[inout.c], inout.n-inout.c);
				537	}
danielk1977	295ba55	2004-05-19 10:34:51 +0000	[diff] [blame]	538
				539	inout.pZ[inout.n-inout.c] = 0x00;
				540	inout.pZ[inout.n-inout.c+1] = 0x00;
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	541	}
				542
				543	/*
				544	** Convert a string in UTF-16 native byte or with a BOM into a UTF-16LE
				545	** string. The conversion occurs in-place. The output overwrites the
				546	** input. N bytes are converted. If N is negative everything is converted
				547	** up to the first \u0000 character.
				548	**
				549	** If the native byte order is little-endian and there is no BOM, then
				550	** this routine is a no-op. If there is a BOM at the start of the string,
				551	** it is removed.
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	552	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	553	** Translation from UTF-16LE to UTF-16BE and back again is accomplished
				554	** using the library function swab().
				555	*/
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	556	void sqlite3utf16to16le(void *pData, int N){
				557	utf16to16(pData, N, 0);
				558	}
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	559
				560	/*
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	561	** Convert a string in UTF-16 native byte or with a BOM into a UTF-16BE
				562	** string. The conversion occurs in-place. The output overwrites the
				563	** input. N bytes are converted. If N is negative everything is converted
				564	** up to the first \u0000 character.
				565	**
				566	** If the native byte order is little-endian and there is no BOM, then
				567	** this routine is a no-op. If there is a BOM at the start of the string,
				568	** it is removed.
				569	**
				570	** Translation from UTF-16LE to UTF-16BE and back again is accomplished
				571	** using the library function swab().
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	572	*/
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	573	void sqlite3utf16to16be(void *pData, int N){
				574	utf16to16(pData, N, 1);
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	575	}
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	576
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	577	/*
				578	** This function is used to translate between UTF-8 and UTF-16. The
				579	** result is returned in dynamically allocated memory.
				580	*/
				581	int sqlite3utfTranslate(
drh	eb2e176	2004-05-27 01:53:56 +0000	[diff] [blame]	582	const void zData, int nData, / Input string */
				583	u8 enc1, /* Encoding of zData */
				584	void *zOut, int nOut, /* Output string */
				585	u8 enc2 /* Desired encoding of output */
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	586	){
				587	assert( enc1==TEXT_Utf8 \|\| enc1==TEXT_Utf16le \|\| enc1==TEXT_Utf16be );
				588	assert( enc2==TEXT_Utf8 \|\| enc2==TEXT_Utf16le \|\| enc2==TEXT_Utf16be );
				589	assert(
				590	(enc1==TEXT_Utf8 && (enc2==TEXT_Utf16le \|\| enc2==TEXT_Utf16be)) \|\|
				591	(enc2==TEXT_Utf8 && (enc1==TEXT_Utf16le \|\| enc1==TEXT_Utf16be))
				592	);
danielk1977	4adee20	2004-05-08 08:23:19 +0000	[diff] [blame]	593
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	594	if( enc1==TEXT_Utf8 ){
				595	if( enc2==TEXT_Utf16le ){
				596	*zOut = sqlite3utf8to16le(zData, nData);
				597	}else{
				598	*zOut = sqlite3utf8to16be(zData, nData);
				599	}
				600	if( !(*zOut) ) return SQLITE_NOMEM;
danielk1977	c572ef7	2004-05-27 09:28:41 +0000	[diff] [blame]	601	nOut = sqlite3utf16ByteLen(zOut, -1);
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	602	}else{
				603	*zOut = sqlite3utf16to8(zData, nData, enc1==TEXT_Utf16be);
				604	if( !(*zOut) ) return SQLITE_NOMEM;
danielk1977	c572ef7	2004-05-27 09:28:41 +0000	[diff] [blame]	605	nOut = strlen(zOut);
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	606	}
				607	return SQLITE_OK;
				608	}