Blame - src/utf.c - chromium.googlesource.com/chromium/deps/sqlite

blob: 65dd05e4a3981e5dba864cda0e2289e7326d9eba [file] [log] [blame]

drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	1	/*
				2	** 2004 April 13
				3	**
				4	** The author disclaims copyright to this source code. In place of
				5	** a legal notice, here is a blessing:
				6	**
				7	** May you do good and not evil.
				8	** May you find forgiveness for yourself and forgive others.
				9	** May you share freely, never taking more than you give.
				10	**
				11	*************************************************************************
				12	** This file contains routines used to translate between UTF-8,
				13	** UTF-16, UTF-16BE, and UTF-16LE.
				14	**
danielk1977	193c72f	2004-06-02 00:29:24 +0000	[diff] [blame^]	15	** $Id: utf.c,v 1.16 2004/06/02 00:29:24 danielk1977 Exp $
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	16	**
				17	** Notes on UTF-8:
				18	**
				19	**    Byte-0    Byte-1    Byte-2    Byte-3   Value
				20	**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
				21	**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
				22	**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
				23	**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
				24	**
				25	**
				26	** Notes on UTF-16: (with wwww+1==uuuuu)
				27	**
drh	51846b5	2004-05-28 16:00:21 +0000	[diff] [blame]	28	**       Word-0             Word-1          Value
				29	**  110110ww wwzzzzyy  110111yy yyxxxxxx   000uuuuu zzzzyyyy yyxxxxxx
				30	**  zzzzyyyy yyxxxxxx                      00000000 zzzzyyyy yyxxxxxx
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	31	**
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	32	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	33	** BOM or Byte Order Mark:
				34	** 0xff 0xfe little-endian utf-16 follows
				35	** 0xfe 0xff big-endian utf-16 follows
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	36	**
				37	**
				38	** Handling of malformed strings:
				39	**
				40	** SQLite accepts and processes malformed strings without an error wherever
				41	** possible. However this is not possible when converting between UTF-8 and
				42	** UTF-16.
				43	**
				44	** When converting malformed UTF-8 strings to UTF-16, one instance of the
				45	** replacement character U+FFFD is written for each byte that cannot be
				46	** interpreted as part of a valid unicode character.
				47	**
				48	** When converting malformed UTF-16 strings to UTF-8, one instance of the
				49	** replacement character U+FFFD is written for each pair of bytes that
				50	** cannot be interpreted as part of a valid unicode character.
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	51	*/
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	52	#include <assert.h>
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	53	#include "sqliteInt.h"
				54
				55	typedef struct UtfString UtfString;
				56	struct UtfString {
				57	unsigned char pZ; / Raw string data */
				58	int n; /* Allocated length of pZ in bytes */
				59	int c; /* Number of pZ bytes already read or written */
				60	};
				61
/*
** BE16() and LE16() combine the two bytes pZ[0] and pZ[1] of an unsigned
** char array into a single 16-bit quantity. BE16() treats pZ[0] as the
** most significant byte (big-endian); LE16() treats pZ[1] as the most
** significant byte (little-endian).
**
** Both macros evaluate pZ twice, so the argument must be free of side
** effects.
*/
#define BE16(pZ)  ((((u16)((pZ)[0]))<<8) | ((u16)((pZ)[1])))
#define LE16(pZ)  ((((u16)((pZ)[1]))<<8) | ((u16)((pZ)[0])))
				69
				70	/*
				71	** READ_16 interprets the first two bytes of the unsigned char array pZ
				72	** as a 16-bit unsigned int. If big_endian is non-zero the intepretation
				73	** is big-endian, otherwise little-endian.
				74	*/
				75	#define READ_16(pZ,big_endian) (big_endian?BE16(pZ):LE16(pZ))
				76
				77	/*
				78	** Read the BOM from the start of *pStr, if one is present. Return zero
				79	** for little-endian, non-zero for big-endian. If no BOM is present, return
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	80	** the value of the parameter "big_endian".
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	81	**
				82	** Return values:
				83	** 1 -> big-endian string
				84	** 0 -> little-endian string
				85	*/
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	86	static int readUtf16Bom(UtfString *pStr, int big_endian){
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	87	/* The BOM must be the first thing read from the string */
				88	assert( pStr->c==0 );
				89
				90	/* If the string data consists of 1 byte or less, the BOM will make no
				91	** difference anyway. In this case just fall through to the default case
				92	** and return the native byte-order for this machine.
				93	**
				94	** Otherwise, check the first 2 bytes of the string to see if a BOM is
				95	** present.
				96	*/
				97	if( pStr->n>1 ){
danielk1977	193c72f	2004-06-02 00:29:24 +0000	[diff] [blame^]	98	u8 bom = sqlite3UtfReadBom(pStr->pZ, 2);
				99	if( bom ){
				100	pStr->c += 2;
				101	return (bom==TEXT_Utf16le)?0:1;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	102	}
				103	}
				104
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	105	return big_endian;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	106	}
				107
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	108	/*
				109	** zData is a UTF-16 encoded string, nData bytes in length. This routine
				110	** checks if there is a byte-order mark at the start of zData. If no
				111	** byte order mark is found 0 is returned. Otherwise TEXT_Utf16be or
				112	** TEXT_Utf16le is returned, depending on whether The BOM indicates that
				113	** the text is big-endian or little-endian.
				114	*/
				115	u8 sqlite3UtfReadBom(const void *zData, int nData){
				116	if( nData<0 \|\| nData>1 ){
				117	u8 b1 = (u8 )zData;
				118	u8 b2 = (((u8 )zData) + 1);
				119	if( b1==0xFE && b2==0xFF ){
				120	return TEXT_Utf16be;
				121	}
				122	if( b1==0xFF && b2==0xFE ){
				123	return TEXT_Utf16le;
				124	}
				125	}
				126	return 0;
				127	}
				128
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	129
				130	/*
				131	** Read a single unicode character from the UTF-8 encoded string *pStr. The
				132	** value returned is a unicode scalar value. In the case of malformed
				133	** strings, the unicode replacement character U+FFFD may be returned.
				134	*/
				135	static u32 readUtf8(UtfString *pStr){
				136	struct Utf8TblRow {
				137	u8 b1_mask;
				138	u8 b1_masked_val;
				139	u8 b1_value_mask;
				140	int trailing_bytes;
				141	};
				142	static const struct Utf8TblRow utf8tbl[] = {
				143	{ 0x80, 0x00, 0x7F, 0 },
				144	{ 0xE0, 0xC0, 0x1F, 1 },
				145	{ 0xF0, 0xE0, 0x0F, 2 },
				146	{ 0xF8, 0xF0, 0x0E, 3 },
				147	{ 0, 0, 0, 0}
				148	};
				149
				150	u8 b1; /* First byte of the potentially multi-byte utf-8 character */
				151	u32 ret = 0; /* Return value */
				152	int ii;
				153	struct Utf8TblRow const *pRow;
				154
				155	pRow = &(utf8tbl[0]);
				156
				157	b1 = pStr->pZ[pStr->c];
				158	pStr->c++;
				159	while( pRow->b1_mask && (b1&pRow->b1_mask)!=pRow->b1_masked_val ){
				160	pRow++;
				161	}
				162	if( !pRow->b1_mask ){
				163	return 0xFFFD;
				164	}
				165
				166	ret = (u32)(b1&pRow->b1_value_mask);
				167	for( ii=0; ii<pRow->trailing_bytes; ii++ ){
				168	u8 b = pStr->pZ[pStr->c+ii];
				169	if( (b&0xC0)!=0x80 ){
				170	return 0xFFFD;
				171	}
				172	ret = (ret<<6) + (u32)(b&0x3F);
				173	}
				174
				175	pStr->c += pRow->trailing_bytes;
				176	return ret;
				177	}
				178
				179	/*
				180	** Write the unicode character 'code' to the string pStr using UTF-8
				181	** encoding. SQLITE_NOMEM may be returned if sqlite3Malloc() fails.
				182	*/
				183	static int writeUtf8(UtfString *pStr, u32 code){
				184	struct Utf8WriteTblRow {
				185	u32 max_code;
				186	int trailing_bytes;
				187	u8 b1_and_mask;
				188	u8 b1_or_mask;
				189	};
				190	static const struct Utf8WriteTblRow utf8tbl[] = {
				191	{0x0000007F, 0, 0x7F, 0x00},
				192	{0x000007FF, 1, 0xDF, 0xC0},
				193	{0x0000FFFF, 2, 0xEF, 0xE0},
				194	{0x0010FFFF, 3, 0xF7, 0xF0},
				195	{0x00000000, 0, 0x00, 0x00}
				196	};
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	197	const struct Utf8WriteTblRow *pRow = &utf8tbl[0];
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	198
danielk1977	295ba55	2004-05-19 10:34:51 +0000	[diff] [blame]	199	while( code>pRow->max_code ){
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	200	assert( pRow->max_code );
				201	pRow++;
				202	}
				203
				204	/* Ensure there is enough room left in the output buffer to write
				205	** this UTF-8 character.
				206	*/
				207	assert( (pStr->n-pStr->c)>=(pRow->trailing_bytes+1) );
				208
				209	/* Write the UTF-8 encoded character to pStr. All cases below are
				210	** intentionally fall-through.
				211	*/
				212	switch( pRow->trailing_bytes ){
				213	case 3:
				214	pStr->pZ[pStr->c+3] = (((u8)code)&0x3F)\|0x80;
				215	code = code>>6;
				216	case 2:
				217	pStr->pZ[pStr->c+2] = (((u8)code)&0x3F)\|0x80;
				218	code = code>>6;
				219	case 1:
				220	pStr->pZ[pStr->c+1] = (((u8)code)&0x3F)\|0x80;
				221	code = code>>6;
				222	case 0:
				223	pStr->pZ[pStr->c] = (((u8)code)&(pRow->b1_and_mask))\|(pRow->b1_or_mask);
				224	}
				225	pStr->c += (pRow->trailing_bytes + 1);
				226
				227	return 0;
				228	}
				229
				230	/*
				231	** Read a single unicode character from the UTF-16 encoded string *pStr. The
				232	** value returned is a unicode scalar value. In the case of malformed
				233	** strings, the unicode replacement character U+FFFD may be returned.
				234	**
				235	** If big_endian is true, the string is assumed to be UTF-16BE encoded.
				236	** Otherwise, it is UTF-16LE encoded.
				237	*/
				238	static u32 readUtf16(UtfString *pStr, int big_endian){
				239	u32 code_point; /* the first code-point in the character */
				240
				241	/* If there is only one byte of data left in the string, return the
				242	** replacement character.
				243	*/
				244	if( (pStr->n-pStr->c)==1 ){
				245	pStr->c++;
				246	return (int)0xFFFD;
				247	}
				248
				249	code_point = READ_16(&(pStr->pZ[pStr->c]), big_endian);
				250	pStr->c += 2;
				251
				252	/* If this is a non-surrogate code-point, just cast it to an int and
				253	** return the code-point value.
				254	*/
				255	if( code_point<0xD800 \|\| code_point>0xE000 ){
				256	return code_point;
				257	}
				258
				259	/* If this is a trailing surrogate code-point, then the string is
				260	** malformed; return the replacement character.
				261	*/
				262	if( code_point>0xDBFF ){
				263	return 0xFFFD;
				264	}
				265
				266	/* The code-point just read is a leading surrogate code-point. If their
				267	** is not enough data left or the next code-point is not a trailing
				268	** surrogate, return the replacement character.
				269	*/
				270	if( (pStr->n-pStr->c)>1 ){
				271	u32 code_point2 = READ_16(&pStr->pZ[pStr->c], big_endian);
				272	if( code_point2<0xDC00 \|\| code_point>0xDFFF ){
				273	return 0xFFFD;
				274	}
				275	pStr->c += 2;
				276
				277	return (
				278	(((code_point&0x03C0)+0x0040)<<16) + /* uuuuu */
				279	((code_point&0x003F)<<10) + /* xxxxxx */
				280	(code_point2&0x03FF) /* yy yyyyyyyy */
				281	);
				282
				283	}else{
				284	return (int)0xFFFD;
				285	}
				286
				287	/* not reached */
				288	}
				289
				290	static int writeUtf16(UtfString *pStr, int code, int big_endian){
				291	int bytes;
				292	unsigned char *hi_byte;
				293	unsigned char *lo_byte;
				294
				295	bytes = (code>0x0000FFFF?4:2);
				296
				297	/* Ensure there is enough room left in the output buffer to write
				298	** this UTF-8 character.
				299	*/
				300	assert( (pStr->n-pStr->c)>=bytes );
				301
				302	/* Initialise hi_byte and lo_byte to point at the locations into which
				303	** the MSB and LSB of the (first) 16-bit unicode code-point written for
				304	** this character.
				305	*/
				306	hi_byte = (big_endian?&pStr->pZ[pStr->c]:&pStr->pZ[pStr->c+1]);
				307	lo_byte = (big_endian?&pStr->pZ[pStr->c+1]:&pStr->pZ[pStr->c]);
				308
				309	if( bytes==2 ){
				310	*hi_byte = (u8)((code&0x0000FF00)>>8);
				311	*lo_byte = (u8)(code&0x000000FF);
				312	}else{
				313	u32 wrd;
				314	wrd = ((((code&0x001F0000)-0x00010000)+(code&0x0000FC00))>>10)\|0x0000D800;
				315	*hi_byte = (u8)((wrd&0x0000FF00)>>8);
				316	*lo_byte = (u8)(wrd&0x000000FF);
				317
				318	wrd = (code&0x000003FF)\|0x0000DC00;
				319	*(hi_byte+2) = (u8)((wrd&0x0000FF00)>>8);
				320	*(lo_byte+2) = (u8)(wrd&0x000000FF);
				321	}
				322
				323	pStr->c += bytes;
				324
				325	return 0;
				326	}
				327
				328	/*
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	329	** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
				330	** return the number of unicode characters in pZ up to (but not including)
				331	** the first 0x00 byte. If nByte is not less than zero, return the
				332	** number of unicode characters in the first nByte of pZ (or up to
				333	** the first 0x00, whichever comes first).
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	334	*/
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	335	int sqlite3utf8CharLen(const char *pZ, int nByte){
				336	UtfString str;
				337	int ret = 0;
				338	u32 code = 1;
				339
				340	str.pZ = (char *)pZ;
				341	str.n = nByte;
				342	str.c = 0;
				343
				344	while( (nByte<0 \|\| str.c<str.n) && code!=0 ){
				345	code = readUtf8(&str);
				346	ret++;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	347	}
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	348	if( code==0 ) ret--;
				349
				350	return ret;
				351	}
				352
				353	/*
				354	** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
				355	** return the number of bytes up to (but not including), the first pair
				356	** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
				357	** then return the number of bytes in the first nChar unicode characters
				358	** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
				359	*/
				360	int sqlite3utf16ByteLen(const void *pZ, int nChar){
				361	if( nChar<0 ){
danielk1977	e7d00f5	2004-05-29 02:44:02 +0000	[diff] [blame]	362	const unsigned char pC1 = (unsigned char )pZ;
				363	const unsigned char pC2 = (unsigned char )pZ+1;
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	364	while( pC1 \|\| pC2 ){
				365	pC1 += 2;
				366	pC2 += 2;
				367	}
				368	return pC1-(unsigned char *)pZ;
				369	}else{
				370	UtfString str;
				371	u32 code = 1;
				372	int big_endian;
				373	int nRead = 0;
				374	int ret;
				375
				376	str.pZ = (char *)pZ;
				377	str.c = 0;
				378	str.n = -1;
				379
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	380	/* Check for a BOM. We just ignore it if there is one, it's only read
				381	** so that it is not counted as a character.
				382	*/
				383	big_endian = readUtf16Bom(&str, 0);
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	384	ret = 0-str.c;
				385
				386	while( code!=0 && nRead<nChar ){
				387	code = readUtf16(&str, big_endian);
				388	nRead++;
				389	}
				390	if( code==0 ){
				391	ret -= 2;
				392	}
				393	return str.c + ret;
				394	}
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	395	}
				396
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	397	/*
				398	** Convert a string in UTF-16 native byte (or with a Byte-order-mark or
				399	** "BOM") into a UTF-8 string. The UTF-8 string is written into space
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	400	** obtained from sqlite3Malloc() and must be released by the calling function.
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	401	**
				402	** The parameter N is the number of bytes in the UTF-16 string. If N is
				403	** negative, the entire string up to the first \u0000 character is translated.
				404	**
				405	** The returned UTF-8 string is always \000 terminated.
				406	*/
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	407	unsigned char sqlite3utf16to8(const void pData, int N, int big_endian){
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	408	UtfString in;
				409	UtfString out;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	410
				411	out.pZ = 0;
				412
				413	in.pZ = (unsigned char *)pData;
				414	in.n = N;
				415	in.c = 0;
				416
				417	if( in.n<0 ){
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	418	in.n = sqlite3utf16ByteLen(in.pZ, -1);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	419	}
				420
				421	/* A UTF-8 encoding of a unicode string can require at most 1.5 times as
				422	** much space to store as the same string encoded using UTF-16. Allocate
				423	** this now.
				424	*/
				425	out.n = (in.n*1.5) + 1;
danielk1977	295ba55	2004-05-19 10:34:51 +0000	[diff] [blame]	426	out.pZ = sqliteMalloc(out.n);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	427	if( !out.pZ ){
				428	return 0;
				429	}
				430	out.c = 0;
				431
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	432	big_endian = readUtf16Bom(&in, big_endian);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	433	while( in.c<in.n ){
				434	writeUtf8(&out, readUtf16(&in, big_endian));
				435	}
				436
				437	/* Add the NULL-terminator character */
				438	assert( out.c<out.n );
				439	out.pZ[out.c] = 0x00;
				440
				441	return out.pZ;
				442	}
				443
				444	static void utf8toUtf16(const unsigned char pIn, int N, int big_endian){
				445	UtfString in;
				446	UtfString out;
				447
				448	in.pZ = (unsigned char *)pIn;
				449	in.n = N;
				450	in.c = 0;
				451
				452	if( in.n<0 ){
				453	in.n = strlen(in.pZ);
				454	}
				455
				456	/* A UTF-16 encoding of a unicode string can require at most twice as
				457	** much space to store as the same string encoded using UTF-8. Allocate
				458	** this now.
				459	*/
				460	out.n = (in.n*2) + 2;
danielk1977	295ba55	2004-05-19 10:34:51 +0000	[diff] [blame]	461	out.pZ = sqliteMalloc(out.n);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	462	if( !out.pZ ){
				463	return 0;
				464	}
				465	out.c = 0;
				466
				467	while( in.c<in.n ){
				468	writeUtf16(&out, readUtf8(&in), big_endian);
				469	}
				470
				471	/* Add the NULL-terminator character */
				472	assert( (out.c+1)<out.n );
				473	out.pZ[out.c] = 0x00;
				474	out.pZ[out.c+1] = 0x00;
				475
				476	return out.pZ;
				477	}
				478
				479	/*
				480	** Translate UTF-8 to UTF-16BE or UTF-16LE
				481	*/
				482	void sqlite3utf8to16be(const unsigned char pIn, int N){
				483	return utf8toUtf16(pIn, N, 1);
				484	}
				485
				486	void sqlite3utf8to16le(const unsigned char pIn, int N){
				487	return utf8toUtf16(pIn, N, 0);
				488	}
				489
				490	/*
				491	** This routine does the work for sqlite3utf16to16le() and
				492	** sqlite3utf16to16be(). If big_endian is 1 the input string is
				493	** transformed in place to UTF-16BE encoding. If big_endian is 0 then
				494	** the input is transformed to UTF-16LE.
				495	**
				496	** Unless the first two bytes of the input string is a BOM, the input is
				497	** assumed to be UTF-16 encoded using the machines native byte ordering.
				498	*/
				499	static void utf16to16(void *pData, int N, int big_endian){
				500	UtfString inout;
				501	inout.pZ = (unsigned char *)pData;
				502	inout.c = 0;
				503	inout.n = N;
				504
				505	if( inout.n<0 ){
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	506	inout.n = sqlite3utf16ByteLen(inout.pZ, -1);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	507	}
				508
drh	9c05483	2004-05-31 18:51:57 +0000	[diff] [blame]	509	if( readUtf16Bom(&inout, SQLITE_BIGENDIAN)!=big_endian ){
danielk1977	295ba55	2004-05-19 10:34:51 +0000	[diff] [blame]	510	/* swab(&inout.pZ[inout.c], inout.pZ, inout.n-inout.c); */
				511	int i;
				512	for(i=0; i<(inout.n-inout.c); i += 2){
				513	char c1 = inout.pZ[i+inout.c];
				514	char c2 = inout.pZ[i+inout.c+1];
				515	inout.pZ[i] = c2;
				516	inout.pZ[i+1] = c1;
				517	}
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	518	}else if( inout.c ){
				519	memmove(inout.pZ, &inout.pZ[inout.c], inout.n-inout.c);
				520	}
danielk1977	295ba55	2004-05-19 10:34:51 +0000	[diff] [blame]	521
				522	inout.pZ[inout.n-inout.c] = 0x00;
				523	inout.pZ[inout.n-inout.c+1] = 0x00;
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	524	}
				525
				526	/*
				527	** Convert a string in UTF-16 native byte or with a BOM into a UTF-16LE
				528	** string. The conversion occurs in-place. The output overwrites the
				529	** input. N bytes are converted. If N is negative everything is converted
				530	** up to the first \u0000 character.
				531	**
				532	** If the native byte order is little-endian and there is no BOM, then
				533	** this routine is a no-op. If there is a BOM at the start of the string,
				534	** it is removed.
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	535	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	536	** Translation from UTF-16LE to UTF-16BE and back again is accomplished
				537	** using the library function swab().
				538	*/
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	539	void sqlite3utf16to16le(void *pData, int N){
				540	utf16to16(pData, N, 0);
				541	}
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	542
				543	/*
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	544	** Convert a string in UTF-16 native byte or with a BOM into a UTF-16BE
				545	** string. The conversion occurs in-place. The output overwrites the
				546	** input. N bytes are converted. If N is negative everything is converted
				547	** up to the first \u0000 character.
				548	**
				549	** If the native byte order is little-endian and there is no BOM, then
				550	** this routine is a no-op. If there is a BOM at the start of the string,
				551	** it is removed.
				552	**
				553	** Translation from UTF-16LE to UTF-16BE and back again is accomplished
				554	** using the library function swab().
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	555	*/
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	556	void sqlite3utf16to16be(void *pData, int N){
				557	utf16to16(pData, N, 1);
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	558	}
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	559
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	560	/*
				561	** This function is used to translate between UTF-8 and UTF-16. The
				562	** result is returned in dynamically allocated memory.
				563	*/
				564	int sqlite3utfTranslate(
drh	eb2e176	2004-05-27 01:53:56 +0000	[diff] [blame]	565	const void zData, int nData, / Input string */
				566	u8 enc1, /* Encoding of zData */
				567	void *zOut, int nOut, /* Output string */
				568	u8 enc2 /* Desired encoding of output */
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	569	){
				570	assert( enc1==TEXT_Utf8 \|\| enc1==TEXT_Utf16le \|\| enc1==TEXT_Utf16be );
				571	assert( enc2==TEXT_Utf8 \|\| enc2==TEXT_Utf16le \|\| enc2==TEXT_Utf16be );
				572	assert(
				573	(enc1==TEXT_Utf8 && (enc2==TEXT_Utf16le \|\| enc2==TEXT_Utf16be)) \|\|
				574	(enc2==TEXT_Utf8 && (enc1==TEXT_Utf16le \|\| enc1==TEXT_Utf16be))
				575	);
danielk1977	4adee20	2004-05-08 08:23:19 +0000	[diff] [blame]	576
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	577	if( enc1==TEXT_Utf8 ){
				578	if( enc2==TEXT_Utf16le ){
				579	*zOut = sqlite3utf8to16le(zData, nData);
				580	}else{
				581	*zOut = sqlite3utf8to16be(zData, nData);
				582	}
				583	if( !(*zOut) ) return SQLITE_NOMEM;
danielk1977	c572ef7	2004-05-27 09:28:41 +0000	[diff] [blame]	584	nOut = sqlite3utf16ByteLen(zOut, -1);
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	585	}else{
				586	*zOut = sqlite3utf16to8(zData, nData, enc1==TEXT_Utf16be);
				587	if( !(*zOut) ) return SQLITE_NOMEM;
danielk1977	c572ef7	2004-05-27 09:28:41 +0000	[diff] [blame]	588	nOut = strlen(zOut);
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	589	}
				590	return SQLITE_OK;
				591	}