Blame - src/utf.c - chromium.googlesource.com/chromium/deps/sqlite

blob: 5ee017c0dc0034e7e8fc5a897fc08de63c20fc5c [file] [log] [blame]

drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	1	/*
				2	** 2004 April 13
				3	**
				4	** The author disclaims copyright to this source code. In place of
				5	** a legal notice, here is a blessing:
				6	**
				7	** May you do good and not evil.
				8	** May you find forgiveness for yourself and forgive others.
				9	** May you share freely, never taking more than you give.
				10	**
				11	*************************************************************************
				12	** This file contains routines used to translate between UTF-8,
				13	** UTF-16, UTF-16BE, and UTF-16LE.
				14	**
danielk1977	dc8453f	2004-06-12 00:42:34 +0000	[diff] [blame]	15	** $Id: utf.c,v 1.19 2004/06/12 00:42:35 danielk1977 Exp $
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	16	**
				17	** Notes on UTF-8:
				18	**
				19	** Byte-0 Byte-1 Byte-2 Byte-3 Value
				20	** 0xxxxxxx 00000000 00000000 0xxxxxxx
				21	** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx
				22	** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx
				23	** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx
				24	**
				25	**
				26	** Notes on UTF-16: (with wwww+1==uuuuu)
				27	**
drh	51846b5	2004-05-28 16:00:21 +0000	[diff] [blame]	28	** Word-0 Word-1 Value
				29	** 110110ww wwzzzzyy 110111yy yyxxxxxx 000uuuuu zzzzyyyy yyxxxxxx
				30	** zzzzyyyy yyxxxxxx 00000000 zzzzyyyy yyxxxxxx
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	31	**
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	32	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	33	** BOM or Byte Order Mark:
				34	** 0xff 0xfe little-endian utf-16 follows
				35	** 0xfe 0xff big-endian utf-16 follows
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	36	**
				37	**
				38	** Handling of malformed strings:
				39	**
				40	** SQLite accepts and processes malformed strings without an error wherever
				41	** possible. However this is not possible when converting between UTF-8 and
				42	** UTF-16.
				43	**
				44	** When converting malformed UTF-8 strings to UTF-16, one instance of the
				45	** replacement character U+FFFD is written for each byte that cannot be
				46	** interpreted as part of a valid unicode character.
				47	**
				48	** When converting malformed UTF-16 strings to UTF-8, one instance of the
				49	** replacement character U+FFFD is written for each pair of bytes that
				50	** cannot be interpreted as part of a valid unicode character.
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	51	*/
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	52	#include <assert.h>
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	53	#include "sqliteInt.h"
				54
				55	typedef struct UtfString UtfString;
				56	struct UtfString {
				57	unsigned char pZ; / Raw string data */
				58	int n; /* Allocated length of pZ in bytes */
				59	int c; /* Number of pZ bytes already read or written */
				60	};
				61
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	62	/*
				63	** These two macros are used to interpret the first two bytes of the
				64	** unsigned char array pZ as a 16-bit unsigned int. BE16() for a big-endian
				65	** interpretation, LE16() for little-endian.
				66	*/
				67	#define BE16(pZ) (((u16)((pZ)[0])<<8) + (u16)((pZ)[1]))
				68	#define LE16(pZ) (((u16)((pZ)[1])<<8) + (u16)((pZ)[0]))
				69
				70	/*
				71	** READ_16 interprets the first two bytes of the unsigned char array pZ
				72	** as a 16-bit unsigned int. If big_endian is non-zero the intepretation
				73	** is big-endian, otherwise little-endian.
				74	*/
				75	#define READ_16(pZ,big_endian) (big_endian?BE16(pZ):LE16(pZ))
				76
				77	/*
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	78	** The following macro, LOWERCASE(x), takes an integer representing a
				79	** unicode code point. The value returned is the same code point folded to
				80	** lower case, if applicable. SQLite currently understands the upper/lower
				81	** case relationship between the 26 characters used in the English
				82	** language only.
				83	**
				84	** This means that characters with umlauts etc. will not be folded
				85	** correctly (unless they are encoded as composite characters, which would
				86	** doubtless cause much trouble).
				87	*/
				88	#define LOWERCASE(x) (x<91?(int)(UpperToLower[x]):x);
				89	static unsigned char UpperToLower[91] = {
				90	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
				91	18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
				92	36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
				93	54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
				94	104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
				95	122,
				96	};
				97
				98	/*
				99	** The first parameter, zStr, points at a unicode string. This routine
				100	** reads a single character from the string and returns the codepoint value
				101	** of the character read.
				102	**
danielk1977	dc8453f	2004-06-12 00:42:34 +0000	[diff] [blame]	103	** The value of pEnc is the string encoding. If pEnc is SQLITE_UTF16LE or
				104	** SQLITE_UTF16BE, and the first character read is a byte-order-mark, then
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	105	** the value of *pEnc is modified if necessary. In this case the next
				106	** character is read and it's code-point value returned.
				107	**
				108	** The value of *pOffset is the byte-offset in zStr from which to begin
				109	** reading. It is incremented by the number of bytes read by this function.
				110	**
				111	** If the fourth parameter, fold, is non-zero, then codepoint values are
				112	** folded to lower-case before being returned. See comments for macro
				113	** LOWERCASE(x) for details.
				114	*/
				115	int sqlite3ReadUniChar(const char zStr, int pOffset, u8 *pEnc, int fold){
				116	int ret = 0;
				117
				118	switch( *pEnc ){
danielk1977	dc8453f	2004-06-12 00:42:34 +0000	[diff] [blame]	119	case SQLITE_UTF8: {
danielk1977	ad7dd42	2004-06-06 12:41:49 +0000	[diff] [blame]	120
				121	#if 0
				122	static const int initVal[] = {
				123	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
				124	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
				125	30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
				126	45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
				127	60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
				128	75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
				129	90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
				130	105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
				131	120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
				132	135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
				133	150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
				134	165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
				135	180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 0, 1, 2,
				136	3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
				137	18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,
				138	1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
				139	0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 254,
				140	255,
				141	};
				142	ret = initVal[(unsigned char)zStr[(*pOffset)++]];
				143	while( (0xc0&zStr[*pOffset])==0x80 ){
				144	ret = (ret<<6) \| (0x3f&(zStr[(*pOffset)++]));
				145	}
				146	#endif
				147
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	148	struct Utf8TblRow {
				149	u8 b1_mask;
				150	u8 b1_masked_val;
				151	u8 b1_value_mask;
				152	int trailing_bytes;
				153	};
				154	static const struct Utf8TblRow utf8tbl[] = {
				155	{ 0x80, 0x00, 0x7F, 0 },
				156	{ 0xE0, 0xC0, 0x1F, 1 },
				157	{ 0xF0, 0xE0, 0x0F, 2 },
				158	{ 0xF8, 0xF0, 0x0E, 3 },
				159	{ 0, 0, 0, 0}
				160	};
				161
				162	u8 b1; /* First byte of the potentially multi-byte utf-8 character */
				163	int ii;
				164	struct Utf8TblRow const *pRow;
				165
				166	pRow = &(utf8tbl[0]);
				167
				168	b1 = zStr[(*pOffset)++];
				169	while( pRow->b1_mask && (b1&pRow->b1_mask)!=pRow->b1_masked_val ){
				170	pRow++;
				171	}
				172	if( !pRow->b1_mask ){
				173	return (int)0xFFFD;
				174	}
				175
				176	ret = (u32)(b1&pRow->b1_value_mask);
				177	for( ii=0; ii<pRow->trailing_bytes; ii++ ){
				178	u8 b = zStr[(*pOffset)++];
				179	if( (b&0xC0)!=0x80 ){
				180	return (int)0xFFFD;
				181	}
				182	ret = (ret<<6) + (u32)(b&0x3F);
				183	}
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	184	break;
				185	}
				186
danielk1977	dc8453f	2004-06-12 00:42:34 +0000	[diff] [blame]	187	case SQLITE_UTF16LE:
				188	case SQLITE_UTF16BE: {
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	189	u32 code_point; /* the first code-point in the character */
				190	u32 code_point2; /* the second code-point in the character, if any */
				191
danielk1977	dc8453f	2004-06-12 00:42:34 +0000	[diff] [blame]	192	code_point = READ_16(&zStr[pOffset], (pEnc==SQLITE_UTF16BE));
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	193	*pOffset += 2;
				194
				195	/* If this is a non-surrogate code-point, just cast it to an int and
				196	** this is the code-point value.
				197	*/
				198	if( code_point<0xD800 \|\| code_point>0xE000 ){
				199	ret = code_point;
				200	break;
				201	}
				202
				203	/* If this is a trailing surrogate code-point, then the string is
				204	** malformed; return the replacement character.
				205	*/
				206	if( code_point>0xDBFF ){
				207	return (int)0xFFFD;
				208	}
				209
				210	/* The code-point just read is a leading surrogate code-point. If their
				211	** is not enough data left or the next code-point is not a trailing
				212	** surrogate, return the replacement character.
				213	*/
danielk1977	dc8453f	2004-06-12 00:42:34 +0000	[diff] [blame]	214	code_point2 = READ_16(&zStr[pOffset], (pEnc==SQLITE_UTF16BE));
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	215	*pOffset += 2;
				216	if( code_point2<0xDC00 \|\| code_point>0xDFFF ){
				217	return (int)0xFFFD;
				218	}
				219
				220	ret = (
				221	(((code_point&0x03C0)+0x0040)<<16) + /* uuuuu */
				222	((code_point&0x003F)<<10) + /* xxxxxx */
				223	(code_point2&0x03FF) /* yy yyyyyyyy */
				224	);
				225	}
				226	default:
				227	assert(0);
				228	}
				229
				230	if( fold ){
				231	return LOWERCASE(ret);
				232	}
				233	return ret;
				234	}
				235
				236	/*
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	237	** Read the BOM from the start of *pStr, if one is present. Return zero
				238	** for little-endian, non-zero for big-endian. If no BOM is present, return
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	239	** the value of the parameter "big_endian".
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	240	**
				241	** Return values:
				242	** 1 -> big-endian string
				243	** 0 -> little-endian string
				244	*/
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	245	static int readUtf16Bom(UtfString *pStr, int big_endian){
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	246	/* The BOM must be the first thing read from the string */
				247	assert( pStr->c==0 );
				248
				249	/* If the string data consists of 1 byte or less, the BOM will make no
				250	** difference anyway. In this case just fall through to the default case
				251	** and return the native byte-order for this machine.
				252	**
				253	** Otherwise, check the first 2 bytes of the string to see if a BOM is
				254	** present.
				255	*/
				256	if( pStr->n>1 ){
danielk1977	193c72f	2004-06-02 00:29:24 +0000	[diff] [blame]	257	u8 bom = sqlite3UtfReadBom(pStr->pZ, 2);
				258	if( bom ){
				259	pStr->c += 2;
danielk1977	dc8453f	2004-06-12 00:42:34 +0000	[diff] [blame]	260	return (bom==SQLITE_UTF16LE)?0:1;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	261	}
				262	}
				263
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	264	return big_endian;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	265	}
				266
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	267	/*
				268	** zData is a UTF-16 encoded string, nData bytes in length. This routine
				269	** checks if there is a byte-order mark at the start of zData. If no
danielk1977	dc8453f	2004-06-12 00:42:34 +0000	[diff] [blame]	270	** byte order mark is found 0 is returned. Otherwise SQLITE_UTF16BE or
				271	** SQLITE_UTF16LE is returned, depending on whether The BOM indicates that
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	272	** the text is big-endian or little-endian.
				273	*/
				274	u8 sqlite3UtfReadBom(const void *zData, int nData){
				275	if( nData<0 \|\| nData>1 ){
				276	u8 b1 = (u8 )zData;
				277	u8 b2 = (((u8 )zData) + 1);
				278	if( b1==0xFE && b2==0xFF ){
danielk1977	dc8453f	2004-06-12 00:42:34 +0000	[diff] [blame]	279	return SQLITE_UTF16BE;
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	280	}
				281	if( b1==0xFF && b2==0xFE ){
danielk1977	dc8453f	2004-06-12 00:42:34 +0000	[diff] [blame]	282	return SQLITE_UTF16LE;
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	283	}
				284	}
				285	return 0;
				286	}
				287
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	288
				289	/*
				290	** Read a single unicode character from the UTF-8 encoded string *pStr. The
				291	** value returned is a unicode scalar value. In the case of malformed
				292	** strings, the unicode replacement character U+FFFD may be returned.
				293	*/
				294	static u32 readUtf8(UtfString *pStr){
danielk1977	dc8453f	2004-06-12 00:42:34 +0000	[diff] [blame]	295	u8 enc = SQLITE_UTF8;
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	296	return sqlite3ReadUniChar(pStr->pZ, &pStr->c, &enc, 0);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	297	}
				298
				299	/*
				300	** Write the unicode character 'code' to the string pStr using UTF-8
				301	** encoding. SQLITE_NOMEM may be returned if sqlite3Malloc() fails.
				302	*/
				303	static int writeUtf8(UtfString *pStr, u32 code){
				304	struct Utf8WriteTblRow {
				305	u32 max_code;
				306	int trailing_bytes;
				307	u8 b1_and_mask;
				308	u8 b1_or_mask;
				309	};
				310	static const struct Utf8WriteTblRow utf8tbl[] = {
				311	{0x0000007F, 0, 0x7F, 0x00},
				312	{0x000007FF, 1, 0xDF, 0xC0},
				313	{0x0000FFFF, 2, 0xEF, 0xE0},
				314	{0x0010FFFF, 3, 0xF7, 0xF0},
				315	{0x00000000, 0, 0x00, 0x00}
				316	};
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	317	const struct Utf8WriteTblRow *pRow = &utf8tbl[0];
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	318
danielk1977	295ba55	2004-05-19 10:34:51 +0000	[diff] [blame]	319	while( code>pRow->max_code ){
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	320	assert( pRow->max_code );
				321	pRow++;
				322	}
				323
				324	/* Ensure there is enough room left in the output buffer to write
				325	** this UTF-8 character.
				326	*/
				327	assert( (pStr->n-pStr->c)>=(pRow->trailing_bytes+1) );
				328
				329	/* Write the UTF-8 encoded character to pStr. All cases below are
				330	** intentionally fall-through.
				331	*/
				332	switch( pRow->trailing_bytes ){
				333	case 3:
				334	pStr->pZ[pStr->c+3] = (((u8)code)&0x3F)\|0x80;
				335	code = code>>6;
				336	case 2:
				337	pStr->pZ[pStr->c+2] = (((u8)code)&0x3F)\|0x80;
				338	code = code>>6;
				339	case 1:
				340	pStr->pZ[pStr->c+1] = (((u8)code)&0x3F)\|0x80;
				341	code = code>>6;
				342	case 0:
				343	pStr->pZ[pStr->c] = (((u8)code)&(pRow->b1_and_mask))\|(pRow->b1_or_mask);
				344	}
				345	pStr->c += (pRow->trailing_bytes + 1);
				346
				347	return 0;
				348	}
				349
				350	/*
				351	** Read a single unicode character from the UTF-16 encoded string *pStr. The
				352	** value returned is a unicode scalar value. In the case of malformed
				353	** strings, the unicode replacement character U+FFFD may be returned.
				354	**
				355	** If big_endian is true, the string is assumed to be UTF-16BE encoded.
				356	** Otherwise, it is UTF-16LE encoded.
				357	*/
				358	static u32 readUtf16(UtfString *pStr, int big_endian){
				359	u32 code_point; /* the first code-point in the character */
				360
				361	/* If there is only one byte of data left in the string, return the
				362	** replacement character.
				363	*/
				364	if( (pStr->n-pStr->c)==1 ){
				365	pStr->c++;
				366	return (int)0xFFFD;
				367	}
				368
				369	code_point = READ_16(&(pStr->pZ[pStr->c]), big_endian);
				370	pStr->c += 2;
				371
				372	/* If this is a non-surrogate code-point, just cast it to an int and
				373	** return the code-point value.
				374	*/
				375	if( code_point<0xD800 \|\| code_point>0xE000 ){
				376	return code_point;
				377	}
				378
				379	/* If this is a trailing surrogate code-point, then the string is
				380	** malformed; return the replacement character.
				381	*/
				382	if( code_point>0xDBFF ){
				383	return 0xFFFD;
				384	}
				385
				386	/* The code-point just read is a leading surrogate code-point. If their
				387	** is not enough data left or the next code-point is not a trailing
				388	** surrogate, return the replacement character.
				389	*/
				390	if( (pStr->n-pStr->c)>1 ){
				391	u32 code_point2 = READ_16(&pStr->pZ[pStr->c], big_endian);
				392	if( code_point2<0xDC00 \|\| code_point>0xDFFF ){
				393	return 0xFFFD;
				394	}
				395	pStr->c += 2;
				396
				397	return (
				398	(((code_point&0x03C0)+0x0040)<<16) + /* uuuuu */
				399	((code_point&0x003F)<<10) + /* xxxxxx */
				400	(code_point2&0x03FF) /* yy yyyyyyyy */
				401	);
				402
				403	}else{
				404	return (int)0xFFFD;
				405	}
				406
				407	/* not reached */
				408	}
				409
				410	static int writeUtf16(UtfString *pStr, int code, int big_endian){
				411	int bytes;
				412	unsigned char *hi_byte;
				413	unsigned char *lo_byte;
				414
				415	bytes = (code>0x0000FFFF?4:2);
				416
				417	/* Ensure there is enough room left in the output buffer to write
				418	** this UTF-8 character.
				419	*/
				420	assert( (pStr->n-pStr->c)>=bytes );
				421
				422	/* Initialise hi_byte and lo_byte to point at the locations into which
				423	** the MSB and LSB of the (first) 16-bit unicode code-point written for
				424	** this character.
				425	*/
				426	hi_byte = (big_endian?&pStr->pZ[pStr->c]:&pStr->pZ[pStr->c+1]);
				427	lo_byte = (big_endian?&pStr->pZ[pStr->c+1]:&pStr->pZ[pStr->c]);
				428
				429	if( bytes==2 ){
				430	*hi_byte = (u8)((code&0x0000FF00)>>8);
				431	*lo_byte = (u8)(code&0x000000FF);
				432	}else{
				433	u32 wrd;
				434	wrd = ((((code&0x001F0000)-0x00010000)+(code&0x0000FC00))>>10)\|0x0000D800;
				435	*hi_byte = (u8)((wrd&0x0000FF00)>>8);
				436	*lo_byte = (u8)(wrd&0x000000FF);
				437
				438	wrd = (code&0x000003FF)\|0x0000DC00;
				439	*(hi_byte+2) = (u8)((wrd&0x0000FF00)>>8);
				440	*(lo_byte+2) = (u8)(wrd&0x000000FF);
				441	}
				442
				443	pStr->c += bytes;
				444
				445	return 0;
				446	}
				447
				448	/*
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	449	** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
				450	** return the number of unicode characters in pZ up to (but not including)
				451	** the first 0x00 byte. If nByte is not less than zero, return the
				452	** number of unicode characters in the first nByte of pZ (or up to
				453	** the first 0x00, whichever comes first).
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	454	*/
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	455	int sqlite3utf8CharLen(const char *pZ, int nByte){
				456	UtfString str;
				457	int ret = 0;
				458	u32 code = 1;
				459
				460	str.pZ = (char *)pZ;
				461	str.n = nByte;
				462	str.c = 0;
				463
				464	while( (nByte<0 \|\| str.c<str.n) && code!=0 ){
				465	code = readUtf8(&str);
				466	ret++;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	467	}
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	468	if( code==0 ) ret--;
				469
				470	return ret;
				471	}
				472
				473	/*
				474	** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
				475	** return the number of bytes up to (but not including), the first pair
				476	** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
				477	** then return the number of bytes in the first nChar unicode characters
				478	** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
				479	*/
				480	int sqlite3utf16ByteLen(const void *pZ, int nChar){
				481	if( nChar<0 ){
danielk1977	e7d00f5	2004-05-29 02:44:02 +0000	[diff] [blame]	482	const unsigned char pC1 = (unsigned char )pZ;
				483	const unsigned char pC2 = (unsigned char )pZ+1;
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	484	while( pC1 \|\| pC2 ){
				485	pC1 += 2;
				486	pC2 += 2;
				487	}
				488	return pC1-(unsigned char *)pZ;
				489	}else{
				490	UtfString str;
				491	u32 code = 1;
				492	int big_endian;
				493	int nRead = 0;
				494	int ret;
				495
				496	str.pZ = (char *)pZ;
				497	str.c = 0;
				498	str.n = -1;
				499
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	500	/* Check for a BOM. We just ignore it if there is one, it's only read
				501	** so that it is not counted as a character.
				502	*/
				503	big_endian = readUtf16Bom(&str, 0);
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	504	ret = 0-str.c;
				505
				506	while( code!=0 && nRead<nChar ){
				507	code = readUtf16(&str, big_endian);
				508	nRead++;
				509	}
				510	if( code==0 ){
				511	ret -= 2;
				512	}
				513	return str.c + ret;
				514	}
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	515	}
				516
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	517	/*
				518	** Convert a string in UTF-16 native byte (or with a Byte-order-mark or
				519	** "BOM") into a UTF-8 string. The UTF-8 string is written into space
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	520	** obtained from sqlite3Malloc() and must be released by the calling function.
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	521	**
				522	** The parameter N is the number of bytes in the UTF-16 string. If N is
				523	** negative, the entire string up to the first \u0000 character is translated.
				524	**
				525	** The returned UTF-8 string is always \000 terminated.
				526	*/
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	527	unsigned char sqlite3utf16to8(const void pData, int N, int big_endian){
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	528	UtfString in;
				529	UtfString out;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	530
				531	out.pZ = 0;
				532
				533	in.pZ = (unsigned char *)pData;
				534	in.n = N;
				535	in.c = 0;
				536
				537	if( in.n<0 ){
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	538	in.n = sqlite3utf16ByteLen(in.pZ, -1);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	539	}
				540
				541	/* A UTF-8 encoding of a unicode string can require at most 1.5 times as
				542	** much space to store as the same string encoded using UTF-16. Allocate
				543	** this now.
				544	*/
				545	out.n = (in.n*1.5) + 1;
danielk1977	295ba55	2004-05-19 10:34:51 +0000	[diff] [blame]	546	out.pZ = sqliteMalloc(out.n);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	547	if( !out.pZ ){
				548	return 0;
				549	}
				550	out.c = 0;
				551
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	552	big_endian = readUtf16Bom(&in, big_endian);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	553	while( in.c<in.n ){
				554	writeUtf8(&out, readUtf16(&in, big_endian));
				555	}
				556
				557	/* Add the NULL-terminator character */
				558	assert( out.c<out.n );
				559	out.pZ[out.c] = 0x00;
				560
				561	return out.pZ;
				562	}
				563
				564	static void utf8toUtf16(const unsigned char pIn, int N, int big_endian){
				565	UtfString in;
				566	UtfString out;
				567
				568	in.pZ = (unsigned char *)pIn;
				569	in.n = N;
				570	in.c = 0;
				571
				572	if( in.n<0 ){
				573	in.n = strlen(in.pZ);
				574	}
				575
				576	/* A UTF-16 encoding of a unicode string can require at most twice as
				577	** much space to store as the same string encoded using UTF-8. Allocate
				578	** this now.
				579	*/
				580	out.n = (in.n*2) + 2;
danielk1977	295ba55	2004-05-19 10:34:51 +0000	[diff] [blame]	581	out.pZ = sqliteMalloc(out.n);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	582	if( !out.pZ ){
				583	return 0;
				584	}
				585	out.c = 0;
				586
				587	while( in.c<in.n ){
				588	writeUtf16(&out, readUtf8(&in), big_endian);
				589	}
				590
				591	/* Add the NULL-terminator character */
				592	assert( (out.c+1)<out.n );
				593	out.pZ[out.c] = 0x00;
				594	out.pZ[out.c+1] = 0x00;
				595
				596	return out.pZ;
				597	}
				598
				599	/*
				600	** Translate UTF-8 to UTF-16BE or UTF-16LE
				601	*/
				602	void sqlite3utf8to16be(const unsigned char pIn, int N){
				603	return utf8toUtf16(pIn, N, 1);
				604	}
				605
				606	void sqlite3utf8to16le(const unsigned char pIn, int N){
				607	return utf8toUtf16(pIn, N, 0);
				608	}
				609
				610	/*
				611	** This routine does the work for sqlite3utf16to16le() and
				612	** sqlite3utf16to16be(). If big_endian is 1 the input string is
				613	** transformed in place to UTF-16BE encoding. If big_endian is 0 then
				614	** the input is transformed to UTF-16LE.
				615	**
				616	** Unless the first two bytes of the input string is a BOM, the input is
				617	** assumed to be UTF-16 encoded using the machines native byte ordering.
				618	*/
				619	static void utf16to16(void *pData, int N, int big_endian){
				620	UtfString inout;
				621	inout.pZ = (unsigned char *)pData;
				622	inout.c = 0;
				623	inout.n = N;
				624
				625	if( inout.n<0 ){
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	626	inout.n = sqlite3utf16ByteLen(inout.pZ, -1);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	627	}
				628
drh	9c05483	2004-05-31 18:51:57 +0000	[diff] [blame]	629	if( readUtf16Bom(&inout, SQLITE_BIGENDIAN)!=big_endian ){
danielk1977	295ba55	2004-05-19 10:34:51 +0000	[diff] [blame]	630	/* swab(&inout.pZ[inout.c], inout.pZ, inout.n-inout.c); */
				631	int i;
				632	for(i=0; i<(inout.n-inout.c); i += 2){
				633	char c1 = inout.pZ[i+inout.c];
				634	char c2 = inout.pZ[i+inout.c+1];
				635	inout.pZ[i] = c2;
				636	inout.pZ[i+1] = c1;
				637	}
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	638	}else if( inout.c ){
				639	memmove(inout.pZ, &inout.pZ[inout.c], inout.n-inout.c);
				640	}
danielk1977	295ba55	2004-05-19 10:34:51 +0000	[diff] [blame]	641
				642	inout.pZ[inout.n-inout.c] = 0x00;
				643	inout.pZ[inout.n-inout.c+1] = 0x00;
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	644	}
				645
				646	/*
				647	** Convert a string in UTF-16 native byte or with a BOM into a UTF-16LE
				648	** string. The conversion occurs in-place. The output overwrites the
				649	** input. N bytes are converted. If N is negative everything is converted
				650	** up to the first \u0000 character.
				651	**
				652	** If the native byte order is little-endian and there is no BOM, then
				653	** this routine is a no-op. If there is a BOM at the start of the string,
				654	** it is removed.
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	655	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	656	** Translation from UTF-16LE to UTF-16BE and back again is accomplished
				657	** using the library function swab().
				658	*/
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	659	void sqlite3utf16to16le(void *pData, int N){
				660	utf16to16(pData, N, 0);
				661	}
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	662
				663	/*
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	664	** Convert a string in UTF-16 native byte or with a BOM into a UTF-16BE
				665	** string. The conversion occurs in-place. The output overwrites the
				666	** input. N bytes are converted. If N is negative everything is converted
				667	** up to the first \u0000 character.
				668	**
				669	** If the native byte order is little-endian and there is no BOM, then
				670	** this routine is a no-op. If there is a BOM at the start of the string,
				671	** it is removed.
				672	**
				673	** Translation from UTF-16LE to UTF-16BE and back again is accomplished
				674	** using the library function swab().
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	675	*/
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	676	void sqlite3utf16to16be(void *pData, int N){
				677	utf16to16(pData, N, 1);
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	678	}
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	679
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	680	/*
				681	** This function is used to translate between UTF-8 and UTF-16. The
				682	** result is returned in dynamically allocated memory.
				683	*/
				684	int sqlite3utfTranslate(
drh	eb2e176	2004-05-27 01:53:56 +0000	[diff] [blame]	685	const void zData, int nData, / Input string */
				686	u8 enc1, /* Encoding of zData */
				687	void *zOut, int nOut, /* Output string */
				688	u8 enc2 /* Desired encoding of output */
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	689	){
danielk1977	dc8453f	2004-06-12 00:42:34 +0000	[diff] [blame]	690	assert( enc1==SQLITE_UTF8 \|\| enc1==SQLITE_UTF16LE \|\| enc1==SQLITE_UTF16BE );
				691	assert( enc2==SQLITE_UTF8 \|\| enc2==SQLITE_UTF16LE \|\| enc2==SQLITE_UTF16BE );
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	692	assert(
danielk1977	dc8453f	2004-06-12 00:42:34 +0000	[diff] [blame]	693	(enc1==SQLITE_UTF8 && (enc2==SQLITE_UTF16LE \|\| enc2==SQLITE_UTF16BE)) \|\|
				694	(enc2==SQLITE_UTF8 && (enc1==SQLITE_UTF16LE \|\| enc1==SQLITE_UTF16BE))
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	695	);
danielk1977	4adee20	2004-05-08 08:23:19 +0000	[diff] [blame]	696
danielk1977	dc8453f	2004-06-12 00:42:34 +0000	[diff] [blame]	697	if( enc1==SQLITE_UTF8 ){
				698	if( enc2==SQLITE_UTF16LE ){
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	699	*zOut = sqlite3utf8to16le(zData, nData);
				700	}else{
				701	*zOut = sqlite3utf8to16be(zData, nData);
				702	}
				703	if( !(*zOut) ) return SQLITE_NOMEM;
danielk1977	c572ef7	2004-05-27 09:28:41 +0000	[diff] [blame]	704	nOut = sqlite3utf16ByteLen(zOut, -1);
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	705	}else{
danielk1977	dc8453f	2004-06-12 00:42:34 +0000	[diff] [blame]	706	*zOut = sqlite3utf16to8(zData, nData, enc1==SQLITE_UTF16BE);
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	707	if( !(*zOut) ) return SQLITE_NOMEM;
danielk1977	c572ef7	2004-05-27 09:28:41 +0000	[diff] [blame]	708	nOut = strlen(zOut);
danielk1977	b1bc953	2004-05-22 03:05:33 +0000	[diff] [blame]	709	}
				710	return SQLITE_OK;
				711	}