Blame - src/utf.c - chromium.googlesource.com/chromium/deps/sqlite

blob: 4bb08b5e07e885d097e42fd38b86e11c356ef235 [file] [log] [blame]

drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	1	/*
				2	** 2004 April 13
				3	**
				4	** The author disclaims copyright to this source code. In place of
				5	** a legal notice, here is a blessing:
				6	**
				7	** May you do good and not evil.
				8	** May you find forgiveness for yourself and forgive others.
				9	** May you share freely, never taking more than you give.
				10	**
				11	*************************************************************************
				12	** This file contains routines used to translate between UTF-8,
				13	** UTF-16, UTF-16BE, and UTF-16LE.
				14	**
danielk1977	d7e6964	2004-06-23 00:23:49 +0000	[diff] [blame^]	15	** $Id: utf.c,v 1.24 2004/06/23 00:23:49 danielk1977 Exp $
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	16	**
				17	** Notes on UTF-8:
				18	**
				19	** Byte-0 Byte-1 Byte-2 Byte-3 Value
				20	** 0xxxxxxx 00000000 00000000 0xxxxxxx
				21	** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx
				22	** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx
				23	** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx
				24	**
				25	**
				26	** Notes on UTF-16: (with wwww+1==uuuuu)
				27	**
drh	51846b5	2004-05-28 16:00:21 +0000	[diff] [blame]	28	** Word-0 Word-1 Value
				29	** 110110ww wwzzzzyy 110111yy yyxxxxxx 000uuuuu zzzzyyyy yyxxxxxx
				30	** zzzzyyyy yyxxxxxx 00000000 zzzzyyyy yyxxxxxx
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	31	**
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	32	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	33	** BOM or Byte Order Mark:
				34	** 0xff 0xfe little-endian utf-16 follows
				35	** 0xfe 0xff big-endian utf-16 follows
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	36	**
				37	**
				38	** Handling of malformed strings:
				39	**
				40	** SQLite accepts and processes malformed strings without an error wherever
				41	** possible. However this is not possible when converting between UTF-8 and
				42	** UTF-16.
				43	**
				44	** When converting malformed UTF-8 strings to UTF-16, one instance of the
				45	** replacement character U+FFFD is written for each byte that cannot be
				46	** interpreted as part of a valid unicode character.
				47	**
				48	** When converting malformed UTF-16 strings to UTF-8, one instance of the
				49	** replacement character U+FFFD is written for each pair of bytes that
				50	** cannot be interpreted as part of a valid unicode character.
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	51	**
				52	** This file contains the following public routines:
				53	**
				54	** sqlite3VdbeMemTranslate() - Translate the encoding used by a Mem* string.
				55	** sqlite3VdbeMemHandleBom() - Handle byte-order-marks in UTF16 Mem* strings.
				56	** sqlite3utf16ByteLen() - Calculate byte-length of a void* UTF16 string.
				57	** sqlite3utf8CharLen() - Calculate char-length of a char* UTF8 string.
				58	** sqlite3utf8LikeCompare() - Do a LIKE match given two UTF8 char* strings.
				59	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	60	*/
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	61	#include <assert.h>
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	62	#include "sqliteInt.h"
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	63	#include "vdbeInt.h"
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	64
				65	/*
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	66	** The following macro, LOWERCASE(x), takes an integer representing a
				67	** unicode code point. The value returned is the same code point folded to
				68	** lower case, if applicable. SQLite currently understands the upper/lower
				69	** case relationship between the 26 characters used in the English
				70	** language only.
				71	**
				72	** This means that characters with umlauts etc. will not be folded
				73	** correctly (unless they are encoded as composite characters, which would
				74	** doubtless cause much trouble).
				75	*/
danielk1977	3f6b087	2004-06-17 05:36:44 +0000	[diff] [blame]	76	#define LOWERCASE(x) (x<91?(int)(UpperToLower[x]):x)
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	77	static unsigned char UpperToLower[91] = {
				78	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
				79	18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
				80	36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
				81	54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
				82	104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
				83	122,
				84	};
				85
				86	/*
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	87	** This table maps from the first byte of a UTF-8 character to the number
				88	** of trailing bytes expected. A value '255' indicates that the table key
				89	** is not a legal first byte for a UTF-8 character.
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	90	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	91	static const u8 xtra_utf8_bytes[256] = {
				92	/* 0xxxxxxx: single-byte character, no trailing bytes */
				93	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				94	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				95	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				96	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				97	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				98	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				99	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				100	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	101
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	102	/* 10wwwwww: continuation byte, not a legal first byte (255) */
				103	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
				104	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
				105	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
				106	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
danielk1977	ad7dd42	2004-06-06 12:41:49 +0000	[diff] [blame]	107
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	108	/* 110yyyyy: one trailing byte */
				109	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				110	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				111
				112	/* 1110zzzz: two trailing bytes */
				113	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				114
				115	/* 11110uuu: three trailing bytes; 11111xxx: not a legal first byte (255) */
				116	3, 3, 3, 3, 3, 3, 3, 3, 255, 255, 255, 255, 255, 255, 255, 255,
				117	};
				118
				119	/*
				120	** This table maps from the number of trailing bytes in a UTF-8 character
				121	** to an integer constant that is effectively calculated for each character
				122	** read by a naive implementation of a UTF-8 character reader. The code
				123	** in the READ_UTF8 macro explains things best.
				124	*/
				125	static const int xtra_utf8_bits[4] = {
				126	0, /* no trailing bytes: READ_UTF8 never subtracts this entry */
				127	12416, /* (0xC0 << 6) + (0x80): 110/10 marker bits of a 2-byte sequence */
				128	925824, /* (0xE0 << 12) + (0x80 << 6) + (0x80): marker bits of a 3-byte sequence */
				129	63447168 /* (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80: marker bits of a 4-byte sequence */
				130	};
				131
/*
** READ_UTF8(zIn, c): read one UTF-8 encoded character starting at zIn,
** store its code point in c and advance zIn past the bytes consumed.
**
** The first byte selects the number of trailing bytes via xtra_utf8_bytes[].
** A first byte marked 255 in that table yields U+FFFD and consumes only
** that byte. Otherwise each trailing byte is shifted in six bits at a time
** (the switch cases fall through on purpose) and the accumulated marker
** bits are removed by subtracting xtra_utf8_bits[xtra].
**
** The macro does not check that trailing bytes have the form 10xxxxxx and
** has no end-of-input argument, so a sequence truncated at the end of the
** buffer makes it read beyond the caller's limit. Because c is used as an
** index into a 256-entry table, zIn must point to unsigned bytes.
*/
				132	#define READ_UTF8(zIn, c) { \
				133	int xtra; \
				134	c = *(zIn)++; \
				135	xtra = xtra_utf8_bytes[c]; \
				136	switch( xtra ){ \
				137	case 255: c = (int)0xFFFD; break; \
				138	case 3: c = (c<<6) + *(zIn)++; /* fall through */ \
				139	case 2: c = (c<<6) + *(zIn)++; /* fall through */ \
				140	case 1: c = (c<<6) + *(zIn)++; \
				141	c -= xtra_utf8_bits[xtra]; \
				142	} \
				143	}
				144
				145	#define SKIP_UTF8(zIn) { \
				146	zIn += (xtra_utf8_bytes[(u8 )zIn] + 1); \
				147	}
				148
				149	#define WRITE_UTF8(zOut, c) { \
				150	if( c<0x00080 ){ \
				151	*zOut++ = (c&0xFF); \
				152	} \
				153	else if( c<0x00800 ){ \
				154	*zOut++ = 0xC0 + ((c>>6)&0x1F); \
				155	*zOut++ = 0x80 + (c & 0x3F); \
				156	} \
				157	else if( c<0x10000 ){ \
				158	*zOut++ = 0xE0 + ((c>>12)&0x0F); \
				159	*zOut++ = 0x80 + ((c>>6) & 0x3F); \
				160	*zOut++ = 0x80 + (c & 0x3F); \
				161	}else{ \
				162	*zOut++ = 0xF0 + ((c>>18) & 0x07); \
				163	*zOut++ = 0x80 + ((c>>12) & 0x3F); \
				164	*zOut++ = 0x80 + ((c>>6) & 0x3F); \
				165	*zOut++ = 0x80 + (c & 0x3F); \
				166	} \
				167	}
				168
				169	#define WRITE_UTF16LE(zOut, c) { \
				170	if( c<=0xFFFF ){ \
				171	*zOut++ = (c&0x00FF); \
				172	*zOut++ = ((c>>8)&0x00FF); \
				173	}else{ \
				174	*zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \
				175	*zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \
				176	*zOut++ = (c&0x00FF); \
				177	*zOut++ = (0x00DC + ((c>>8)&0x03)); \
				178	} \
				179	}
				180
				181	#define WRITE_UTF16BE(zOut, c) { \
				182	if( c<=0xFFFF ){ \
				183	*zOut++ = ((c>>8)&0x00FF); \
				184	*zOut++ = (c&0x00FF); \
				185	}else{ \
				186	*zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \
				187	*zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \
				188	*zOut++ = (0x00DC + ((c>>8)&0x03)); \
				189	*zOut++ = (c&0x00FF); \
				190	} \
				191	}
				192
				193	#define READ_UTF16LE(zIn, c){ \
				194	c = (*zIn++); \
				195	c += ((*zIn++)<<8); \
				196	if( c>=0xD800 && c<=0xE000 ){ \
				197	int c2 = (*zIn++); \
				198	c2 += ((*zIn++)<<8); \
				199	c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
				200	} \
				201	}
				202
				203	#define READ_UTF16BE(zIn, c){ \
				204	c = ((*zIn++)<<8); \
				205	c += (*zIn++); \
				206	if( c>=0xD800 && c<=0xE000 ){ \
				207	int c2 = ((*zIn++)<<8); \
				208	c2 += (*zIn++); \
				209	c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
				210	} \
				211	}
				212
				213	/*
				214	** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
				215	** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
				216	*/
				217	/* #define TRANSLATE_TRACE 1 */
				218
				219	/*
				220	** This routine transforms the internal text encoding used by pMem to
				221	** desiredEnc. It is an error if the string is already of the desired
				222	** encoding, or if *pMem does not contain a string value.
				223	*/
				224	int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
				225	unsigned char zShort[NBFS]; /* Temporary short output buffer */
				226	int len; /* Maximum length of output string in bytes */
				227	unsigned char zOut; / Output buffer */
				228	unsigned char zIn; / Input iterator */
				229	unsigned char zTerm; / End of input */
				230	unsigned char z; / Output iterator */
				231	int c;
				232
				233	assert( pMem->flags&MEM_Str );
				234	assert( pMem->enc!=desiredEnc );
				235	assert( pMem->enc!=0 );
				236	assert( pMem->n>=0 );
				237
				238	#ifdef TRANSLATE_TRACE
				239	{
				240	char zBuf[100];
				241	sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
				242	fprintf(stderr, "INPUT: %s\n", zBuf);
danielk1977	ad7dd42	2004-06-06 12:41:49 +0000	[diff] [blame]	243	}
				244	#endif
				245
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	246	/* If the translation is between UTF-16 little and big endian, then
				247	** all that is required is to swap the byte order. This case is handled
				248	** differently from the others.
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	249	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	250	if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
				251	u8 temp;
				252	sqlite3VdbeMemMakeWriteable(pMem);
				253	zIn = pMem->z;
				254	zTerm = &zIn[pMem->n];
				255	while( zIn<zTerm ){
				256	temp = *zIn;
				257	zIn = (zIn+1);
				258	zIn++;
				259	*zIn++ = temp;
				260	}
				261	pMem->enc = desiredEnc;
				262	goto translate_out;
				263	}
				264
danielk1977	d7e6964	2004-06-23 00:23:49 +0000	[diff] [blame^]	265	/* Set len to the maximum number of bytes required in the output buffer. */
				266	if( desiredEnc==SQLITE_UTF8 ){
				267	/* When converting from UTF-16, the maximum growth results from
				268	** translating a 2-byte character to a 3-byte UTF-8 character (i.e.
				269	** code-point 0xFFFC). A single byte is required for the output string
				270	** nul-terminator.
				271	*/
				272	len = (pMem->n/2) * 3 + 1;
				273	}else{
				274	/* When converting from UTF-8 to UTF-16 the maximum growth is caused
				275	** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
				276	** character. Two bytes are required in the output buffer for the
				277	** nul-terminator.
				278	*/
				279	len = pMem->n * 2 + 2;
				280	}
				281
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	282	/* Set zIn to point at the start of the input buffer and zTerm to point 1
				283	** byte past the end.
				284	**
				285	** Variable zOut is set to point at the output buffer. This may be space
				286	** obtained from malloc(), or Mem.zShort, if it large enough and not in
				287	** use, or the zShort array on the stack (see above).
				288	*/
				289	zIn = pMem->z;
				290	zTerm = &zIn[pMem->n];
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	291	if( len>NBFS ){
				292	zOut = sqliteMallocRaw(len);
				293	if( !zOut ) return SQLITE_NOMEM;
				294	}else{
				295	if( pMem->z==pMem->zShort ){
				296	zOut = zShort;
				297	}else{
				298	zOut = pMem->zShort;
				299	}
				300	}
				301	z = zOut;
				302
				303	if( pMem->enc==SQLITE_UTF8 ){
				304	if( desiredEnc==SQLITE_UTF16LE ){
				305	/* UTF-8 -> UTF-16 Little-endian */
				306	while( zIn<zTerm ){
				307	READ_UTF8(zIn, c);
				308	WRITE_UTF16LE(z, c);
				309	}
				310	WRITE_UTF16LE(z, 0);
				311	pMem->n = (z-zOut)-2;
				312	}else if( desiredEnc==SQLITE_UTF16BE ){
				313	/* UTF-8 -> UTF-16 Big-endian */
				314	while( zIn<zTerm ){
				315	READ_UTF8(zIn, c);
				316	WRITE_UTF16BE(z, c);
				317	}
				318	WRITE_UTF16BE(z, 0);
				319	pMem->n = (z-zOut)-2;
				320	}
				321	}else{
				322	assert( desiredEnc==SQLITE_UTF8 );
				323	if( pMem->enc==SQLITE_UTF16LE ){
				324	/* UTF-16 Little-endian -> UTF-8 */
				325	while( zIn<zTerm ){
				326	READ_UTF16LE(zIn, c);
				327	WRITE_UTF8(z, c);
				328	}
				329	WRITE_UTF8(z, 0);
				330	pMem->n = (z-zOut)-1;
				331	}else{
				332	/* UTF-16 Little-endian -> UTF-8 */
				333	while( zIn<zTerm ){
				334	READ_UTF16BE(zIn, c);
				335	WRITE_UTF8(z, c);
				336	}
				337	WRITE_UTF8(z, 0);
				338	pMem->n = (z-zOut)-1;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	339	}
				340	}
danielk1977	d7e6964	2004-06-23 00:23:49 +0000	[diff] [blame^]	341	assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	342
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	343	sqlite3VdbeMemRelease(pMem);
				344	pMem->flags &= ~(MEM_Static\|MEM_Dyn\|MEM_Ephem\|MEM_Short);
				345	pMem->enc = desiredEnc;
				346	if( (char *)zOut==pMem->zShort ){
				347	pMem->flags \|= (MEM_Term\|MEM_Short);
				348	}else if( zOut==zShort ){
				349	memcpy(pMem->zShort, zOut, len);
				350	zOut = pMem->zShort;
				351	pMem->flags \|= (MEM_Term\|MEM_Short);
				352	}else{
				353	pMem->flags \|= (MEM_Term\|MEM_Dyn);
				354	}
				355	pMem->z = zOut;
				356
				357	translate_out:
				358	#ifdef TRANSLATE_TRACE
				359	{
				360	char zBuf[100];
				361	sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
				362	fprintf(stderr, "OUTPUT: %s\n", zBuf);
				363	}
				364	#endif
				365	return SQLITE_OK;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	366	}
				367
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	368	/*
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	369	** This routine checks for a byte-order mark at the beginning of the
				370	** UTF-16 string stored in *pMem. If one is present, it is removed and
				371	** the encoding of the Mem adjusted. This routine does not do any
				372	** byte-swapping, it just sets Mem.enc appropriately.
				373	**
				374	** The allocation (static, dynamic etc.) and encoding of the Mem may be
				375	** changed by this function.
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	376	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	377	int sqlite3VdbeMemHandleBom(Mem *pMem){
				378	int rc = SQLITE_OK;
				379	u8 bom = 0;
				380
				381	if( pMem->n<0 \|\| pMem->n>1 ){
				382	u8 b1 = (u8 )pMem->z;
				383	u8 b2 = (((u8 )pMem->z) + 1);
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	384	if( b1==0xFE && b2==0xFF ){
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	385	bom = SQLITE_UTF16BE;
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	386	}
				387	if( b1==0xFF && b2==0xFE ){
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	388	bom = SQLITE_UTF16LE;
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	389	}
				390	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	391
				392	if( bom ){
				393	if( pMem->flags & MEM_Short ){
				394	memmove(pMem->zShort, &pMem->zShort[2], NBFS-2);
				395	pMem->n -= 2;
				396	pMem->enc = bom;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	397	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	398	else if( pMem->flags & MEM_Dyn ){
				399	void (xDel)(void) = pMem->xDel;
				400	char *z = pMem->z;
				401	pMem->z = 0;
				402	pMem->xDel = 0;
				403	rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT);
				404	if( xDel ){
				405	xDel(z);
				406	}else{
				407	sqliteFree(z);
				408	}
				409	}else{
				410	rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom,
				411	SQLITE_TRANSIENT);
				412	}
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	413	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	414	return rc;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	415	}
				416
				417	/*
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	418	** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
				419	** return the number of unicode characters in pZ up to (but not including)
				420	** the first 0x00 byte. If nByte is not less than zero, return the
				421	** number of unicode characters in the first nByte of pZ (or up to
				422	** the first 0x00, whichever comes first).
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	423	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	424	int sqlite3utf8CharLen(const char *z, int nByte){
				425	int r = 0;
				426	const char *zTerm;
				427	if( nByte>0 ){
				428	zTerm = &z[nByte];
				429	}else{
				430	zTerm = (const char *)(-1);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	431	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	432	assert( z<=zTerm );
				433	while( *z!=0 && z<zTerm ){
				434	SKIP_UTF8(z);
				435	r++;
				436	}
				437	return r;
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	438	}
				439
				440	/*
				441	** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
				442	** return the number of bytes up to (but not including), the first pair
				443	** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
				444	** then return the number of bytes in the first nChar unicode characters
				445	** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
				446	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	447	int sqlite3utf16ByteLen(const void *zIn, int nChar){
				448	int c = 1;
				449	char const *z = zIn;
				450	int n = 0;
				451	if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
				452	while( c && ((nChar<0) \|\| n<nChar) ){
				453	READ_UTF16BE(z, c);
				454	n++;
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	455	}
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	456	}else{
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	457	while( c && ((nChar<0) \|\| n<nChar) ){
				458	READ_UTF16LE(z, c);
				459	n++;
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	460	}
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	461	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	462	return (z-(char const *)zIn)-((c==0)?2:0);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	463	}
				464
/*
** Compare two UTF-8 strings for equality using the "LIKE" operator of
** SQL. The '%' character matches any sequence of 0 or more
** characters and '_' matches any single character. Case is
** not significant (see LOWERCASE() for which characters are folded).
**
** zPattern is the nul-terminated LIKE pattern and zString the
** nul-terminated text to test. Returns 1 if zString matches, 0 otherwise.
*/
int sqlite3utf8LikeCompare(
  const unsigned char *zPattern,
  const unsigned char *zString
){
  int c;
  int c2;

  while( (c = LOWERCASE(*zPattern))!=0 ){
    switch( c ){
      case '%': {
        /* Absorb any run of '%' and '_' that follows. Each '_' must
        ** consume exactly one character of zString. */
        while( (c=zPattern[1]) == '%' || c == '_' ){
          if( c=='_' ){
            if( *zString==0 ) return 0;
            SKIP_UTF8(zString);
          }
          zPattern++;
        }
        /* A trailing '%' matches whatever is left of zString. */
        if( c==0 ) return 1;
        c = LOWERCASE(c);
        /* Try to match the rest of the pattern at every position in
        ** zString where the next literal pattern byte occurs. */
        while( (c2=LOWERCASE(*zString))!=0 ){
          while( c2 != 0 && c2 != c ){
            zString++;
            c2 = LOWERCASE(*zString);
          }
          if( c2==0 ) return 0;
          if( sqlite3utf8LikeCompare(&zPattern[1],zString) ) return 1;
          SKIP_UTF8(zString);
        }
        return 0;
      }
      case '_': {
        if( *zString==0 ) return 0;
        SKIP_UTF8(zString);
        zPattern++;
        break;
      }
      default: {
        /* Literal byte: must match after case folding. */
        if( c != LOWERCASE(*zString) ) return 0;
        zPattern++;
        zString++;
        break;
      }
    }
  }
  /* The pattern is exhausted: it matches only if zString is too. */
  return *zString==0;
}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	517
drh	38f8271	2004-06-18 17:10:16 +0000	[diff] [blame]	518	#if defined(SQLITE_TEST)
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	519	/*
				520	** This routine is called from the TCL test function "translate_selftest".
				521	** It checks that the primitives for serializing and deserializing
				522	** characters in each encoding are inverses of each other.
				523	*/
				524	void sqlite3utfSelfTest(){
				525	int i;
				526	unsigned char zBuf[20];
				527	unsigned char *z;
				528	int n;
				529	int c;
				530
/* UTF-8 round trip: write code point i, read it back, and check that the
** value and the number of bytes consumed agree.
** NOTE(review): the "0 &&" in the loop condition makes it false on the
** first test, so this loop body never runs. Confirm whether that is
** intentional. */
				531	for(i=0; 0 && i<0x00110000; i++){
				532	z = zBuf;
				533	WRITE_UTF8(z, i);
				534	n = z-zBuf;
				535	z = zBuf;
				536	READ_UTF8(z, c);
				537	assert( c==i );
				538	assert( (z-zBuf)==n );
				539	}
/* UTF-16 little-endian round trip over every code point below 0x110000.
** Values in the surrogate range are skipped because they cannot be read
** back as a single 16-bit word.
** NOTE(review): the upper bound of the skipped range is inclusive, so
** U+E000 is not exercised either. */
				540	for(i=0; i<0x00110000; i++){
				541	if( i>=0xD800 && i<=0xE000 ) continue;
				542	z = zBuf;
				543	WRITE_UTF16LE(z, i);
				544	n = z-zBuf;
				545	z = zBuf;
				546	READ_UTF16LE(z, c);
				547	assert( c==i );
				548	assert( (z-zBuf)==n );
				549	}
/* UTF-16 big-endian round trip, with the same skipped range as above. */
				550	for(i=0; i<0x00110000; i++){
				551	if( i>=0xD800 && i<=0xE000 ) continue;
				552	z = zBuf;
				553	WRITE_UTF16BE(z, i);
				554	n = z-zBuf;
				555	z = zBuf;
				556	READ_UTF16BE(z, c);
				557	assert( c==i );
				558	assert( (z-zBuf)==n );
				559	}
				560	}
				561	#endif