Blame - src/utf.c - chromium.googlesource.com/chromium/deps/sqlite

blob: 98e13abf4ece87b48606c49c0934af9d63a8aef2 [file] [log] [blame]

drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	1	/*
				2	** 2004 April 13
				3	**
				4	** The author disclaims copyright to this source code. In place of
				5	** a legal notice, here is a blessing:
				6	**
				7	** May you do good and not evil.
				8	** May you find forgiveness for yourself and forgive others.
				9	** May you share freely, never taking more than you give.
				10	**
				11	*************************************************************************
				12	** This file contains routines used to translate between UTF-8,
				13	** UTF-16, UTF-16BE, and UTF-16LE.
				14	**
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	15	** $Id: utf.c,v 1.21 2004/06/18 04:24:55 danielk1977 Exp $
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	16	**
				17	** Notes on UTF-8:
				18	**
				19	** Byte-0 Byte-1 Byte-2 Byte-3 Value
				20	** 0xxxxxxx 00000000 00000000 0xxxxxxx
				21	** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx
				22	** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx
				23	** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx
				24	**
				25	**
				26	** Notes on UTF-16: (with wwww+1==uuuuu)
				27	**
drh	51846b5	2004-05-28 16:00:21 +0000	[diff] [blame]	28	** Word-0 Word-1 Value
				29	** 110110ww wwzzzzyy 110111yy yyxxxxxx 000uuuuu zzzzyyyy yyxxxxxx
				30	** zzzzyyyy yyxxxxxx 00000000 zzzzyyyy yyxxxxxx
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	31	**
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	32	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	33	** BOM or Byte Order Mark:
				34	** 0xff 0xfe little-endian utf-16 follows
				35	** 0xfe 0xff big-endian utf-16 follows
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	36	**
				37	**
				38	** Handling of malformed strings:
				39	**
				40	** SQLite accepts and processes malformed strings without an error wherever
				41	** possible. However this is not possible when converting between UTF-8 and
				42	** UTF-16.
				43	**
				44	** When converting malformed UTF-8 strings to UTF-16, one instance of the
				45	** replacement character U+FFFD is written for each byte that cannot be
				46	** interpreted as part of a valid unicode character.
				47	**
				48	** When converting malformed UTF-16 strings to UTF-8, one instance of the
				49	** replacement character U+FFFD is written for each pair of bytes that
				50	** cannot be interpreted as part of a valid unicode character.
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	51	**
				52	** This file contains the following public routines:
				53	**
				54	** sqlite3VdbeMemTranslate() - Translate the encoding used by a Mem* string.
				55	** sqlite3VdbeMemHandleBom() - Handle byte-order-marks in UTF16 Mem* strings.
				56	** sqlite3utf16ByteLen() - Calculate byte-length of a void* UTF16 string.
				57	** sqlite3utf8CharLen() - Calculate char-length of a char* UTF8 string.
				58	** sqlite3utf8LikeCompare() - Do a LIKE match given two UTF8 char* strings.
				59	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	60	*/
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	61	#include <assert.h>
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	62	#include "sqliteInt.h"
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	63	#include "vdbeInt.h"
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	64
				65	/*
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	66	** The following macro, LOWERCASE(x), takes an integer representing a
				67	** unicode code point. The value returned is the same code point folded to
				68	** lower case, if applicable. SQLite currently understands the upper/lower
				69	** case relationship between the 26 characters used in the English
				70	** language only.
				71	**
				72	** This means that characters with umlauts etc. will not be folded
				73	** correctly (unless they are encoded as composite characters, which would
				74	** doubtless cause much trouble).
				75	*/
/*
** LOWERCASE(x) yields x folded to lower case when x is one of 'A'..'Z',
** and x unchanged otherwise. Values below 91 are looked up in
** UpperToLower[]; larger values bypass the table.
**
** The argument is fully parenthesized so that expressions such as
** LOWERCASE(c & 0x7F) are compared and indexed as a whole. It is still
** evaluated more than once, so it must be free of side effects, and it
** must not be negative (it is used as an array index).
*/
#define LOWERCASE(x) ((x)<91?(int)(UpperToLower[(x)]):(x))
static const unsigned char UpperToLower[91] = {
  /* 0..63: not letters, mapped to themselves */
    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
   16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
   32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
   48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
  /* 64 ('@') unchanged, then 'A'..'O' -> 'a'..'o' */
   64,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
  /* 'P'..'Z' -> 'p'..'z' */
  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122
};
				85
				86	/*
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	87	** This table maps from the first byte of a UTF-8 character to the number
				88	** of trailing bytes expected. A value '255' indicates that the table key
				89	** is not a legal first byte for a UTF-8 character.
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	90	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	91	static const u8 xtra_utf8_bytes[256] = {
				92	/* 0xxxxxxx */
				93	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				94	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				95	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				96	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				97	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				98	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				99	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				100	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	101
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	102	/* 10wwwwww */
				103	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
				104	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
				105	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
				106	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
danielk1977	ad7dd42	2004-06-06 12:41:49 +0000	[diff] [blame]	107
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	108	/* 110yyyyy */
				109	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				110	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				111
				112	/* 1110zzzz */
				113	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				114
				115	/* 11110yyy */
				116	3, 3, 3, 3, 3, 3, 3, 3, 255, 255, 255, 255, 255, 255, 255, 255,
				117	};
				118
				119	/*
				120	** This table maps from the number of trailing bytes in a UTF-8 character
				121	** to an integer constant that is effectively calculated for each character
				122	** read by a naive implementation of a UTF-8 character reader. The code
				123	** in the READ_UTF8 macro explains things best.
				124	*/
				125	static const int xtra_utf8_bits[4] = {
				126	0,
				127	12416, /* (0xC0 << 6) + (0x80) */
				128	925824, /* (0xE0 << 12) + (0x80 << 6) + (0x80) */
				129	63447168 /* (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
				130	};
				131
				132	#define READ_UTF8(zIn, c) { \
				133	int xtra; \
				134	c = *(zIn)++; \
				135	xtra = xtra_utf8_bytes[c]; \
				136	switch( xtra ){ \
				137	case 255: c = (int)0xFFFD; break; \
				138	case 3: c = (c<<6) + *(zIn)++; \
				139	case 2: c = (c<<6) + *(zIn)++; \
				140	case 1: c = (c<<6) + *(zIn)++; \
				141	c -= xtra_utf8_bits[xtra]; \
				142	} \
				143	}
				144
				145	#define SKIP_UTF8(zIn) { \
				146	zIn += (xtra_utf8_bytes[(u8 )zIn] + 1); \
				147	}
				148
				149	#define WRITE_UTF8(zOut, c) { \
				150	if( c<0x00080 ){ \
				151	*zOut++ = (c&0xFF); \
				152	} \
				153	else if( c<0x00800 ){ \
				154	*zOut++ = 0xC0 + ((c>>6)&0x1F); \
				155	*zOut++ = 0x80 + (c & 0x3F); \
				156	} \
				157	else if( c<0x10000 ){ \
				158	*zOut++ = 0xE0 + ((c>>12)&0x0F); \
				159	*zOut++ = 0x80 + ((c>>6) & 0x3F); \
				160	*zOut++ = 0x80 + (c & 0x3F); \
				161	}else{ \
				162	*zOut++ = 0xF0 + ((c>>18) & 0x07); \
				163	*zOut++ = 0x80 + ((c>>12) & 0x3F); \
				164	*zOut++ = 0x80 + ((c>>6) & 0x3F); \
				165	*zOut++ = 0x80 + (c & 0x3F); \
				166	} \
				167	}
				168
				169	#define WRITE_UTF16LE(zOut, c) { \
				170	if( c<=0xFFFF ){ \
				171	*zOut++ = (c&0x00FF); \
				172	*zOut++ = ((c>>8)&0x00FF); \
				173	}else{ \
				174	*zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \
				175	*zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \
				176	*zOut++ = (c&0x00FF); \
				177	*zOut++ = (0x00DC + ((c>>8)&0x03)); \
				178	} \
				179	}
				180
				181	#define WRITE_UTF16BE(zOut, c) { \
				182	if( c<=0xFFFF ){ \
				183	*zOut++ = ((c>>8)&0x00FF); \
				184	*zOut++ = (c&0x00FF); \
				185	}else{ \
				186	*zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \
				187	*zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \
				188	*zOut++ = (0x00DC + ((c>>8)&0x03)); \
				189	*zOut++ = (c&0x00FF); \
				190	} \
				191	}
				192
				193	#define READ_UTF16LE(zIn, c){ \
				194	c = (*zIn++); \
				195	c += ((*zIn++)<<8); \
				196	if( c>=0xD800 && c<=0xE000 ){ \
				197	int c2 = (*zIn++); \
				198	c2 += ((*zIn++)<<8); \
				199	c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
				200	} \
				201	}
				202
				203	#define READ_UTF16BE(zIn, c){ \
				204	c = ((*zIn++)<<8); \
				205	c += (*zIn++); \
				206	if( c>=0xD800 && c<=0xE000 ){ \
				207	int c2 = ((*zIn++)<<8); \
				208	c2 += (*zIn++); \
				209	c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
				210	} \
				211	}
				212
				213	/*
				214	** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
				215	** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
				216	*/
				217	/* #define TRANSLATE_TRACE 1 */
				218
				219	/*
				220	** This routine transforms the internal text encoding used by pMem to
				221	** desiredEnc. It is an error if the string is already of the desired
				222	** encoding, or if *pMem does not contain a string value.
				223	*/
				224	int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
				225	unsigned char zShort[NBFS]; /* Temporary short output buffer */
				226	int len; /* Maximum length of output string in bytes */
				227	unsigned char zOut; / Output buffer */
				228	unsigned char zIn; / Input iterator */
				229	unsigned char zTerm; / End of input */
				230	unsigned char z; / Output iterator */
				231	int c;
				232
				233	assert( pMem->flags&MEM_Str );
				234	assert( pMem->enc!=desiredEnc );
				235	assert( pMem->enc!=0 );
				236	assert( pMem->n>=0 );
				237
				238	#ifdef TRANSLATE_TRACE
				239	{
				240	char zBuf[100];
				241	sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
				242	fprintf(stderr, "INPUT: %s\n", zBuf);
danielk1977	ad7dd42	2004-06-06 12:41:49 +0000	[diff] [blame]	243	}
				244	#endif
				245
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	246	/* If the translation is between UTF-16 little and big endian, then
				247	** all that is required is to swap the byte order. This case is handled
				248	** differently from the others.
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	249	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	250	if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
				251	u8 temp;
				252	sqlite3VdbeMemMakeWriteable(pMem);
				253	zIn = pMem->z;
				254	zTerm = &zIn[pMem->n];
				255	while( zIn<zTerm ){
				256	temp = *zIn;
				257	zIn = (zIn+1);
				258	zIn++;
				259	*zIn++ = temp;
				260	}
				261	pMem->enc = desiredEnc;
				262	goto translate_out;
				263	}
				264
				265	/* Set zIn to point at the start of the input buffer and zTerm to point 1
				266	** byte past the end.
				267	**
				268	** Variable zOut is set to point at the output buffer. This may be space
				269	** obtained from malloc(), or Mem.zShort, if it large enough and not in
				270	** use, or the zShort array on the stack (see above).
				271	*/
				272	zIn = pMem->z;
				273	zTerm = &zIn[pMem->n];
				274	len = pMem->n*2 + 2;
				275	if( len>NBFS ){
				276	zOut = sqliteMallocRaw(len);
				277	if( !zOut ) return SQLITE_NOMEM;
				278	}else{
				279	if( pMem->z==pMem->zShort ){
				280	zOut = zShort;
				281	}else{
				282	zOut = pMem->zShort;
				283	}
				284	}
				285	z = zOut;
				286
				287	if( pMem->enc==SQLITE_UTF8 ){
				288	if( desiredEnc==SQLITE_UTF16LE ){
				289	/* UTF-8 -> UTF-16 Little-endian */
				290	while( zIn<zTerm ){
				291	READ_UTF8(zIn, c);
				292	WRITE_UTF16LE(z, c);
				293	}
				294	WRITE_UTF16LE(z, 0);
				295	pMem->n = (z-zOut)-2;
				296	}else if( desiredEnc==SQLITE_UTF16BE ){
				297	/* UTF-8 -> UTF-16 Big-endian */
				298	while( zIn<zTerm ){
				299	READ_UTF8(zIn, c);
				300	WRITE_UTF16BE(z, c);
				301	}
				302	WRITE_UTF16BE(z, 0);
				303	pMem->n = (z-zOut)-2;
				304	}
				305	}else{
				306	assert( desiredEnc==SQLITE_UTF8 );
				307	if( pMem->enc==SQLITE_UTF16LE ){
				308	/* UTF-16 Little-endian -> UTF-8 */
				309	while( zIn<zTerm ){
				310	READ_UTF16LE(zIn, c);
				311	WRITE_UTF8(z, c);
				312	}
				313	WRITE_UTF8(z, 0);
				314	pMem->n = (z-zOut)-1;
				315	}else{
				316	/* UTF-16 Little-endian -> UTF-8 */
				317	while( zIn<zTerm ){
				318	READ_UTF16BE(zIn, c);
				319	WRITE_UTF8(z, c);
				320	}
				321	WRITE_UTF8(z, 0);
				322	pMem->n = (z-zOut)-1;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	323	}
				324	}
				325
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	326	sqlite3VdbeMemRelease(pMem);
				327	pMem->flags &= ~(MEM_Static\|MEM_Dyn\|MEM_Ephem\|MEM_Short);
				328	pMem->enc = desiredEnc;
				329	if( (char *)zOut==pMem->zShort ){
				330	pMem->flags \|= (MEM_Term\|MEM_Short);
				331	}else if( zOut==zShort ){
				332	memcpy(pMem->zShort, zOut, len);
				333	zOut = pMem->zShort;
				334	pMem->flags \|= (MEM_Term\|MEM_Short);
				335	}else{
				336	pMem->flags \|= (MEM_Term\|MEM_Dyn);
				337	}
				338	pMem->z = zOut;
				339
				340	translate_out:
				341	#ifdef TRANSLATE_TRACE
				342	{
				343	char zBuf[100];
				344	sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
				345	fprintf(stderr, "OUTPUT: %s\n", zBuf);
				346	}
				347	#endif
				348	return SQLITE_OK;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	349	}
				350
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	351	/*
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	352	** This routine checks for a byte-order mark at the beginning of the
				353	** UTF-16 string stored in *pMem. If one is present, it is removed and
				354	** the encoding of the Mem adjusted. This routine does not do any
				355	** byte-swapping, it just sets Mem.enc appropriately.
				356	**
				357	** The allocation (static, dynamic etc.) and encoding of the Mem may be
				358	** changed by this function.
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	359	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	360	int sqlite3VdbeMemHandleBom(Mem *pMem){
				361	int rc = SQLITE_OK;
				362	u8 bom = 0;
				363
				364	if( pMem->n<0 \|\| pMem->n>1 ){
				365	u8 b1 = (u8 )pMem->z;
				366	u8 b2 = (((u8 )pMem->z) + 1);
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	367	if( b1==0xFE && b2==0xFF ){
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	368	bom = SQLITE_UTF16BE;
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	369	}
				370	if( b1==0xFF && b2==0xFE ){
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	371	bom = SQLITE_UTF16LE;
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	372	}
				373	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	374
				375	if( bom ){
				376	if( pMem->flags & MEM_Short ){
				377	memmove(pMem->zShort, &pMem->zShort[2], NBFS-2);
				378	pMem->n -= 2;
				379	pMem->enc = bom;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	380	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	381	else if( pMem->flags & MEM_Dyn ){
				382	void (xDel)(void) = pMem->xDel;
				383	char *z = pMem->z;
				384	pMem->z = 0;
				385	pMem->xDel = 0;
				386	rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT);
				387	if( xDel ){
				388	xDel(z);
				389	}else{
				390	sqliteFree(z);
				391	}
				392	}else{
				393	rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom,
				394	SQLITE_TRANSIENT);
				395	}
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	396	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	397	return rc;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	398	}
				399
				400	/*
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	401	** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
				402	** return the number of unicode characters in pZ up to (but not including)
				403	** the first 0x00 byte. If nByte is not less than zero, return the
				404	** number of unicode characters in the first nByte of pZ (or up to
				405	** the first 0x00, whichever comes first).
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	406	*/
/*
** Count the UTF-8 characters in z.
**
** If nByte is negative, counting stops at the first 0x00 byte. Otherwise
** counting covers at most the first nByte bytes of z and still stops early
** at a 0x00 byte; nByte==0 therefore yields 0. The bound is tested before
** the byte is read, so z[nByte] itself is never examined.
*/
int sqlite3utf8CharLen(const char *z, int nByte){
  int r = 0;
  if( nByte>=0 ){
    const char *zTerm = &z[nByte];
    while( z<zTerm && *z!=0 ){
      SKIP_UTF8(z);
      r++;
    }
  }else{
    while( *z!=0 ){
      SKIP_UTF8(z);
      r++;
    }
  }
  return r;
}
				422
				423	/*
				424	** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
				425	** return the number of bytes up to (but not including), the first pair
				426	** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
				427	** then return the number of bytes in the first nChar unicode characters
				428	** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
				429	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	430	int sqlite3utf16ByteLen(const void *zIn, int nChar){
				431	int c = 1;
				432	char const *z = zIn;
				433	int n = 0;
				434	if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
				435	while( c && ((nChar<0) \|\| n<nChar) ){
				436	READ_UTF16BE(z, c);
				437	n++;
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	438	}
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	439	}else{
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	440	while( c && ((nChar<0) \|\| n<nChar) ){
				441	READ_UTF16LE(z, c);
				442	n++;
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	443	}
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	444	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	445	return (z-(char const *)zIn)-((c==0)?2:0);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	446	}
				447
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	448	/*
danielk1977	3f6b087	2004-06-17 05:36:44 +0000	[diff] [blame]	449	** Compare two UTF-8 strings for equality using the "LIKE" operator of
				450	** SQL. The '%' character matches any sequence of 0 or more
				451	** characters and '_' matches any single character. Case is
				452	** not significant.
				453	*/
				454	int sqlite3utf8LikeCompare(
				455	const unsigned char *zPattern,
				456	const unsigned char *zString
				457	){
				458	register int c;
				459	int c2;
				460
				461	while( (c = LOWERCASE(*zPattern))!=0 ){
				462	switch( c ){
				463	case '%': {
				464	while( (c=zPattern[1]) == '%' \|\| c == '_' ){
				465	if( c=='_' ){
				466	if( *zString==0 ) return 0;
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	467	SKIP_UTF8(zString);
danielk1977	3f6b087	2004-06-17 05:36:44 +0000	[diff] [blame]	468	}
				469	zPattern++;
				470	}
				471	if( c==0 ) return 1;
				472	c = LOWERCASE(c);
				473	while( (c2=LOWERCASE(*zString))!=0 ){
				474	while( c2 != 0 && c2 != c ){
				475	zString++;
				476	c2 = LOWERCASE(*zString);
				477	}
				478	if( c2==0 ) return 0;
				479	if( sqlite3utf8LikeCompare(&zPattern[1],zString) ) return 1;
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	480	SKIP_UTF8(zString);
danielk1977	3f6b087	2004-06-17 05:36:44 +0000	[diff] [blame]	481	}
				482	return 0;
				483	}
				484	case '_': {
				485	if( *zString==0 ) return 0;
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	486	SKIP_UTF8(zString);
danielk1977	3f6b087	2004-06-17 05:36:44 +0000	[diff] [blame]	487	zPattern++;
				488	break;
				489	}
				490	default: {
				491	if( c != LOWERCASE(*zString) ) return 0;
				492	zPattern++;
				493	zString++;
				494	break;
				495	}
				496	}
				497	}
				498	return *zString==0;
				499	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame^]	500
				501	#ifndef NDEBUG
				502	/*
				503	** This routine is called from the TCL test function "translate_selftest".
				504	** It checks that the primitives for serializing and deserializing
				505	** characters in each encoding are inverses of each other.
				506	*/
				507	void sqlite3utfSelfTest(){
				508	int i;
				509	unsigned char zBuf[20];
				510	unsigned char *z;
				511	int n;
				512	int c;
				513
				514	for(i=0; 0 && i<0x00110000; i++){
				515	z = zBuf;
				516	WRITE_UTF8(z, i);
				517	n = z-zBuf;
				518	z = zBuf;
				519	READ_UTF8(z, c);
				520	assert( c==i );
				521	assert( (z-zBuf)==n );
				522	}
				523	for(i=0; i<0x00110000; i++){
				524	if( i>=0xD800 && i<=0xE000 ) continue;
				525	z = zBuf;
				526	WRITE_UTF16LE(z, i);
				527	n = z-zBuf;
				528	z = zBuf;
				529	READ_UTF16LE(z, c);
				530	assert( c==i );
				531	assert( (z-zBuf)==n );
				532	}
				533	for(i=0; i<0x00110000; i++){
				534	if( i>=0xD800 && i<=0xE000 ) continue;
				535	z = zBuf;
				536	WRITE_UTF16BE(z, i);
				537	n = z-zBuf;
				538	z = zBuf;
				539	READ_UTF16BE(z, c);
				540	assert( c==i );
				541	assert( (z-zBuf)==n );
				542	}
				543	}
				544	#endif
				545
				546