Blame - src/utf.c - chromium.googlesource.com/chromium/deps/sqlite

blob: 16454c2ffd8a3555f8dd4392a79f8b76583e4024 [file] [log] [blame]

drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	1	/*
				2	** 2004 April 13
				3	**
				4	** The author disclaims copyright to this source code. In place of
				5	** a legal notice, here is a blessing:
				6	**
				7	** May you do good and not evil.
				8	** May you find forgiveness for yourself and forgive others.
				9	** May you share freely, never taking more than you give.
				10	**
				11	*************************************************************************
				12	** This file contains routines used to translate between UTF-8,
				13	** UTF-16, UTF-16BE, and UTF-16LE.
				14	**
drh	1b743be	2004-06-22 22:04:46 +0000	[diff] [blame]	15	** $Id: utf.c,v 1.23 2004/06/22 22:04:46 drh Exp $
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	16	**
				17	** Notes on UTF-8:
				18	**
				19	** Byte-0 Byte-1 Byte-2 Byte-3 Value
				20	** 0xxxxxxx 00000000 00000000 0xxxxxxx
				21	** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx
				22	** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx
				23	** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx
				24	**
				25	**
				26	** Notes on UTF-16: (with wwww+1==uuuuu)
				27	**
drh	51846b5	2004-05-28 16:00:21 +0000	[diff] [blame]	28	** Word-0 Word-1 Value
				29	** 110110ww wwzzzzyy 110111yy yyxxxxxx 000uuuuu zzzzyyyy yyxxxxxx
				30	** zzzzyyyy yyxxxxxx 00000000 zzzzyyyy yyxxxxxx
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	31	**
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	32	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	33	** BOM or Byte Order Mark:
				34	** 0xff 0xfe little-endian utf-16 follows
				35	** 0xfe 0xff big-endian utf-16 follows
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	36	**
				37	**
				38	** Handling of malformed strings:
				39	**
				40	** SQLite accepts and processes malformed strings without an error wherever
				41	** possible. However this is not possible when converting between UTF-8 and
				42	** UTF-16.
				43	**
				44	** When converting malformed UTF-8 strings to UTF-16, one instance of the
				45	** replacement character U+FFFD is emitted for each byte that cannot be
				46	** interpreted as part of a valid unicode character.
				47	**
				48	** When converting malformed UTF-16 strings to UTF-8, one instance of the
				49	** replacement character U+FFFD is emitted for each pair of bytes that
				50	** cannot be interpreted as part of a valid unicode character.
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	51	**
				52	** This file contains the following public routines:
				53	**
				54	** sqlite3VdbeMemTranslate() - Translate the encoding used by a Mem* string.
				55	** sqlite3VdbeMemHandleBom() - Handle byte-order-marks in UTF16 Mem* strings.
				56	** sqlite3utf16ByteLen() - Calculate byte-length of a void* UTF16 string.
				57	** sqlite3utf8CharLen() - Calculate char-length of a char* UTF8 string.
				58	** sqlite3utf8LikeCompare() - Do a LIKE match given two UTF8 char* strings.
				59	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	60	*/
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	61	#include <assert.h>
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	62	#include "sqliteInt.h"
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	63	#include "vdbeInt.h"
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	64
				65	/*
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	66	** The following macro, LOWERCASE(x), takes an integer representing a
				67	** unicode code point. The value returned is the same code point folded to
				68	** lower case, if applicable. SQLite currently understands the upper/lower
				69	** case relationship between the 26 characters used in the English
				70	** language only.
				71	**
				72	** This means that characters with umlauts etc. will not be folded
				73	** correctly (unless they are encoded as composite characters, which would
				74	** doubtless cause much trouble).
				75	*/
danielk1977	3f6b087	2004-06-17 05:36:44 +0000	[diff] [blame]	76	#define LOWERCASE(x) (x<91?(int)(UpperToLower[x]):x)
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	77	static unsigned char UpperToLower[91] = {
				78	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
				79	18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
				80	36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
				81	54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
				82	104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
				83	122,
				84	};
				85
				86	/*
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	87	** This table maps from the first byte of a UTF-8 character to the number
				88	** of trailing bytes expected. A value '255' indicates that the table key
				89	** is not a legal first byte for a UTF-8 character.
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	90	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	91	static const u8 xtra_utf8_bytes[256] = {
				92	/* 0xxxxxxx */
				93	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				94	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				95	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				96	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				97	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				98	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				99	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				100	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	101
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	102	/* 10wwwwww */
				103	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
				104	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
				105	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
				106	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
danielk1977	ad7dd42	2004-06-06 12:41:49 +0000	[diff] [blame]	107
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	108	/* 110yyyyy */
				109	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				110	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				111
				112	/* 1110zzzz */
				113	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				114
				115	/* 11110yyy */
				116	3, 3, 3, 3, 3, 3, 3, 3, 255, 255, 255, 255, 255, 255, 255, 255,
				117	};
				118
				119	/*
				120	** This table maps from the number of trailing bytes in a UTF-8 character
				121	** to an integer constant that is effectively calculated for each character
				122	** read by a naive implementation of a UTF-8 character reader. The code
				123	** in the READ_UTF8 macro explains things best.
				124	*/
				125	static const int xtra_utf8_bits[4] = {
				126	0,
				127	12416, /* (0xC0 << 6) + (0x80) */
				128	925824, /* (0xE0 << 12) + (0x80 << 6) + (0x80) */
				129	63447168 /* (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
				130	};
				131
				132	#define READ_UTF8(zIn, c) { \
				133	int xtra; \
				134	c = *(zIn)++; \
				135	xtra = xtra_utf8_bytes[c]; \
				136	switch( xtra ){ \
				137	case 255: c = (int)0xFFFD; break; \
				138	case 3: c = (c<<6) + *(zIn)++; \
				139	case 2: c = (c<<6) + *(zIn)++; \
				140	case 1: c = (c<<6) + *(zIn)++; \
				141	c -= xtra_utf8_bits[xtra]; \
				142	} \
				143	}
				144
				145	#define SKIP_UTF8(zIn) { \
				146	zIn += (xtra_utf8_bytes[(u8 )zIn] + 1); \
				147	}
				148
				149	#define WRITE_UTF8(zOut, c) { \
				150	if( c<0x00080 ){ \
				151	*zOut++ = (c&0xFF); \
				152	} \
				153	else if( c<0x00800 ){ \
				154	*zOut++ = 0xC0 + ((c>>6)&0x1F); \
				155	*zOut++ = 0x80 + (c & 0x3F); \
				156	} \
				157	else if( c<0x10000 ){ \
				158	*zOut++ = 0xE0 + ((c>>12)&0x0F); \
				159	*zOut++ = 0x80 + ((c>>6) & 0x3F); \
				160	*zOut++ = 0x80 + (c & 0x3F); \
				161	}else{ \
				162	*zOut++ = 0xF0 + ((c>>18) & 0x07); \
				163	*zOut++ = 0x80 + ((c>>12) & 0x3F); \
				164	*zOut++ = 0x80 + ((c>>6) & 0x3F); \
				165	*zOut++ = 0x80 + (c & 0x3F); \
				166	} \
				167	}
				168
				169	#define WRITE_UTF16LE(zOut, c) { \
				170	if( c<=0xFFFF ){ \
				171	*zOut++ = (c&0x00FF); \
				172	*zOut++ = ((c>>8)&0x00FF); \
				173	}else{ \
				174	*zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \
				175	*zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \
				176	*zOut++ = (c&0x00FF); \
				177	*zOut++ = (0x00DC + ((c>>8)&0x03)); \
				178	} \
				179	}
				180
				181	#define WRITE_UTF16BE(zOut, c) { \
				182	if( c<=0xFFFF ){ \
				183	*zOut++ = ((c>>8)&0x00FF); \
				184	*zOut++ = (c&0x00FF); \
				185	}else{ \
				186	*zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \
				187	*zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \
				188	*zOut++ = (0x00DC + ((c>>8)&0x03)); \
				189	*zOut++ = (c&0x00FF); \
				190	} \
				191	}
				192
				193	#define READ_UTF16LE(zIn, c){ \
				194	c = (*zIn++); \
				195	c += ((*zIn++)<<8); \
				196	if( c>=0xD800 && c<=0xE000 ){ \
				197	int c2 = (*zIn++); \
				198	c2 += ((*zIn++)<<8); \
				199	c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
				200	} \
				201	}
				202
				203	#define READ_UTF16BE(zIn, c){ \
				204	c = ((*zIn++)<<8); \
				205	c += (*zIn++); \
				206	if( c>=0xD800 && c<=0xE000 ){ \
				207	int c2 = ((*zIn++)<<8); \
				208	c2 += (*zIn++); \
				209	c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
				210	} \
				211	}
				212
				213	/*
				214	** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
				215	** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
				216	*/
				217	/* #define TRANSLATE_TRACE 1 */
				218
				219	/*
				220	** This routine transforms the internal text encoding used by pMem to
				221	** desiredEnc. It is an error if the string is already of the desired
				222	** encoding, or if *pMem does not contain a string value.
				223	*/
				224	int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
				225	unsigned char zShort[NBFS]; /* Temporary short output buffer */
				226	int len; /* Maximum length of output string in bytes */
				227	unsigned char zOut; / Output buffer */
				228	unsigned char zIn; / Input iterator */
				229	unsigned char zTerm; / End of input */
				230	unsigned char z; / Output iterator */
				231	int c;
				232
				233	assert( pMem->flags&MEM_Str );
				234	assert( pMem->enc!=desiredEnc );
				235	assert( pMem->enc!=0 );
				236	assert( pMem->n>=0 );
				237
				238	#ifdef TRANSLATE_TRACE
				239	{
				240	char zBuf[100];
				241	sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
				242	fprintf(stderr, "INPUT: %s\n", zBuf);
danielk1977	ad7dd42	2004-06-06 12:41:49 +0000	[diff] [blame]	243	}
				244	#endif
				245
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	246	/* If the translation is between UTF-16 little and big endian, then
				247	** all that is required is to swap the byte order. This case is handled
				248	** differently from the others.
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	249	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	250	if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
				251	u8 temp;
				252	sqlite3VdbeMemMakeWriteable(pMem);
				253	zIn = pMem->z;
				254	zTerm = &zIn[pMem->n];
				255	while( zIn<zTerm ){
				256	temp = *zIn;
				257	zIn = (zIn+1);
				258	zIn++;
				259	*zIn++ = temp;
				260	}
				261	pMem->enc = desiredEnc;
				262	goto translate_out;
				263	}
				264
				265	/* Set zIn to point at the start of the input buffer and zTerm to point 1
				266	** byte past the end.
				267	**
				268	** Variable zOut is set to point at the output buffer. This may be space
				269	** obtained from malloc(), or Mem.zShort, if it large enough and not in
				270	** use, or the zShort array on the stack (see above).
				271	*/
				272	zIn = pMem->z;
				273	zTerm = &zIn[pMem->n];
				274	len = pMem->n*2 + 2;
				275	if( len>NBFS ){
				276	zOut = sqliteMallocRaw(len);
				277	if( !zOut ) return SQLITE_NOMEM;
				278	}else{
				279	if( pMem->z==pMem->zShort ){
				280	zOut = zShort;
				281	}else{
				282	zOut = pMem->zShort;
				283	}
				284	}
				285	z = zOut;
				286
				287	if( pMem->enc==SQLITE_UTF8 ){
				288	if( desiredEnc==SQLITE_UTF16LE ){
				289	/* UTF-8 -> UTF-16 Little-endian */
				290	while( zIn<zTerm ){
				291	READ_UTF8(zIn, c);
				292	WRITE_UTF16LE(z, c);
				293	}
				294	WRITE_UTF16LE(z, 0);
				295	pMem->n = (z-zOut)-2;
				296	}else if( desiredEnc==SQLITE_UTF16BE ){
				297	/* UTF-8 -> UTF-16 Big-endian */
				298	while( zIn<zTerm ){
				299	READ_UTF8(zIn, c);
				300	WRITE_UTF16BE(z, c);
				301	}
				302	WRITE_UTF16BE(z, 0);
				303	pMem->n = (z-zOut)-2;
				304	}
				305	}else{
				306	assert( desiredEnc==SQLITE_UTF8 );
				307	if( pMem->enc==SQLITE_UTF16LE ){
				308	/* UTF-16 Little-endian -> UTF-8 */
				309	while( zIn<zTerm ){
				310	READ_UTF16LE(zIn, c);
				311	WRITE_UTF8(z, c);
				312	}
				313	WRITE_UTF8(z, 0);
				314	pMem->n = (z-zOut)-1;
				315	}else{
				316	/* UTF-16 Little-endian -> UTF-8 */
				317	while( zIn<zTerm ){
				318	READ_UTF16BE(zIn, c);
				319	WRITE_UTF8(z, c);
				320	}
				321	WRITE_UTF8(z, 0);
				322	pMem->n = (z-zOut)-1;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	323	}
drh	1b743be	2004-06-22 22:04:46 +0000	[diff] [blame]	324	assert( pMem->n+1<=len );
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	325	}
				326
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	327	sqlite3VdbeMemRelease(pMem);
				328	pMem->flags &= ~(MEM_Static\|MEM_Dyn\|MEM_Ephem\|MEM_Short);
				329	pMem->enc = desiredEnc;
				330	if( (char *)zOut==pMem->zShort ){
				331	pMem->flags \|= (MEM_Term\|MEM_Short);
				332	}else if( zOut==zShort ){
				333	memcpy(pMem->zShort, zOut, len);
				334	zOut = pMem->zShort;
				335	pMem->flags \|= (MEM_Term\|MEM_Short);
				336	}else{
				337	pMem->flags \|= (MEM_Term\|MEM_Dyn);
				338	}
				339	pMem->z = zOut;
				340
				341	translate_out:
				342	#ifdef TRANSLATE_TRACE
				343	{
				344	char zBuf[100];
				345	sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
				346	fprintf(stderr, "OUTPUT: %s\n", zBuf);
				347	}
				348	#endif
				349	return SQLITE_OK;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	350	}
				351
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	352	/*
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	353	** This routine checks for a byte-order mark at the beginning of the
				354	** UTF-16 string stored in *pMem. If one is present, it is removed and
				355	** the encoding of the Mem adjusted. This routine does not do any
				356	** byte-swapping, it just sets Mem.enc appropriately.
				357	**
				358	** The allocation (static, dynamic etc.) and encoding of the Mem may be
				359	** changed by this function.
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	360	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	361	int sqlite3VdbeMemHandleBom(Mem *pMem){
				362	int rc = SQLITE_OK;
				363	u8 bom = 0;
				364
				365	if( pMem->n<0 \|\| pMem->n>1 ){
				366	u8 b1 = (u8 )pMem->z;
				367	u8 b2 = (((u8 )pMem->z) + 1);
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	368	if( b1==0xFE && b2==0xFF ){
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	369	bom = SQLITE_UTF16BE;
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	370	}
				371	if( b1==0xFF && b2==0xFE ){
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	372	bom = SQLITE_UTF16LE;
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	373	}
				374	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	375
				376	if( bom ){
				377	if( pMem->flags & MEM_Short ){
				378	memmove(pMem->zShort, &pMem->zShort[2], NBFS-2);
				379	pMem->n -= 2;
				380	pMem->enc = bom;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	381	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	382	else if( pMem->flags & MEM_Dyn ){
				383	void (xDel)(void) = pMem->xDel;
				384	char *z = pMem->z;
				385	pMem->z = 0;
				386	pMem->xDel = 0;
				387	rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT);
				388	if( xDel ){
				389	xDel(z);
				390	}else{
				391	sqliteFree(z);
				392	}
				393	}else{
				394	rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom,
				395	SQLITE_TRANSIENT);
				396	}
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	397	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	398	return rc;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	399	}
				400
				401	/*
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	402	** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
				403	** return the number of unicode characters in pZ up to (but not including)
				404	** the first 0x00 byte. If nByte is not less than zero, return the
				405	** number of unicode characters in the first nByte of pZ (or up to
				406	** the first 0x00, whichever comes first).
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	407	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	408	int sqlite3utf8CharLen(const char *z, int nByte){
				409	int r = 0;
				410	const char *zTerm;
				411	if( nByte>0 ){
				412	zTerm = &z[nByte];
				413	}else{
				414	zTerm = (const char *)(-1);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	415	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	416	assert( z<=zTerm );
				417	while( *z!=0 && z<zTerm ){
				418	SKIP_UTF8(z);
				419	r++;
				420	}
				421	return r;
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	422	}
				423
				424	/*
				425	** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
				426	** return the number of bytes up to (but not including), the first pair
				427	** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
				428	** then return the number of bytes in the first nChar unicode characters
				429	** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
				430	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	431	int sqlite3utf16ByteLen(const void *zIn, int nChar){
				432	int c = 1;
				433	char const *z = zIn;
				434	int n = 0;
				435	if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
				436	while( c && ((nChar<0) \|\| n<nChar) ){
				437	READ_UTF16BE(z, c);
				438	n++;
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	439	}
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	440	}else{
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	441	while( c && ((nChar<0) \|\| n<nChar) ){
				442	READ_UTF16LE(z, c);
				443	n++;
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	444	}
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	445	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	446	return (z-(char const *)zIn)-((c==0)?2:0);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	447	}
				448
/*
** Compare two UTF-8 strings for equality using the "LIKE" operator of
** SQL. The '%' character matches any sequence of 0 or more
** characters and '_' matches any single character. Case is
** not significant (see LOWERCASE for which characters are folded).
**
** Returns 1 if zString matches zPattern and 0 otherwise. A '%' that is
** followed by more pattern is resolved by recursing on the rest of the
** pattern at each candidate position in zString.
*/
int sqlite3utf8LikeCompare(
  const unsigned char *zPattern,
  const unsigned char *zString
){
  int cPat;       /* Current (folded) pattern byte */
  int cStr;       /* Current (folded) string byte */

  for(;;){
    cPat = LOWERCASE(*zPattern);
    if( cPat==0 ) break;

    if( cPat=='%' ){
      /* Absorb any run of '%' and '_' that follows. Each '_' in the run
      ** still has to consume one character of the string. */
      for(;;){
        cPat = zPattern[1];
        if( cPat!='%' && cPat!='_' ) break;
        if( cPat=='_' ){
          if( *zString==0 ) return 0;
          SKIP_UTF8(zString);
        }
        zPattern++;
      }

      /* A trailing '%' matches whatever is left of the string. */
      if( cPat==0 ) return 1;

      /* Otherwise look for each place where the byte that follows the '%'
      ** occurs in the string and try to match the rest of the pattern
      ** from there. */
      cPat = LOWERCASE(cPat);
      for(;;){
        cStr = LOWERCASE(*zString);
        if( cStr==0 ) break;
        while( cStr!=0 && cStr!=cPat ){
          zString++;
          cStr = LOWERCASE(*zString);
        }
        if( cStr==0 ) return 0;
        if( sqlite3utf8LikeCompare(&zPattern[1], zString) ) return 1;
        SKIP_UTF8(zString);
      }
      return 0;
    }

    if( cPat=='_' ){
      /* Exactly one character, whatever it is. */
      if( *zString==0 ) return 0;
      SKIP_UTF8(zString);
      zPattern++;
      continue;
    }

    /* Any other pattern byte must equal the string byte after folding. */
    if( cPat!=LOWERCASE(*zString) ) return 0;
    zPattern++;
    zString++;
  }

  /* Pattern exhausted: it is a match only if the string is exhausted too. */
  return *zString==0;
}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	501
#ifdef SQLITE_TEST
/*
** This routine is called from the TCL test function "translate_selftest".
** It checks that the primitives for serializing and deserializing
** characters in each encoding are inverses of each other: every code
** point is written into a scratch buffer, read back, and both the decoded
** value and the number of bytes consumed are compared with what was
** written.
**
** The UTF-8 loop is switched off by the constant 0 in its loop condition.
** The UTF-16 loops skip the code points 0xD800 through 0xE000 inclusive.
*/
void sqlite3utfSelfTest(){
  int cp;                    /* Code point under test */
  unsigned char aBuf[20];    /* Scratch buffer for one encoded character */
  unsigned char *p;          /* Cursor into aBuf */
  int nWritten;              /* Bytes produced by the writer */
  int decoded;               /* Code point produced by the reader */

  /* UTF-8 (disabled) */
  for(cp=0; 0 && cp<0x00110000; cp++){
    p = aBuf;
    WRITE_UTF8(p, cp);
    nWritten = p-aBuf;
    p = aBuf;
    READ_UTF8(p, decoded);
    assert( decoded==cp );
    assert( (p-aBuf)==nWritten );
  }

  /* UTF-16 little-endian */
  for(cp=0; cp<0x00110000; cp++){
    if( cp<0xD800 || cp>0xE000 ){
      p = aBuf;
      WRITE_UTF16LE(p, cp);
      nWritten = p-aBuf;
      p = aBuf;
      READ_UTF16LE(p, decoded);
      assert( decoded==cp );
      assert( (p-aBuf)==nWritten );
    }
  }

  /* UTF-16 big-endian */
  for(cp=0; cp<0x00110000; cp++){
    if( cp<0xD800 || cp>0xE000 ){
      p = aBuf;
      WRITE_UTF16BE(p, cp);
      nWritten = p-aBuf;
      p = aBuf;
      READ_UTF16BE(p, decoded);
      assert( decoded==cp );
      assert( (p-aBuf)==nWritten );
    }
  }
}
#endif