Blame - src/utf.c - chromium.googlesource.com/chromium/deps/sqlite

blob: 5fc37b5b5866f35cd7aead3ef8937f67bf88939c [file] [log] [blame]

drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	1	/*
				2	** 2004 April 13
				3	**
				4	** The author disclaims copyright to this source code. In place of
				5	** a legal notice, here is a blessing:
				6	**
				7	** May you do good and not evil.
				8	** May you find forgiveness for yourself and forgive others.
				9	** May you share freely, never taking more than you give.
				10	**
				11	*************************************************************************
				12	** This file contains routines used to translate between UTF-8,
				13	** UTF-16, UTF-16BE, and UTF-16LE.
				14	**
danielk1977	f461889	2004-06-28 13:09:11 +0000	[diff] [blame^]	15	** $Id: utf.c,v 1.26 2004/06/28 13:09:11 danielk1977 Exp $
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	16	**
				17	** Notes on UTF-8:
				18	**
				19	**     Byte-0    Byte-1    Byte-2    Byte-3     Value
				20	**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
				21	**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
				22	**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
				23	**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
				24	**
				25	**
				26	** Notes on UTF-16: (with wwww+1==uuuuu)
				27	**
drh	51846b5	2004-05-28 16:00:21 +0000	[diff] [blame]	28	**      Word-0               Word-1          Value
				29	**  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
				30	**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	31	**
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	32	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	33	** BOM or Byte Order Mark:
				34	** 0xff 0xfe little-endian utf-16 follows
				35	** 0xfe 0xff big-endian utf-16 follows
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	36	**
				37	**
				38	** Handling of malformed strings:
				39	**
				40	** SQLite accepts and processes malformed strings without an error wherever
				41	** possible. However this is not possible when converting between UTF-8 and
				42	** UTF-16.
				43	**
				44	** When converting malformed UTF-8 strings to UTF-16, one instance of the
				45	** replacement character U+FFFD is emitted for each byte that cannot be
				46	** interpreted as part of a valid unicode character.
				47	**
				48	** When converting malformed UTF-16 strings to UTF-8, one instance of the
				49	** replacement character U+FFFD is emitted for each pair of bytes that
				50	** cannot be interpreted as part of a valid unicode character.
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	51	**
				52	** This file contains the following public routines:
				53	**
				54	** sqlite3VdbeMemTranslate() - Translate the encoding used by a Mem* string.
				55	** sqlite3VdbeMemHandleBom() - Handle byte-order-marks in UTF16 Mem* strings.
				56	** sqlite3utf16ByteLen() - Calculate byte-length of a void* UTF16 string.
				57	** sqlite3utf8CharLen() - Calculate char-length of a char* UTF8 string.
				58	** sqlite3utf8LikeCompare() - Do a LIKE match given two UTF8 char* strings.
				59	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	60	*/
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	61	#include <assert.h>
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	62	#include "sqliteInt.h"
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	63	#include "vdbeInt.h"
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	64
				65	/*
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	66	** The following macro, LOWERCASE(x), takes an integer representing a
				67	** unicode code point. The value returned is the same code point folded to
				68	** lower case, if applicable. SQLite currently understands the upper/lower
				69	** case relationship between the 26 characters used in the English
				70	** language only.
				71	**
				72	** This means that characters with umlauts etc. will not be folded
				73	** correctly (unless they are encoded as composite characters, which would
				74	** doubtless cause much trouble).
				75	*/
danielk1977	3f6b087	2004-06-17 05:36:44 +0000	[diff] [blame]	76	#define LOWERCASE(x) (x<91?(int)(UpperToLower[x]):x)
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	77	static unsigned char UpperToLower[91] = {
				78	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
				79	18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
				80	36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
				81	54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
				82	104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
				83	122,
				84	};
				85
				86	/*
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	87	** This table maps from the first byte of a UTF-8 character to the number
				88	** of trailing bytes expected. A value '255' indicates that the table key
				89	** is not a legal first byte for a UTF-8 character.
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	90	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	91	static const u8 xtra_utf8_bytes[256] = {
				92	/* 0xxxxxxx */
				93	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				94	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				95	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				96	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				97	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				98	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				99	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				100	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
danielk1977	d02eb1f	2004-06-06 09:44:03 +0000	[diff] [blame]	101
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	102	/* 10wwwwww */
				103	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
				104	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
				105	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
				106	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
danielk1977	ad7dd42	2004-06-06 12:41:49 +0000	[diff] [blame]	107
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	108	/* 110yyyyy */
				109	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				110	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				111
				112	/* 1110zzzz */
				113	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				114
				115	/* 11110yyy */
				116	3, 3, 3, 3, 3, 3, 3, 3, 255, 255, 255, 255, 255, 255, 255, 255,
				117	};
				118
				119	/*
				120	** This table maps from the number of trailing bytes in a UTF-8 character
				121	** to an integer constant that is effectively calculated for each character
				122	** read by a naive implementation of a UTF-8 character reader. The code
				123	** in the READ_UTF8 macro explains things best.
				124	*/
				125	static const int xtra_utf8_bits[4] = {
				126	0,
				127	12416, /* (0xC0 << 6) + (0x80) */
				128	925824, /* (0xE0 << 12) + (0x80 << 6) + (0x80) */
				129	63447168 /* (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
				130	};
				131
				132	#define READ_UTF8(zIn, c) { \
				133	int xtra; \
				134	c = *(zIn)++; \
				135	xtra = xtra_utf8_bytes[c]; \
				136	switch( xtra ){ \
				137	case 255: c = (int)0xFFFD; break; \
				138	case 3: c = (c<<6) + *(zIn)++; \
				139	case 2: c = (c<<6) + *(zIn)++; \
				140	case 1: c = (c<<6) + *(zIn)++; \
				141	c -= xtra_utf8_bits[xtra]; \
				142	} \
				143	}
				144
				145	#define SKIP_UTF8(zIn) { \
				146	zIn += (xtra_utf8_bytes[(u8 )zIn] + 1); \
				147	}
				148
				149	#define WRITE_UTF8(zOut, c) { \
				150	if( c<0x00080 ){ \
				151	*zOut++ = (c&0xFF); \
				152	} \
				153	else if( c<0x00800 ){ \
				154	*zOut++ = 0xC0 + ((c>>6)&0x1F); \
				155	*zOut++ = 0x80 + (c & 0x3F); \
				156	} \
				157	else if( c<0x10000 ){ \
				158	*zOut++ = 0xE0 + ((c>>12)&0x0F); \
				159	*zOut++ = 0x80 + ((c>>6) & 0x3F); \
				160	*zOut++ = 0x80 + (c & 0x3F); \
				161	}else{ \
				162	*zOut++ = 0xF0 + ((c>>18) & 0x07); \
				163	*zOut++ = 0x80 + ((c>>12) & 0x3F); \
				164	*zOut++ = 0x80 + ((c>>6) & 0x3F); \
				165	*zOut++ = 0x80 + (c & 0x3F); \
				166	} \
				167	}
				168
				169	#define WRITE_UTF16LE(zOut, c) { \
				170	if( c<=0xFFFF ){ \
				171	*zOut++ = (c&0x00FF); \
				172	*zOut++ = ((c>>8)&0x00FF); \
				173	}else{ \
				174	*zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \
				175	*zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \
				176	*zOut++ = (c&0x00FF); \
				177	*zOut++ = (0x00DC + ((c>>8)&0x03)); \
				178	} \
				179	}
				180
				181	#define WRITE_UTF16BE(zOut, c) { \
				182	if( c<=0xFFFF ){ \
				183	*zOut++ = ((c>>8)&0x00FF); \
				184	*zOut++ = (c&0x00FF); \
				185	}else{ \
				186	*zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \
				187	*zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \
				188	*zOut++ = (0x00DC + ((c>>8)&0x03)); \
				189	*zOut++ = (c&0x00FF); \
				190	} \
				191	}
				192
				193	#define READ_UTF16LE(zIn, c){ \
				194	c = (*zIn++); \
				195	c += ((*zIn++)<<8); \
				196	if( c>=0xD800 && c<=0xE000 ){ \
				197	int c2 = (*zIn++); \
				198	c2 += ((*zIn++)<<8); \
				199	c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
				200	} \
				201	}
				202
				203	#define READ_UTF16BE(zIn, c){ \
				204	c = ((*zIn++)<<8); \
				205	c += (*zIn++); \
				206	if( c>=0xD800 && c<=0xE000 ){ \
				207	int c2 = ((*zIn++)<<8); \
				208	c2 += (*zIn++); \
				209	c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
				210	} \
				211	}
				212
/*
** SKIP_UTF16BE(zIn) and SKIP_UTF16LE(zIn): advance zIn past one character
** of big-endian or little-endian UTF-16 text without decoding it.
**
** The high byte of the code unit at zIn is examined (byte 0 for
** big-endian, byte 1 for little-endian).  If it lies in 0xD8..0xDF the
** code unit is a surrogate and the character is taken to occupy 4 bytes;
** otherwise it occupies 2 bytes.  U+E000 (high byte 0xE0) is an ordinary
** 2-byte character.
**
** At least two bytes must be readable at zIn.
*/
#define SKIP_UTF16BE(zIn) do{                                         \
  unsigned char skipHi = ((const unsigned char *)(zIn))[0];           \
  (zIn) += ((skipHi>=0xD8 && skipHi<0xE0) ? 4 : 2);                   \
}while(0)
#define SKIP_UTF16LE(zIn) do{                                         \
  unsigned char skipHi = ((const unsigned char *)(zIn))[1];           \
  (zIn) += ((skipHi>=0xD8 && skipHi<0xE0) ? 4 : 2);                   \
}while(0)
				228
				229	#define RSKIP_UTF16LE(zIn){ \
				230	if( zIn>=0xD8 && (zIn<0xE0 \|\| (zIn==0xE0 && (zIn-1)==0x00)) ){ \
				231	zIn -= 4; \
				232	}else{ \
				233	zIn -= 2; \
				234	} \
				235	}
				236	#define RSKIP_UTF16BE(zIn){ \
				237	zIn--; \
				238	if( zIn>=0xD8 && (zIn<0xE0 \|\| (zIn==0xE0 && (zIn+1)==0x00)) ){ \
				239	zIn -= 3; \
				240	}else{ \
				241	zIn -= 1; \
				242	} \
				243	}
				244
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	245	/*
				246	** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
				247	** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
				248	*/
				249	/* #define TRANSLATE_TRACE 1 */
				250
				251	/*
				252	** This routine transforms the internal text encoding used by pMem to
				253	** desiredEnc. It is an error if the string is already of the desired
				254	** encoding, or if *pMem does not contain a string value.
				255	*/
				256	int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
				257	unsigned char zShort[NBFS]; /* Temporary short output buffer */
				258	int len; /* Maximum length of output string in bytes */
				259	unsigned char zOut; / Output buffer */
				260	unsigned char zIn; / Input iterator */
				261	unsigned char zTerm; / End of input */
				262	unsigned char z; / Output iterator */
				263	int c;
				264
				265	assert( pMem->flags&MEM_Str );
				266	assert( pMem->enc!=desiredEnc );
				267	assert( pMem->enc!=0 );
				268	assert( pMem->n>=0 );
				269
				270	#ifdef TRANSLATE_TRACE
				271	{
				272	char zBuf[100];
				273	sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
				274	fprintf(stderr, "INPUT: %s\n", zBuf);
danielk1977	ad7dd42	2004-06-06 12:41:49 +0000	[diff] [blame]	275	}
				276	#endif
				277
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	278	/* If the translation is between UTF-16 little and big endian, then
				279	** all that is required is to swap the byte order. This case is handled
				280	** differently from the others.
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	281	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	282	if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
				283	u8 temp;
				284	sqlite3VdbeMemMakeWriteable(pMem);
				285	zIn = pMem->z;
				286	zTerm = &zIn[pMem->n];
				287	while( zIn<zTerm ){
				288	temp = *zIn;
				289	zIn = (zIn+1);
				290	zIn++;
				291	*zIn++ = temp;
				292	}
				293	pMem->enc = desiredEnc;
				294	goto translate_out;
				295	}
				296
danielk1977	d7e6964	2004-06-23 00:23:49 +0000	[diff] [blame]	297	/* Set len to the maximum number of bytes required in the output buffer. */
				298	if( desiredEnc==SQLITE_UTF8 ){
				299	/* When converting from UTF-16, the maximum growth results from
				300	** translating a 2-byte character to a 3-byte UTF-8 character (i.e.
				301	** code-point 0xFFFC). A single byte is required for the output string
				302	** nul-terminator.
				303	*/
				304	len = (pMem->n/2) * 3 + 1;
				305	}else{
				306	/* When converting from UTF-8 to UTF-16 the maximum growth is caused
				307	** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
				308	** character. Two bytes are required in the output buffer for the
				309	** nul-terminator.
				310	*/
				311	len = pMem->n * 2 + 2;
				312	}
				313
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	314	/* Set zIn to point at the start of the input buffer and zTerm to point 1
				315	** byte past the end.
				316	**
				317	** Variable zOut is set to point at the output buffer. This may be space
				318	** obtained from malloc(), or Mem.zShort, if it large enough and not in
				319	** use, or the zShort array on the stack (see above).
				320	*/
				321	zIn = pMem->z;
				322	zTerm = &zIn[pMem->n];
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	323	if( len>NBFS ){
				324	zOut = sqliteMallocRaw(len);
				325	if( !zOut ) return SQLITE_NOMEM;
				326	}else{
danielk1977	1ba1b55	2004-06-23 13:46:32 +0000	[diff] [blame]	327	zOut = zShort;
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	328	}
				329	z = zOut;
				330
				331	if( pMem->enc==SQLITE_UTF8 ){
				332	if( desiredEnc==SQLITE_UTF16LE ){
				333	/* UTF-8 -> UTF-16 Little-endian */
				334	while( zIn<zTerm ){
				335	READ_UTF8(zIn, c);
				336	WRITE_UTF16LE(z, c);
				337	}
				338	WRITE_UTF16LE(z, 0);
				339	pMem->n = (z-zOut)-2;
				340	}else if( desiredEnc==SQLITE_UTF16BE ){
				341	/* UTF-8 -> UTF-16 Big-endian */
				342	while( zIn<zTerm ){
				343	READ_UTF8(zIn, c);
				344	WRITE_UTF16BE(z, c);
				345	}
				346	WRITE_UTF16BE(z, 0);
				347	pMem->n = (z-zOut)-2;
				348	}
				349	}else{
				350	assert( desiredEnc==SQLITE_UTF8 );
				351	if( pMem->enc==SQLITE_UTF16LE ){
				352	/* UTF-16 Little-endian -> UTF-8 */
				353	while( zIn<zTerm ){
				354	READ_UTF16LE(zIn, c);
				355	WRITE_UTF8(z, c);
				356	}
				357	WRITE_UTF8(z, 0);
				358	pMem->n = (z-zOut)-1;
				359	}else{
				360	/* UTF-16 Little-endian -> UTF-8 */
				361	while( zIn<zTerm ){
				362	READ_UTF16BE(zIn, c);
				363	WRITE_UTF8(z, c);
				364	}
				365	WRITE_UTF8(z, 0);
				366	pMem->n = (z-zOut)-1;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	367	}
				368	}
danielk1977	d7e6964	2004-06-23 00:23:49 +0000	[diff] [blame]	369	assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	370
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	371	sqlite3VdbeMemRelease(pMem);
				372	pMem->flags &= ~(MEM_Static\|MEM_Dyn\|MEM_Ephem\|MEM_Short);
				373	pMem->enc = desiredEnc;
danielk1977	1ba1b55	2004-06-23 13:46:32 +0000	[diff] [blame]	374	if( zOut==zShort ){
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	375	memcpy(pMem->zShort, zOut, len);
				376	zOut = pMem->zShort;
				377	pMem->flags \|= (MEM_Term\|MEM_Short);
				378	}else{
				379	pMem->flags \|= (MEM_Term\|MEM_Dyn);
				380	}
				381	pMem->z = zOut;
				382
				383	translate_out:
				384	#ifdef TRANSLATE_TRACE
				385	{
				386	char zBuf[100];
				387	sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
				388	fprintf(stderr, "OUTPUT: %s\n", zBuf);
				389	}
				390	#endif
				391	return SQLITE_OK;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	392	}
				393
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	394	/*
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	395	** This routine checks for a byte-order mark at the beginning of the
				396	** UTF-16 string stored in *pMem. If one is present, it is removed and
				397	** the encoding of the Mem adjusted. This routine does not do any
				398	** byte-swapping, it just sets Mem.enc appropriately.
				399	**
				400	** The allocation (static, dynamic etc.) and encoding of the Mem may be
				401	** changed by this function.
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	402	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	403	int sqlite3VdbeMemHandleBom(Mem *pMem){
				404	int rc = SQLITE_OK;
				405	u8 bom = 0;
				406
				407	if( pMem->n<0 \|\| pMem->n>1 ){
				408	u8 b1 = (u8 )pMem->z;
				409	u8 b2 = (((u8 )pMem->z) + 1);
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	410	if( b1==0xFE && b2==0xFF ){
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	411	bom = SQLITE_UTF16BE;
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	412	}
				413	if( b1==0xFF && b2==0xFE ){
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	414	bom = SQLITE_UTF16LE;
danielk1977	93d4675	2004-05-23 13:30:58 +0000	[diff] [blame]	415	}
				416	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	417
				418	if( bom ){
danielk1977	1ba1b55	2004-06-23 13:46:32 +0000	[diff] [blame]	419	/* This function is called as soon as a string is stored in a Mem*,
				420	** from within sqlite3VdbeMemSetStr(). At that point it is not possible
				421	** for the string to be stored in Mem.zShort, or for it to be stored
				422	** in dynamic memory with no destructor.
				423	*/
				424	assert( !(pMem->flags&MEM_Short) );
				425	assert( !(pMem->flags&MEM_Dyn) \|\| pMem->xDel );
				426	if( pMem->flags & MEM_Dyn ){
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	427	void (xDel)(void) = pMem->xDel;
				428	char *z = pMem->z;
				429	pMem->z = 0;
				430	pMem->xDel = 0;
				431	rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT);
danielk1977	1ba1b55	2004-06-23 13:46:32 +0000	[diff] [blame]	432	xDel(z);
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	433	}else{
				434	rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom,
				435	SQLITE_TRANSIENT);
				436	}
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	437	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	438	return rc;
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	439	}
				440
				441	/*
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	442	** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
				443	** return the number of unicode characters in pZ up to (but not including)
				444	** the first 0x00 byte. If nByte is not less than zero, return the
				445	** number of unicode characters in the first nByte of pZ (or up to
				446	** the first 0x00, whichever comes first).
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	447	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	448	int sqlite3utf8CharLen(const char *z, int nByte){
				449	int r = 0;
				450	const char *zTerm;
danielk1977	1ba1b55	2004-06-23 13:46:32 +0000	[diff] [blame]	451	if( nByte>=0 ){
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	452	zTerm = &z[nByte];
				453	}else{
				454	zTerm = (const char *)(-1);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	455	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	456	assert( z<=zTerm );
				457	while( *z!=0 && z<zTerm ){
				458	SKIP_UTF8(z);
				459	r++;
				460	}
				461	return r;
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	462	}
				463
				464	/*
				465	** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
				466	** return the number of bytes up to (but not including), the first pair
				467	** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
				468	** then return the number of bytes in the first nChar unicode characters
				469	** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
				470	*/
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	471	int sqlite3utf16ByteLen(const void *zIn, int nChar){
				472	int c = 1;
				473	char const *z = zIn;
				474	int n = 0;
				475	if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
				476	while( c && ((nChar<0) \|\| n<nChar) ){
				477	READ_UTF16BE(z, c);
				478	n++;
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	479	}
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	480	}else{
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	481	while( c && ((nChar<0) \|\| n<nChar) ){
				482	READ_UTF16LE(z, c);
				483	n++;
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	484	}
danielk1977	6622cce	2004-05-20 11:00:52 +0000	[diff] [blame]	485	}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	486	return (z-(char const *)zIn)-((c==0)?2:0);
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame]	487	}
				488
/*
** Compare two UTF-8 strings for equality using the "LIKE" operator of
** SQL.  The '%' character matches any sequence of 0 or more
** characters and '_' matches any single character.  Case is
** not significant (as far as LOWERCASE() folds it).
**
** Returns 1 if zString matches zPattern and 0 otherwise.
**
** Literal pattern characters are compared one byte at a time, while '_'
** and the retry step after '%' advance over a whole UTF-8 character using
** SKIP_UTF8.  A '%' is handled by trying the remainder of the pattern,
** recursively, at every position in zString that starts with the next
** literal pattern byte.
*/
int sqlite3utf8LikeCompare(
  const unsigned char *zPattern,
  const unsigned char *zString
){
  register int c;
  int c2;

  while( (c = LOWERCASE(*zPattern))!=0 ){
    switch( c ){
      case '%': {
        /* Collapse any run of '%' and '_' that follows: each '_' consumes
        ** one character of zString, each extra '%' adds nothing. */
        while( (c=zPattern[1]) == '%' || c == '_' ){
          if( c=='_' ){
            if( *zString==0 ) return 0;
            SKIP_UTF8(zString);
          }
          zPattern++;
        }
        /* A trailing '%' matches whatever is left of zString. */
        if( c==0 ) return 1;
        c = LOWERCASE(c);
        while( (c2=LOWERCASE(*zString))!=0 ){
          /* Scan forward to the next byte equal to the literal that
          ** follows the '%'. */
          while( c2 != 0 && c2 != c ){
            zString++;
            c2 = LOWERCASE(*zString);
          }
          if( c2==0 ) return 0;
          if( sqlite3utf8LikeCompare(&zPattern[1],zString) ) return 1;
          SKIP_UTF8(zString);
        }
        return 0;
      }
      case '_': {
        if( *zString==0 ) return 0;
        SKIP_UTF8(zString);
        zPattern++;
        break;
      }
      default: {
        if( c != LOWERCASE(*zString) ) return 0;
        zPattern++;
        zString++;
        break;
      }
    }
  }
  /* The pattern is exhausted: it matches only if zString is too. */
  return *zString==0;
}
danielk1977	bfd6cce	2004-06-18 04:24:54 +0000	[diff] [blame]	541
danielk1977	f461889	2004-06-28 13:09:11 +0000	[diff] [blame^]	542	/*
				543	** UTF-16 implementation of the substr()
				544	*/
				545	void sqlite3utf16Substr(
				546	sqlite3_context *context,
				547	int argc,
				548	sqlite3_value **argv
				549	){
				550	int y, z;
				551	unsigned char const *zStr;
				552	unsigned char const *zStrEnd;
				553	unsigned char const *zStart;
				554	unsigned char const *zEnd;
				555	int i;
				556
				557	zStr = (unsigned char const *)sqlite3_value_text16(argv[0]);
				558	zStrEnd = &zStr[sqlite3_value_bytes16(argv[0])];
				559	y = sqlite3_value_int(argv[1]);
				560	z = sqlite3_value_int(argv[2]);
				561
				562	if( y>0 ){
				563	y = y-1;
				564	zStart = zStr;
				565	if( SQLITE_UTF16BE==SQLITE_UTF16NATIVE ){
				566	for(i=0; i<y && zStart<zStrEnd; i++) SKIP_UTF16BE(zStart);
				567	}else{
				568	for(i=0; i<y && zStart<zStrEnd; i++) SKIP_UTF16LE(zStart);
				569	}
				570	}else{
				571	zStart = zStrEnd;
				572	if( SQLITE_UTF16BE==SQLITE_UTF16NATIVE ){
				573	for(i=y; i<0 && zStart>zStr; i++) RSKIP_UTF16BE(zStart);
				574	}else{
				575	for(i=y; i<0 && zStart>zStr; i++) RSKIP_UTF16LE(zStart);
				576	}
				577	for(; i<0; i++) z -= 1;
				578	}
				579
				580	zEnd = zStart;
				581	if( SQLITE_UTF16BE==SQLITE_UTF16NATIVE ){
				582	for(i=0; i<z && zEnd<zStrEnd; i++) SKIP_UTF16BE(zEnd);
				583	}else{
				584	for(i=0; i<z && zEnd<zStrEnd; i++) SKIP_UTF16LE(zEnd);
				585	}
				586
				587	sqlite3_result_text16(context, zStart, zEnd-zStart, SQLITE_TRANSIENT);
				588	}
				589
#if defined(SQLITE_TEST)
/*
** Self-test invoked from the TCL test function "translate_selftest".
**
** For every code point below 0x110000 the value is written with one of the
** WRITE_* macros and read back with the matching READ_* macro; the test
** asserts that the same value comes back and that the reader consumed
** exactly the bytes the writer produced.  The UTF-16 passes leave out the
** values 0xD800 through 0xE000 inclusive.
*/
void sqlite3utfSelfTest(){
  unsigned char aBuf[20];     /* Scratch space for one encoded character */
  unsigned char *pCur;        /* Write, then read, cursor into aBuf */
  int cp;                     /* Code point under test */
  int nWritten;               /* Bytes produced by the writer */
  int decoded;                /* Value returned by the reader */

  /* UTF-8 round trip */
  cp = 0;
  while( cp<0x00110000 ){
    pCur = aBuf;
    WRITE_UTF8(pCur, cp);
    nWritten = pCur-aBuf;
    pCur = aBuf;
    READ_UTF8(pCur, decoded);
    assert( decoded==cp );
    assert( (pCur-aBuf)==nWritten );
    cp++;
  }

  /* UTF-16 little-endian round trip */
  for(cp=0; cp<0x00110000; cp++){
    if( cp<0xD800 || cp>0xE000 ){
      pCur = aBuf;
      WRITE_UTF16LE(pCur, cp);
      nWritten = pCur-aBuf;
      pCur = aBuf;
      READ_UTF16LE(pCur, decoded);
      assert( decoded==cp );
      assert( (pCur-aBuf)==nWritten );
    }
  }

  /* UTF-16 big-endian round trip */
  for(cp=0; cp<0x00110000; cp++){
    if( cp<0xD800 || cp>0xE000 ){
      pCur = aBuf;
      WRITE_UTF16BE(pCur, cp);
      nWritten = pCur-aBuf;
      pCur = aBuf;
      READ_UTF16BE(pCur, decoded);
      assert( decoded==cp );
      assert( (pCur-aBuf)==nWritten );
    }
  }
}
#endif