blob: 5fc37b5b5866f35cd7aead3ef8937f67bf88939c [file] [log] [blame]
drha5d14fe2004-05-04 15:00:46 +00001/*
2** 2004 April 13
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This file contains routines used to translate between UTF-8,
13** UTF-16, UTF-16BE, and UTF-16LE.
14**
danielk1977f4618892004-06-28 13:09:11 +000015** $Id: utf.c,v 1.26 2004/06/28 13:09:11 danielk1977 Exp $
drha5d14fe2004-05-04 15:00:46 +000016**
17** Notes on UTF-8:
18**
**  Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                      00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx            00000000 zzzzyyyy yyxxxxxx
**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx  000uuuuu zzzzyyyy yyxxxxxx
24**
25**
26** Notes on UTF-16: (with wwww+1==uuuuu)
27**
**      Word-0               Word-1          Value
**  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
drha5d14fe2004-05-04 15:00:46 +000031**
danielk1977998b56c2004-05-06 23:37:52 +000032**
drha5d14fe2004-05-04 15:00:46 +000033** BOM or Byte Order Mark:
34** 0xff 0xfe little-endian utf-16 follows
35** 0xfe 0xff big-endian utf-16 follows
danielk1977998b56c2004-05-06 23:37:52 +000036**
37**
38** Handling of malformed strings:
39**
40** SQLite accepts and processes malformed strings without an error wherever
41** possible. However this is not possible when converting between UTF-8 and
42** UTF-16.
43**
** When converting malformed UTF-8 strings to UTF-16, one instance of the
** replacement character U+FFFD is emitted for each byte that cannot be
** interpreted as part of a valid unicode character.
**
** When converting malformed UTF-16 strings to UTF-8, one instance of the
** replacement character U+FFFD is emitted for each pair of bytes that
** cannot be interpreted as part of a valid unicode character.
danielk1977bfd6cce2004-06-18 04:24:54 +000051**
52** This file contains the following public routines:
53**
54** sqlite3VdbeMemTranslate() - Translate the encoding used by a Mem* string.
55** sqlite3VdbeMemHandleBom() - Handle byte-order-marks in UTF16 Mem* strings.
56** sqlite3utf16ByteLen() - Calculate byte-length of a void* UTF16 string.
57** sqlite3utf8CharLen() - Calculate char-length of a char* UTF8 string.
58** sqlite3utf8LikeCompare() - Do a LIKE match given two UTF8 char* strings.
59**
drha5d14fe2004-05-04 15:00:46 +000060*/
danielk1977998b56c2004-05-06 23:37:52 +000061#include <assert.h>
danielk1977998b56c2004-05-06 23:37:52 +000062#include "sqliteInt.h"
danielk1977bfd6cce2004-06-18 04:24:54 +000063#include "vdbeInt.h"
danielk1977998b56c2004-05-06 23:37:52 +000064
/*
** LOWERCASE(x) takes an integer representing a unicode code point and
** evaluates to the same code point folded to lower case, if applicable.
** Only the 26 letters of the English alphabet ('A'..'Z', code points
** 65..90) are folded; every other value is returned unchanged.
**
** This means that characters with umlauts etc. will not be folded
** correctly (unless they are encoded as composite characters, which would
** doubtless cause much trouble).
**
** UpperToLower[] only covers code points 0..90, which is why the macro
** range-checks its argument before indexing.  The argument is evaluated
** more than once, so it must be free of side effects, and it must not be
** negative.
*/
#define LOWERCASE(x) ((x)<91 ? (int)(UpperToLower[(x)]) : (x))
static const unsigned char UpperToLower[91] = {
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
     18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
     36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
     54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
    104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
    122,
};
85
86/*
danielk1977bfd6cce2004-06-18 04:24:54 +000087** This table maps from the first byte of a UTF-8 character to the number
88** of trailing bytes expected. A value '255' indicates that the table key
89** is not a legal first byte for a UTF-8 character.
danielk1977d02eb1f2004-06-06 09:44:03 +000090*/
danielk1977bfd6cce2004-06-18 04:24:54 +000091static const u8 xtra_utf8_bytes[256] = {
92/* 0xxxxxxx */
930, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
940, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
950, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
970, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
980, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
990, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
danielk1977d02eb1f2004-06-06 09:44:03 +0000101
danielk1977bfd6cce2004-06-18 04:24:54 +0000102/* 10wwwwww */
103255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
104255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
105255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
106255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
danielk1977ad7dd422004-06-06 12:41:49 +0000107
danielk1977bfd6cce2004-06-18 04:24:54 +0000108/* 110yyyyy */
1091, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1101, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
111
112/* 1110zzzz */
1132, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
114
115/* 11110yyy */
1163, 3, 3, 3, 3, 3, 3, 3, 255, 255, 255, 255, 255, 255, 255, 255,
117};
118
/*
** This table maps from the number of trailing bytes in a UTF-8 character
** to an integer constant that is effectively calculated for each character
** read by a naive implementation of a UTF-8 character reader. The code
** in the READ_UTF8 macro explains things best.
**
** Entry N is the sum of the marker bits (the 110/1110/11110 prefix of the
** first byte and the 10 prefix of every trailing byte) after each byte has
** been shifted into its final position, so subtracting it from the raw
** accumulated value leaves only the payload bits.
*/
static const int xtra_utf8_bits[4] =  {
  0,
  (0xC0<<6) + 0x80,                            /* 12416    */
  (0xE0<<12) + (0x80<<6) + 0x80,               /* 925824   */
  (0xF0<<18) + (0x80<<12) + (0x80<<6) + 0x80   /* 63447168 */
};
131
/*
** READ_UTF8(zIn, c): decode one UTF-8 character starting at zIn, store its
** code point in c (an int lvalue) and advance zIn past the bytes consumed.
**
** zIn must point at unsigned bytes, because the first byte is used directly
** as an index into xtra_utf8_bytes[].  A byte that is not a legal first byte
** yields U+FFFD and consumes exactly that one byte.
**
** For a multi-byte character the raw bytes are accumulated six bits at a
** time and the marker bits are then removed in one subtraction using
** xtra_utf8_bits[].  The trailing bytes are not validated and no end-of-input
** check is made: up to three bytes following the first byte are read
** unconditionally.
*/
#define READ_UTF8(zIn, c) {                                   \
  int nXtra_;                                                 \
  (c) = *(zIn)++;                                             \
  nXtra_ = xtra_utf8_bytes[(c)];                              \
  switch( nXtra_ ){                                           \
    case 255: (c) = (int)0xFFFD; break;                       \
    case 3: (c) = ((c)<<6) + *(zIn)++;   /* fall through */   \
    case 2: (c) = ((c)<<6) + *(zIn)++;   /* fall through */   \
    case 1: (c) = ((c)<<6) + *(zIn)++;                        \
            (c) -= xtra_utf8_bits[nXtra_];                    \
            break;                                            \
    default: break;            /* single byte: c is final */  \
  }                                                           \
}
144
/*
** SKIP_UTF8(zIn): advance zIn past one UTF-8 character without decoding it.
**
** The first byte is always consumed.  If it is a multi-byte lead (0xC0 or
** above), every continuation byte (10xxxxxx) that follows is consumed too.
** Scanning for continuation bytes rather than trusting a length derived
** from the first byte means that:
**
**   *  a stray continuation byte or other illegal first byte advances by
**      exactly one byte instead of by a bogus table value, and
**   *  a truncated sequence never steps over a 0x00 terminator, because
**      0x00 is not a continuation byte.
**
** zIn may point to char or unsigned char, const or not; bytes are always
** examined as unsigned.
*/
#define SKIP_UTF8(zIn) {                                          \
  if( *(const unsigned char *)((zIn)++)>=0xC0 ){                  \
    while( (*(const unsigned char *)(zIn) & 0xC0)==0x80 ){        \
      (zIn)++;                                                    \
    }                                                             \
  }                                                               \
}
148
/*
** WRITE_UTF8(zOut, c): append code point c to the byte buffer at zOut as
** UTF-8 and advance zOut past the 1 to 4 bytes written.
**
**     c < 0x80       1 byte
**     c < 0x800      2 bytes
**     c < 0x10000    3 bytes
**     otherwise      4 bytes
**
** No validation is performed on c and the caller must guarantee room for
** up to four bytes.  c is evaluated several times and must be free of side
** effects.
*/
#define WRITE_UTF8(zOut, c) {                            \
  if( (c)<0x00080 ){                                     \
    *(zOut)++ = ((c)&0xFF);                              \
  }                                                      \
  else if( (c)<0x00800 ){                                \
    *(zOut)++ = 0xC0 + (((c)>>6)&0x1F);                  \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                     \
  }                                                      \
  else if( (c)<0x10000 ){                                \
    *(zOut)++ = 0xE0 + (((c)>>12)&0x0F);                 \
    *(zOut)++ = 0x80 + (((c)>>6) & 0x3F);                \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                     \
  }else{                                                 \
    *(zOut)++ = 0xF0 + (((c)>>18) & 0x07);               \
    *(zOut)++ = 0x80 + (((c)>>12) & 0x3F);               \
    *(zOut)++ = 0x80 + (((c)>>6) & 0x3F);                \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                     \
  }                                                      \
}
168
/*
** WRITE_UTF16LE(zOut, c): append code point c to the byte buffer at zOut as
** little-endian UTF-16 and advance zOut.  Values up to 0xFFFF are written
** as one 16-bit unit (2 bytes); larger values are written as a surrogate
** pair (4 bytes), low byte of each unit first.
**
** No validation is performed on c.  c is evaluated several times and must
** be free of side effects.
*/
#define WRITE_UTF16LE(zOut, c) {                                          \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = ((c)&0x00FF);                                             \
    *(zOut)++ = (((c)>>8)&0x00FF);                                        \
  }else{                                                                  \
    *(zOut)++ = ((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));      \
    *(zOut)++ = (0x00D8 + ((((c)-0x10000)>>18)&0x03));                    \
    *(zOut)++ = ((c)&0x00FF);                                             \
    *(zOut)++ = (0x00DC + (((c)>>8)&0x03));                               \
  }                                                                       \
}
180
/*
** WRITE_UTF16BE(zOut, c): append code point c to the byte buffer at zOut as
** big-endian UTF-16 and advance zOut.  Values up to 0xFFFF are written as
** one 16-bit unit (2 bytes); larger values are written as a surrogate pair
** (4 bytes), high byte of each unit first.
**
** No validation is performed on c.  c is evaluated several times and must
** be free of side effects.
*/
#define WRITE_UTF16BE(zOut, c) {                                          \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (((c)>>8)&0x00FF);                                        \
    *(zOut)++ = ((c)&0x00FF);                                             \
  }else{                                                                  \
    *(zOut)++ = (0x00D8 + ((((c)-0x10000)>>18)&0x03));                    \
    *(zOut)++ = ((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));      \
    *(zOut)++ = (0x00DC + (((c)>>8)&0x03));                               \
    *(zOut)++ = ((c)&0x00FF);                                             \
  }                                                                       \
}
192
/*
** READ_UTF16LE(zIn, c): decode one little-endian UTF-16 character starting
** at zIn, store its code point in c (an int lvalue) and advance zIn past
** the 2 or 4 bytes consumed.
**
** zIn must point at unsigned bytes.  A first code unit in the surrogate
** range 0xD800..0xDFFF is combined with the following code unit; the second
** unit is not validated and no end-of-input check is made.  U+E000 is an
** ordinary character and is returned as a single unit.
*/
#define READ_UTF16LE(zIn, c){                                             \
  (c) = (*(zIn)++);                                                       \
  (c) += ((*(zIn)++)<<8);                                                 \
  if( (c)>=0xD800 && (c)<0xE000 ){                                        \
    int cTrail_ = (*(zIn)++);                                             \
    cTrail_ += ((*(zIn)++)<<8);                                           \
    (c) = (cTrail_&0x03FF) + (((c)&0x003F)<<10)                           \
          + ((((c)&0x03C0)+0x0040)<<10);                                  \
  }                                                                       \
}
202
/*
** READ_UTF16BE(zIn, c): decode one big-endian UTF-16 character starting at
** zIn, store its code point in c (an int lvalue) and advance zIn past the
** 2 or 4 bytes consumed.
**
** zIn must point at unsigned bytes.  A first code unit in the surrogate
** range 0xD800..0xDFFF is combined with the following code unit; the second
** unit is not validated and no end-of-input check is made.  U+E000 is an
** ordinary character and is returned as a single unit.
*/
#define READ_UTF16BE(zIn, c){                                             \
  (c) = ((*(zIn)++)<<8);                                                  \
  (c) += (*(zIn)++);                                                      \
  if( (c)>=0xD800 && (c)<0xE000 ){                                        \
    int cTrail_ = ((*(zIn)++)<<8);                                        \
    cTrail_ += (*(zIn)++);                                                \
    (c) = (cTrail_&0x03FF) + (((c)&0x003F)<<10)                           \
          + ((((c)&0x03C0)+0x0040)<<10);                                  \
  }                                                                       \
}
212
/*
** SKIP_UTF16BE(zIn) / SKIP_UTF16LE(zIn): advance zIn past one UTF-16
** character without decoding it.  zIn must point at unsigned bytes.
**
** Only the high byte of the current 16-bit unit is examined: if it lies in
** 0xD8..0xDF the unit is a surrogate and the character occupies 4 bytes,
** otherwise it occupies 2.  In big-endian text the high byte is zIn[0], in
** little-endian text it is zIn[1].  U+E000 is an ordinary 2-byte character.
** No end-of-input check is made.
*/
#define SKIP_UTF16BE(zIn){                                                \
  if( (zIn)[0]>=0xD8 && (zIn)[0]<0xE0 ){                                  \
    (zIn) += 4;                                                           \
  }else{                                                                  \
    (zIn) += 2;                                                           \
  }                                                                       \
}
#define SKIP_UTF16LE(zIn){                                                \
  if( (zIn)[1]>=0xD8 && (zIn)[1]<0xE0 ){                                  \
    (zIn) += 4;                                                           \
  }else{                                                                  \
    (zIn) += 2;                                                           \
  }                                                                       \
}
228
/*
** RSKIP_UTF16LE(zIn) / RSKIP_UTF16BE(zIn): move zIn backwards over one
** UTF-16 character.  zIn must point at unsigned bytes, must sit on a
** character boundary, and at least one character must precede it.
**
** The 16-bit unit immediately before zIn occupies bytes zIn[-2] and zIn[-1].
** Its high byte (zIn[-1] in little-endian text, zIn[-2] in big-endian text)
** decides the step: if it lies in 0xDC..0xDF the unit is the trailing half
** of a surrogate pair and the character occupies 4 bytes, otherwise it
** occupies 2.  The byte at zIn itself is never read.
*/
#define RSKIP_UTF16LE(zIn){                                               \
  if( (zIn)[-1]>=0xDC && (zIn)[-1]<0xE0 ){                                \
    (zIn) -= 4;                                                           \
  }else{                                                                  \
    (zIn) -= 2;                                                           \
  }                                                                       \
}
#define RSKIP_UTF16BE(zIn){                                               \
  if( (zIn)[-2]>=0xDC && (zIn)[-2]<0xE0 ){                                \
    (zIn) -= 4;                                                           \
  }else{                                                                  \
    (zIn) -= 2;                                                           \
  }                                                                       \
}
244
danielk1977bfd6cce2004-06-18 04:24:54 +0000245/*
246** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
247** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
248*/
249/* #define TRANSLATE_TRACE 1 */
250
251/*
252** This routine transforms the internal text encoding used by pMem to
253** desiredEnc. It is an error if the string is already of the desired
254** encoding, or if *pMem does not contain a string value.
255*/
256int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
257 unsigned char zShort[NBFS]; /* Temporary short output buffer */
258 int len; /* Maximum length of output string in bytes */
259 unsigned char *zOut; /* Output buffer */
260 unsigned char *zIn; /* Input iterator */
261 unsigned char *zTerm; /* End of input */
262 unsigned char *z; /* Output iterator */
263 int c;
264
265 assert( pMem->flags&MEM_Str );
266 assert( pMem->enc!=desiredEnc );
267 assert( pMem->enc!=0 );
268 assert( pMem->n>=0 );
269
270#ifdef TRANSLATE_TRACE
271 {
272 char zBuf[100];
273 sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
274 fprintf(stderr, "INPUT: %s\n", zBuf);
danielk1977ad7dd422004-06-06 12:41:49 +0000275 }
276#endif
277
danielk1977bfd6cce2004-06-18 04:24:54 +0000278 /* If the translation is between UTF-16 little and big endian, then
279 ** all that is required is to swap the byte order. This case is handled
280 ** differently from the others.
danielk1977998b56c2004-05-06 23:37:52 +0000281 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000282 if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
283 u8 temp;
284 sqlite3VdbeMemMakeWriteable(pMem);
285 zIn = pMem->z;
286 zTerm = &zIn[pMem->n];
287 while( zIn<zTerm ){
288 temp = *zIn;
289 *zIn = *(zIn+1);
290 zIn++;
291 *zIn++ = temp;
292 }
293 pMem->enc = desiredEnc;
294 goto translate_out;
295 }
296
danielk1977d7e69642004-06-23 00:23:49 +0000297 /* Set len to the maximum number of bytes required in the output buffer. */
298 if( desiredEnc==SQLITE_UTF8 ){
299 /* When converting from UTF-16, the maximum growth results from
300 ** translating a 2-byte character to a 3-byte UTF-8 character (i.e.
301 ** code-point 0xFFFC). A single byte is required for the output string
302 ** nul-terminator.
303 */
304 len = (pMem->n/2) * 3 + 1;
305 }else{
306 /* When converting from UTF-8 to UTF-16 the maximum growth is caused
307 ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
308 ** character. Two bytes are required in the output buffer for the
309 ** nul-terminator.
310 */
311 len = pMem->n * 2 + 2;
312 }
313
danielk1977bfd6cce2004-06-18 04:24:54 +0000314 /* Set zIn to point at the start of the input buffer and zTerm to point 1
315 ** byte past the end.
316 **
317 ** Variable zOut is set to point at the output buffer. This may be space
318 ** obtained from malloc(), or Mem.zShort, if it large enough and not in
319 ** use, or the zShort array on the stack (see above).
320 */
321 zIn = pMem->z;
322 zTerm = &zIn[pMem->n];
danielk1977bfd6cce2004-06-18 04:24:54 +0000323 if( len>NBFS ){
324 zOut = sqliteMallocRaw(len);
325 if( !zOut ) return SQLITE_NOMEM;
326 }else{
danielk19771ba1b552004-06-23 13:46:32 +0000327 zOut = zShort;
danielk1977bfd6cce2004-06-18 04:24:54 +0000328 }
329 z = zOut;
330
331 if( pMem->enc==SQLITE_UTF8 ){
332 if( desiredEnc==SQLITE_UTF16LE ){
333 /* UTF-8 -> UTF-16 Little-endian */
334 while( zIn<zTerm ){
335 READ_UTF8(zIn, c);
336 WRITE_UTF16LE(z, c);
337 }
338 WRITE_UTF16LE(z, 0);
339 pMem->n = (z-zOut)-2;
340 }else if( desiredEnc==SQLITE_UTF16BE ){
341 /* UTF-8 -> UTF-16 Big-endian */
342 while( zIn<zTerm ){
343 READ_UTF8(zIn, c);
344 WRITE_UTF16BE(z, c);
345 }
346 WRITE_UTF16BE(z, 0);
347 pMem->n = (z-zOut)-2;
348 }
349 }else{
350 assert( desiredEnc==SQLITE_UTF8 );
351 if( pMem->enc==SQLITE_UTF16LE ){
352 /* UTF-16 Little-endian -> UTF-8 */
353 while( zIn<zTerm ){
354 READ_UTF16LE(zIn, c);
355 WRITE_UTF8(z, c);
356 }
357 WRITE_UTF8(z, 0);
358 pMem->n = (z-zOut)-1;
359 }else{
360 /* UTF-16 Little-endian -> UTF-8 */
361 while( zIn<zTerm ){
362 READ_UTF16BE(zIn, c);
363 WRITE_UTF8(z, c);
364 }
365 WRITE_UTF8(z, 0);
366 pMem->n = (z-zOut)-1;
danielk1977998b56c2004-05-06 23:37:52 +0000367 }
368 }
danielk1977d7e69642004-06-23 00:23:49 +0000369 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
danielk1977998b56c2004-05-06 23:37:52 +0000370
danielk1977bfd6cce2004-06-18 04:24:54 +0000371 sqlite3VdbeMemRelease(pMem);
372 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short);
373 pMem->enc = desiredEnc;
danielk19771ba1b552004-06-23 13:46:32 +0000374 if( zOut==zShort ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000375 memcpy(pMem->zShort, zOut, len);
376 zOut = pMem->zShort;
377 pMem->flags |= (MEM_Term|MEM_Short);
378 }else{
379 pMem->flags |= (MEM_Term|MEM_Dyn);
380 }
381 pMem->z = zOut;
382
383translate_out:
384#ifdef TRANSLATE_TRACE
385 {
386 char zBuf[100];
387 sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
388 fprintf(stderr, "OUTPUT: %s\n", zBuf);
389 }
390#endif
391 return SQLITE_OK;
danielk1977998b56c2004-05-06 23:37:52 +0000392}
393
danielk197793d46752004-05-23 13:30:58 +0000394/*
danielk1977bfd6cce2004-06-18 04:24:54 +0000395** This routine checks for a byte-order mark at the beginning of the
396** UTF-16 string stored in *pMem. If one is present, it is removed and
397** the encoding of the Mem adjusted. This routine does not do any
398** byte-swapping, it just sets Mem.enc appropriately.
399**
400** The allocation (static, dynamic etc.) and encoding of the Mem may be
401** changed by this function.
danielk197793d46752004-05-23 13:30:58 +0000402*/
danielk1977bfd6cce2004-06-18 04:24:54 +0000403int sqlite3VdbeMemHandleBom(Mem *pMem){
404 int rc = SQLITE_OK;
405 u8 bom = 0;
406
407 if( pMem->n<0 || pMem->n>1 ){
408 u8 b1 = *(u8 *)pMem->z;
409 u8 b2 = *(((u8 *)pMem->z) + 1);
danielk197793d46752004-05-23 13:30:58 +0000410 if( b1==0xFE && b2==0xFF ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000411 bom = SQLITE_UTF16BE;
danielk197793d46752004-05-23 13:30:58 +0000412 }
413 if( b1==0xFF && b2==0xFE ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000414 bom = SQLITE_UTF16LE;
danielk197793d46752004-05-23 13:30:58 +0000415 }
416 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000417
418 if( bom ){
danielk19771ba1b552004-06-23 13:46:32 +0000419 /* This function is called as soon as a string is stored in a Mem*,
420 ** from within sqlite3VdbeMemSetStr(). At that point it is not possible
421 ** for the string to be stored in Mem.zShort, or for it to be stored
422 ** in dynamic memory with no destructor.
423 */
424 assert( !(pMem->flags&MEM_Short) );
425 assert( !(pMem->flags&MEM_Dyn) || pMem->xDel );
426 if( pMem->flags & MEM_Dyn ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000427 void (*xDel)(void*) = pMem->xDel;
428 char *z = pMem->z;
429 pMem->z = 0;
430 pMem->xDel = 0;
431 rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT);
danielk19771ba1b552004-06-23 13:46:32 +0000432 xDel(z);
danielk1977bfd6cce2004-06-18 04:24:54 +0000433 }else{
434 rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom,
435 SQLITE_TRANSIENT);
436 }
danielk1977998b56c2004-05-06 23:37:52 +0000437 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000438 return rc;
danielk1977998b56c2004-05-06 23:37:52 +0000439}
440
/*
** z is a UTF-8 encoded unicode string. If nByte is less than zero,
** return the number of unicode characters in z up to (but not including)
** the first 0x00 byte. If nByte is not less than zero, return the
** number of unicode characters in the first nByte of z (or up to
** the first 0x00, whichever comes first).
**
** The length limit is tested before each byte is examined, so no byte at
** or beyond z[nByte] is read by the loop condition.
*/
int sqlite3utf8CharLen(const char *z, int nByte){
  int r = 0;                  /* Number of characters counted so far */
  const char *zTerm = 0;      /* One past the last byte; unused if nByte<0 */

  if( nByte>=0 ){
    zTerm = &z[nByte];
  }
  while( (nByte<0 || z<zTerm) && *z!=0 ){
    SKIP_UTF8(z);
    r++;
  }
  return r;
}
463
464/*
465** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
466** return the number of bytes up to (but not including), the first pair
467** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
468** then return the number of bytes in the first nChar unicode characters
469** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
470*/
danielk1977bfd6cce2004-06-18 04:24:54 +0000471int sqlite3utf16ByteLen(const void *zIn, int nChar){
472 int c = 1;
473 char const *z = zIn;
474 int n = 0;
475 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
476 while( c && ((nChar<0) || n<nChar) ){
477 READ_UTF16BE(z, c);
478 n++;
danielk19776622cce2004-05-20 11:00:52 +0000479 }
danielk19776622cce2004-05-20 11:00:52 +0000480 }else{
danielk1977bfd6cce2004-06-18 04:24:54 +0000481 while( c && ((nChar<0) || n<nChar) ){
482 READ_UTF16LE(z, c);
483 n++;
danielk19776622cce2004-05-20 11:00:52 +0000484 }
danielk19776622cce2004-05-20 11:00:52 +0000485 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000486 return (z-(char const *)zIn)-((c==0)?2:0);
danielk1977998b56c2004-05-06 23:37:52 +0000487}
488
/*
** Compare two UTF-8 strings for equality using the "LIKE" operator of
** SQL.  The '%' character matches any sequence of 0 or more
** characters and '_' matches any single character.  Case is
** not significant.
**
** zPattern is the LIKE pattern and zString the text to test; both are
** nul-terminated.  Returns 1 on a match and 0 otherwise.
**
** Literal pattern characters are compared byte by byte after folding
** with LOWERCASE(), whereas '_' consumes one whole UTF-8 character.
** A '%' followed by further pattern text is handled by recursing on the
** remainder of the pattern at each candidate position in zString.
*/
int sqlite3utf8LikeCompare(
  const unsigned char *zPattern,
  const unsigned char *zString
){
  int c;                      /* Current (folded) pattern byte */
  int c2;                     /* Current (folded) string byte */

  while( (c = LOWERCASE(*zPattern))!=0 ){
    switch( c ){
      case '%': {
        /* Absorb any run of '%' and '_' that follows.  Each '_' must
        ** still consume one character of the string.
        */
        while( (c=zPattern[1]) == '%' || c == '_' ){
          if( c=='_' ){
            if( *zString==0 ) return 0;
            SKIP_UTF8(zString);
          }
          zPattern++;
        }
        /* A trailing '%' matches whatever is left. */
        if( c==0 ) return 1;
        c = LOWERCASE(c);
        while( (c2=LOWERCASE(*zString))!=0 ){
          /* Scan forward to the next byte that could start a match for
          ** the pattern byte following the wildcard.
          */
          while( c2 != 0 && c2 != c ){
            zString++;
            c2 = LOWERCASE(*zString);
          }
          if( c2==0 ) return 0;
          if( sqlite3utf8LikeCompare(&zPattern[1],zString) ) return 1;
          SKIP_UTF8(zString);
        }
        return 0;
      }
      case '_': {
        if( *zString==0 ) return 0;
        SKIP_UTF8(zString);
        zPattern++;
        break;
      }
      default: {
        if( c != LOWERCASE(*zString) ) return 0;
        zPattern++;
        zString++;
        break;
      }
    }
  }
  /* Pattern exhausted: it is a match only if the string is too. */
  return *zString==0;
}
danielk1977bfd6cce2004-06-18 04:24:54 +0000541
danielk1977f4618892004-06-28 13:09:11 +0000542/*
543** UTF-16 implementation of the substr()
544*/
545void sqlite3utf16Substr(
546 sqlite3_context *context,
547 int argc,
548 sqlite3_value **argv
549){
550 int y, z;
551 unsigned char const *zStr;
552 unsigned char const *zStrEnd;
553 unsigned char const *zStart;
554 unsigned char const *zEnd;
555 int i;
556
557 zStr = (unsigned char const *)sqlite3_value_text16(argv[0]);
558 zStrEnd = &zStr[sqlite3_value_bytes16(argv[0])];
559 y = sqlite3_value_int(argv[1]);
560 z = sqlite3_value_int(argv[2]);
561
562 if( y>0 ){
563 y = y-1;
564 zStart = zStr;
565 if( SQLITE_UTF16BE==SQLITE_UTF16NATIVE ){
566 for(i=0; i<y && zStart<zStrEnd; i++) SKIP_UTF16BE(zStart);
567 }else{
568 for(i=0; i<y && zStart<zStrEnd; i++) SKIP_UTF16LE(zStart);
569 }
570 }else{
571 zStart = zStrEnd;
572 if( SQLITE_UTF16BE==SQLITE_UTF16NATIVE ){
573 for(i=y; i<0 && zStart>zStr; i++) RSKIP_UTF16BE(zStart);
574 }else{
575 for(i=y; i<0 && zStart>zStr; i++) RSKIP_UTF16LE(zStart);
576 }
577 for(; i<0; i++) z -= 1;
578 }
579
580 zEnd = zStart;
581 if( SQLITE_UTF16BE==SQLITE_UTF16NATIVE ){
582 for(i=0; i<z && zEnd<zStrEnd; i++) SKIP_UTF16BE(zEnd);
583 }else{
584 for(i=0; i<z && zEnd<zStrEnd; i++) SKIP_UTF16LE(zEnd);
585 }
586
587 sqlite3_result_text16(context, zStart, zEnd-zStart, SQLITE_TRANSIENT);
588}
589
#if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
** It checks that the primitives for serializing and deserializing
** characters in each encoding are inverses of each other.
**
** Every code point from 0 up to 0x10FFFF is written with a WRITE_* macro
** and read back with the matching READ_* macro; both the decoded value and
** the number of bytes consumed must agree with what was written.  All
** checks are assert()s, so the routine verifies nothing when assert() is
** compiled out.
*/
void sqlite3utfSelfTest(void){
  int i;
  unsigned char zBuf[20];
  unsigned char *z;
  int n;
  int c;

  for(i=0; i<0x00110000; i++){
    z = zBuf;
    WRITE_UTF8(z, i);
    n = z-zBuf;
    z = zBuf;
    READ_UTF8(z, c);
    assert( c==i );
    assert( (z-zBuf)==n );
  }
  /* For UTF-16 the values 0xD800 through 0xE000 inclusive are skipped: a
  ** surrogate value cannot be round-tripped as a single code unit.
  ** NOTE(review): U+E000 is itself an ordinary character; it is excluded
  ** here as in the historical test - confirm that the UTF-16 readers
  ** accept it before narrowing this range.
  */
  for(i=0; i<0x00110000; i++){
    if( i>=0xD800 && i<=0xE000 ) continue;
    z = zBuf;
    WRITE_UTF16LE(z, i);
    n = z-zBuf;
    z = zBuf;
    READ_UTF16LE(z, c);
    assert( c==i );
    assert( (z-zBuf)==n );
  }
  for(i=0; i<0x00110000; i++){
    if( i>=0xD800 && i<=0xE000 ) continue;
    z = zBuf;
    WRITE_UTF16BE(z, i);
    n = z-zBuf;
    z = zBuf;
    READ_UTF16BE(z, c);
    assert( c==i );
    assert( (z-zBuf)==n );
  }
}
#endif