blob: 16454c2ffd8a3555f8dd4392a79f8b76583e4024 [file] [log] [blame]
drha5d14fe2004-05-04 15:00:46 +00001/*
2** 2004 April 13
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This file contains routines used to translate between UTF-8,
13** UTF-16, UTF-16BE, and UTF-16LE.
14**
drh1b743be2004-06-22 22:04:46 +000015** $Id: utf.c,v 1.23 2004/06/22 22:04:46 drh Exp $
drha5d14fe2004-05-04 15:00:46 +000016**
17** Notes on UTF-8:
18**
19** Byte-0 Byte-1 Byte-2 Byte-3 Value
20** 0xxxxxxx 00000000 00000000 0xxxxxxx
21** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx
22** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx
23** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx
24**
25**
26** Notes on UTF-16: (with wwww+1==uuuuu)
27**
drh51846b52004-05-28 16:00:21 +000028** Word-0 Word-1 Value
29** 110110ww wwzzzzyy 110111yy yyxxxxxx 000uuuuu zzzzyyyy yyxxxxxx
30** zzzzyyyy yyxxxxxx 00000000 zzzzyyyy yyxxxxxx
drha5d14fe2004-05-04 15:00:46 +000031**
danielk1977998b56c2004-05-06 23:37:52 +000032**
drha5d14fe2004-05-04 15:00:46 +000033** BOM or Byte Order Mark:
34** 0xff 0xfe little-endian utf-16 follows
35** 0xfe 0xff big-endian utf-16 follows
danielk1977998b56c2004-05-06 23:37:52 +000036**
37**
38** Handling of malformed strings:
39**
40** SQLite accepts and processes malformed strings without an error wherever
41** possible. However this is not possible when converting between UTF-8 and
42** UTF-16.
43**
** When converting malformed UTF-8 strings to UTF-16, one instance of the
** replacement character U+FFFD is substituted for each byte that cannot be
** interpreted as part of a valid unicode character.
47**
** When converting malformed UTF-16 strings to UTF-8, one instance of the
** replacement character U+FFFD is substituted for each pair of bytes that
** cannot be interpreted as part of a valid unicode character.
danielk1977bfd6cce2004-06-18 04:24:54 +000051**
52** This file contains the following public routines:
53**
54** sqlite3VdbeMemTranslate() - Translate the encoding used by a Mem* string.
55** sqlite3VdbeMemHandleBom() - Handle byte-order-marks in UTF16 Mem* strings.
56** sqlite3utf16ByteLen() - Calculate byte-length of a void* UTF16 string.
57** sqlite3utf8CharLen() - Calculate char-length of a char* UTF8 string.
58** sqlite3utf8LikeCompare() - Do a LIKE match given two UTF8 char* strings.
59**
drha5d14fe2004-05-04 15:00:46 +000060*/
danielk1977998b56c2004-05-06 23:37:52 +000061#include <assert.h>
danielk1977998b56c2004-05-06 23:37:52 +000062#include "sqliteInt.h"
danielk1977bfd6cce2004-06-18 04:24:54 +000063#include "vdbeInt.h"
danielk1977998b56c2004-05-06 23:37:52 +000064
/*
** The following macro, LOWERCASE(x), takes an integer representing a
** unicode code point. The value of the macro is the same code point folded
** to lower case, if applicable. Only the 26 letters 'A'..'Z' used in the
** English language are folded; every other value is passed through
** unchanged.
**
** This means that characters with umlauts etc. will not be folded
** correctly (unless they are encoded as composite characters, which would
** doubtless cause much trouble).
**
** The argument is fully parenthesized inside the macro so that any
** expression may be passed. It is evaluated more than once, so it must not
** have side effects, and it must not be negative because values below 91
** are used directly as an index into UpperToLower[].
*/
#define LOWERCASE(x) ((x)<91 ? (int)(UpperToLower[(x)]) : (x))
static const unsigned char UpperToLower[91] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
   18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
   36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
   54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
  104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
  122,
};
85
/*
** This table maps from the first byte of a UTF-8 character to the number
** of trailing bytes expected. A value '255' indicates that the table key
** is not a legal first byte for a UTF-8 character.
*/
static const unsigned char xtra_utf8_bytes[256] = {
/* 0xxxxxxx */
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,

/* 10wwwwww */
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,

/* 110yyyyy */
1, 1, 1, 1, 1, 1, 1, 1,     1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,     1, 1, 1, 1, 1, 1, 1, 1,

/* 1110zzzz */
2, 2, 2, 2, 2, 2, 2, 2,     2, 2, 2, 2, 2, 2, 2, 2,

/* 11110uuu (first eight entries); 11111xxx is never a legal first byte */
3, 3, 3, 3, 3, 3, 3, 3,     255, 255, 255, 255, 255, 255, 255, 255,
};

/*
** This table maps from the number of trailing bytes in a UTF-8 character
** to an integer constant that is effectively calculated for each character
** read by a naive implementation of a UTF-8 character reader: it is the
** sum of the marker bits (0xC0/0xE0/0xF0 on the first byte, 0x80 on each
** trailing byte) once every byte has been shifted into position. The code
** in the READ_UTF8 macro subtracts it to leave only the payload bits.
*/
static const int xtra_utf8_bits[4] = {
  0,
  12416,    /* (0xC0 << 6) + (0x80) */
  925824,   /* (0xE0 << 12) + (0x80 << 6) + (0x80) */
  63447168  /* (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
};

/*
** Read one UTF-8 character starting at zIn, store its code point in c and
** advance zIn past the bytes consumed. If the first byte is not a legal
** first byte, c is set to the replacement character U+FFFD and exactly one
** byte is consumed.
**
** Every byte is converted to unsigned char before use so that the macro
** also works on plain "char" pointers. Trailing bytes are not validated
** and are read without any bounds check: the caller must guarantee that
** all the bytes announced by the first byte are readable.
**
** zIn must be a modifiable pointer lvalue and c a modifiable integer
** lvalue; both are evaluated several times.
*/
#define READ_UTF8(zIn, c) do {                              \
  int xtra;                                                 \
  (c) = (unsigned char)*(zIn)++;                            \
  xtra = xtra_utf8_bytes[(c)];                              \
  switch( xtra ){                                           \
    case 255: (c) = (int)0xFFFD; break;                     \
    case 3: (c) = ((c)<<6) + (unsigned char)*(zIn)++;       \
            /* fall through */                              \
    case 2: (c) = ((c)<<6) + (unsigned char)*(zIn)++;       \
            /* fall through */                              \
    case 1: (c) = ((c)<<6) + (unsigned char)*(zIn)++;       \
            (c) -= xtra_utf8_bits[xtra];                    \
            break;                                          \
    default: break;                                         \
  }                                                         \
} while(0)

/*
** Advance zIn past one UTF-8 character without decoding it. A byte that
** is not a legal first byte counts as a one-byte character, which matches
** what READ_UTF8 consumes for the same input. (The table value 255 is a
** marker, not a byte count, so it must not be added to the pointer.)
*/
#define SKIP_UTF8(zIn) do {                                          \
  int skip_xtra = xtra_utf8_bytes[*(const unsigned char *)(zIn)];    \
  (zIn) += (skip_xtra==255 ? 1 : skip_xtra + 1);                     \
} while(0)
148
/*
** Append the UTF-8 encoding of code point c at zOut and advance zOut past
** the bytes written: 1 byte for c<0x80, 2 for c<0x800, 3 for c<0x10000
** and 4 bytes otherwise.
**
** zOut must be a modifiable byte-pointer lvalue. Both arguments are
** evaluated several times, so they must not have side effects. The macro
** expands to a single statement and is safe inside if/else.
*/
#define WRITE_UTF8(zOut, c) do {                          \
  if( (c)<0x00080 ){                                      \
    *(zOut)++ = ((c)&0xFF);                               \
  }else if( (c)<0x00800 ){                                \
    *(zOut)++ = 0xC0 + (((c)>>6)&0x1F);                   \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                      \
  }else if( (c)<0x10000 ){                                \
    *(zOut)++ = 0xE0 + (((c)>>12)&0x0F);                  \
    *(zOut)++ = 0x80 + (((c)>>6) & 0x3F);                 \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                      \
  }else{                                                  \
    *(zOut)++ = 0xF0 + (((c)>>18) & 0x07);                \
    *(zOut)++ = 0x80 + (((c)>>12) & 0x3F);                \
    *(zOut)++ = 0x80 + (((c)>>6) & 0x3F);                 \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                      \
  }                                                       \
} while(0)
168
/*
** Append the little-endian UTF-16 encoding of code point c at zOut and
** advance zOut past the bytes written: one 16-bit word (2 bytes) for
** c<=0xFFFF, otherwise a surrogate pair (4 bytes), each word stored low
** byte first.
**
** zOut must be a modifiable byte-pointer lvalue. Both arguments are
** evaluated several times, so they must not have side effects. The macro
** expands to a single statement and is safe inside if/else.
*/
#define WRITE_UTF16LE(zOut, c) do {                                       \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = ((c)&0x00FF);                                             \
    *(zOut)++ = (((c)>>8)&0x00FF);                                        \
  }else{                                                                  \
    *(zOut)++ = ((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));      \
    *(zOut)++ = (0x00D8 + ((((c)-0x10000)>>18)&0x03));                    \
    *(zOut)++ = ((c)&0x00FF);                                             \
    *(zOut)++ = (0x00DC + (((c)>>8)&0x03));                               \
  }                                                                       \
} while(0)
180
/*
** Append the big-endian UTF-16 encoding of code point c at zOut and
** advance zOut past the bytes written: one 16-bit word (2 bytes) for
** c<=0xFFFF, otherwise a surrogate pair (4 bytes), each word stored high
** byte first.
**
** zOut must be a modifiable byte-pointer lvalue. Both arguments are
** evaluated several times, so they must not have side effects. The macro
** expands to a single statement and is safe inside if/else.
*/
#define WRITE_UTF16BE(zOut, c) do {                                       \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (((c)>>8)&0x00FF);                                        \
    *(zOut)++ = ((c)&0x00FF);                                             \
  }else{                                                                  \
    *(zOut)++ = (0x00D8 + ((((c)-0x10000)>>18)&0x03));                    \
    *(zOut)++ = ((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));      \
    *(zOut)++ = (0x00DC + (((c)>>8)&0x03));                               \
    *(zOut)++ = ((c)&0x00FF);                                             \
  }                                                                       \
} while(0)
192
/*
** Read one little-endian UTF-16 character starting at zIn, store its code
** point in c and advance zIn past the bytes consumed (2, or 4 when the
** first word is a surrogate).
**
** Only words in the range 0xD800..0xDFFF are surrogates; 0xE000 is an
** ordinary character and must not swallow the following word. Each byte
** is converted to unsigned char so the macro also works on plain "char"
** pointers. The second word of a pair is read without validation or
** bounds check.
**
** zIn must be a modifiable pointer lvalue and c a modifiable integer
** lvalue; both are evaluated several times.
*/
#define READ_UTF16LE(zIn, c) do {                                         \
  (c) = (unsigned char)*(zIn)++;                                          \
  (c) += ((unsigned char)*(zIn)++)<<8;                                    \
  if( (c)>=0xD800 && (c)<0xE000 ){                                        \
    int c2 = (unsigned char)*(zIn)++;                                     \
    c2 += ((unsigned char)*(zIn)++)<<8;                                   \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10); \
  }                                                                       \
} while(0)
202
/*
** Read one big-endian UTF-16 character starting at zIn, store its code
** point in c and advance zIn past the bytes consumed (2, or 4 when the
** first word is a surrogate).
**
** Only words in the range 0xD800..0xDFFF are surrogates; 0xE000 is an
** ordinary character and must not swallow the following word. Each byte
** is converted to unsigned char so the macro also works on plain "char"
** pointers. The second word of a pair is read without validation or
** bounds check.
**
** zIn must be a modifiable pointer lvalue and c a modifiable integer
** lvalue; both are evaluated several times.
*/
#define READ_UTF16BE(zIn, c) do {                                         \
  (c) = ((unsigned char)*(zIn)++)<<8;                                     \
  (c) += (unsigned char)*(zIn)++;                                         \
  if( (c)>=0xD800 && (c)<0xE000 ){                                        \
    int c2 = ((unsigned char)*(zIn)++)<<8;                                \
    c2 += (unsigned char)*(zIn)++;                                        \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10); \
  }                                                                       \
} while(0)
212
213/*
214** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
215** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
216*/
217/* #define TRANSLATE_TRACE 1 */
218
219/*
220** This routine transforms the internal text encoding used by pMem to
221** desiredEnc. It is an error if the string is already of the desired
222** encoding, or if *pMem does not contain a string value.
223*/
224int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
225 unsigned char zShort[NBFS]; /* Temporary short output buffer */
226 int len; /* Maximum length of output string in bytes */
227 unsigned char *zOut; /* Output buffer */
228 unsigned char *zIn; /* Input iterator */
229 unsigned char *zTerm; /* End of input */
230 unsigned char *z; /* Output iterator */
231 int c;
232
233 assert( pMem->flags&MEM_Str );
234 assert( pMem->enc!=desiredEnc );
235 assert( pMem->enc!=0 );
236 assert( pMem->n>=0 );
237
238#ifdef TRANSLATE_TRACE
239 {
240 char zBuf[100];
241 sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
242 fprintf(stderr, "INPUT: %s\n", zBuf);
danielk1977ad7dd422004-06-06 12:41:49 +0000243 }
244#endif
245
danielk1977bfd6cce2004-06-18 04:24:54 +0000246 /* If the translation is between UTF-16 little and big endian, then
247 ** all that is required is to swap the byte order. This case is handled
248 ** differently from the others.
danielk1977998b56c2004-05-06 23:37:52 +0000249 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000250 if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
251 u8 temp;
252 sqlite3VdbeMemMakeWriteable(pMem);
253 zIn = pMem->z;
254 zTerm = &zIn[pMem->n];
255 while( zIn<zTerm ){
256 temp = *zIn;
257 *zIn = *(zIn+1);
258 zIn++;
259 *zIn++ = temp;
260 }
261 pMem->enc = desiredEnc;
262 goto translate_out;
263 }
264
265 /* Set zIn to point at the start of the input buffer and zTerm to point 1
266 ** byte past the end.
267 **
268 ** Variable zOut is set to point at the output buffer. This may be space
269 ** obtained from malloc(), or Mem.zShort, if it large enough and not in
270 ** use, or the zShort array on the stack (see above).
271 */
272 zIn = pMem->z;
273 zTerm = &zIn[pMem->n];
274 len = pMem->n*2 + 2;
275 if( len>NBFS ){
276 zOut = sqliteMallocRaw(len);
277 if( !zOut ) return SQLITE_NOMEM;
278 }else{
279 if( pMem->z==pMem->zShort ){
280 zOut = zShort;
281 }else{
282 zOut = pMem->zShort;
283 }
284 }
285 z = zOut;
286
287 if( pMem->enc==SQLITE_UTF8 ){
288 if( desiredEnc==SQLITE_UTF16LE ){
289 /* UTF-8 -> UTF-16 Little-endian */
290 while( zIn<zTerm ){
291 READ_UTF8(zIn, c);
292 WRITE_UTF16LE(z, c);
293 }
294 WRITE_UTF16LE(z, 0);
295 pMem->n = (z-zOut)-2;
296 }else if( desiredEnc==SQLITE_UTF16BE ){
297 /* UTF-8 -> UTF-16 Big-endian */
298 while( zIn<zTerm ){
299 READ_UTF8(zIn, c);
300 WRITE_UTF16BE(z, c);
301 }
302 WRITE_UTF16BE(z, 0);
303 pMem->n = (z-zOut)-2;
304 }
305 }else{
306 assert( desiredEnc==SQLITE_UTF8 );
307 if( pMem->enc==SQLITE_UTF16LE ){
308 /* UTF-16 Little-endian -> UTF-8 */
309 while( zIn<zTerm ){
310 READ_UTF16LE(zIn, c);
311 WRITE_UTF8(z, c);
312 }
313 WRITE_UTF8(z, 0);
314 pMem->n = (z-zOut)-1;
315 }else{
316 /* UTF-16 Little-endian -> UTF-8 */
317 while( zIn<zTerm ){
318 READ_UTF16BE(zIn, c);
319 WRITE_UTF8(z, c);
320 }
321 WRITE_UTF8(z, 0);
322 pMem->n = (z-zOut)-1;
danielk1977998b56c2004-05-06 23:37:52 +0000323 }
drh1b743be2004-06-22 22:04:46 +0000324 assert( pMem->n+1<=len );
danielk1977998b56c2004-05-06 23:37:52 +0000325 }
326
danielk1977bfd6cce2004-06-18 04:24:54 +0000327 sqlite3VdbeMemRelease(pMem);
328 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short);
329 pMem->enc = desiredEnc;
330 if( (char *)zOut==pMem->zShort ){
331 pMem->flags |= (MEM_Term|MEM_Short);
332 }else if( zOut==zShort ){
333 memcpy(pMem->zShort, zOut, len);
334 zOut = pMem->zShort;
335 pMem->flags |= (MEM_Term|MEM_Short);
336 }else{
337 pMem->flags |= (MEM_Term|MEM_Dyn);
338 }
339 pMem->z = zOut;
340
341translate_out:
342#ifdef TRANSLATE_TRACE
343 {
344 char zBuf[100];
345 sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
346 fprintf(stderr, "OUTPUT: %s\n", zBuf);
347 }
348#endif
349 return SQLITE_OK;
danielk1977998b56c2004-05-06 23:37:52 +0000350}
351
danielk197793d46752004-05-23 13:30:58 +0000352/*
danielk1977bfd6cce2004-06-18 04:24:54 +0000353** This routine checks for a byte-order mark at the beginning of the
354** UTF-16 string stored in *pMem. If one is present, it is removed and
355** the encoding of the Mem adjusted. This routine does not do any
356** byte-swapping, it just sets Mem.enc appropriately.
357**
358** The allocation (static, dynamic etc.) and encoding of the Mem may be
359** changed by this function.
danielk197793d46752004-05-23 13:30:58 +0000360*/
danielk1977bfd6cce2004-06-18 04:24:54 +0000361int sqlite3VdbeMemHandleBom(Mem *pMem){
362 int rc = SQLITE_OK;
363 u8 bom = 0;
364
365 if( pMem->n<0 || pMem->n>1 ){
366 u8 b1 = *(u8 *)pMem->z;
367 u8 b2 = *(((u8 *)pMem->z) + 1);
danielk197793d46752004-05-23 13:30:58 +0000368 if( b1==0xFE && b2==0xFF ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000369 bom = SQLITE_UTF16BE;
danielk197793d46752004-05-23 13:30:58 +0000370 }
371 if( b1==0xFF && b2==0xFE ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000372 bom = SQLITE_UTF16LE;
danielk197793d46752004-05-23 13:30:58 +0000373 }
374 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000375
376 if( bom ){
377 if( pMem->flags & MEM_Short ){
378 memmove(pMem->zShort, &pMem->zShort[2], NBFS-2);
379 pMem->n -= 2;
380 pMem->enc = bom;
danielk1977998b56c2004-05-06 23:37:52 +0000381 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000382 else if( pMem->flags & MEM_Dyn ){
383 void (*xDel)(void*) = pMem->xDel;
384 char *z = pMem->z;
385 pMem->z = 0;
386 pMem->xDel = 0;
387 rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT);
388 if( xDel ){
389 xDel(z);
390 }else{
391 sqliteFree(z);
392 }
393 }else{
394 rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom,
395 SQLITE_TRANSIENT);
396 }
danielk1977998b56c2004-05-06 23:37:52 +0000397 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000398 return rc;
danielk1977998b56c2004-05-06 23:37:52 +0000399}
400
/*
** z is a UTF-8 encoded unicode string. If nByte is less than zero,
** return the number of unicode characters in z up to (but not including)
** the first 0x00 byte. If nByte is not less than zero, return the
** number of unicode characters in the first nByte bytes of z (or up to
** the first 0x00, whichever comes first). In particular nByte==0 yields 0.
**
** Characters are stepped over with SKIP_UTF8, so a multi-byte character
** that begins before the nByte limit is counted even if it extends past
** the limit.
*/
int sqlite3utf8CharLen(const char *z, int nByte){
  int r = 0;
  const char *zTerm = 0;   /* End of the counted range; unused if nByte<0 */

  if( nByte>=0 ){
    zTerm = &z[nByte];
  }
  /* The limit is tested before *z is read so that a length-bounded string
  ** without a terminator is never read at or beyond z[nByte].
  */
  while( (nByte<0 || z<zTerm) && *z!=0 ){
    SKIP_UTF8(z);
    r++;
  }
  return r;
}
423
424/*
425** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
426** return the number of bytes up to (but not including), the first pair
427** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
428** then return the number of bytes in the first nChar unicode characters
429** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
430*/
danielk1977bfd6cce2004-06-18 04:24:54 +0000431int sqlite3utf16ByteLen(const void *zIn, int nChar){
432 int c = 1;
433 char const *z = zIn;
434 int n = 0;
435 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
436 while( c && ((nChar<0) || n<nChar) ){
437 READ_UTF16BE(z, c);
438 n++;
danielk19776622cce2004-05-20 11:00:52 +0000439 }
danielk19776622cce2004-05-20 11:00:52 +0000440 }else{
danielk1977bfd6cce2004-06-18 04:24:54 +0000441 while( c && ((nChar<0) || n<nChar) ){
442 READ_UTF16LE(z, c);
443 n++;
danielk19776622cce2004-05-20 11:00:52 +0000444 }
danielk19776622cce2004-05-20 11:00:52 +0000445 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000446 return (z-(char const *)zIn)-((c==0)?2:0);
danielk1977998b56c2004-05-06 23:37:52 +0000447}
448
/*
** Implementation of the SQL "LIKE" operator for two UTF-8 strings.
** Returns 1 if zString matches zPattern and 0 if it does not. In the
** pattern, '%' matches any sequence of zero or more characters and '_'
** matches exactly one character. Comparison is case-insensitive to the
** extent that LOWERCASE() folds case.
*/
int sqlite3utf8LikeCompare(
  const unsigned char *zPattern,
  const unsigned char *zString
){
  int cPat;     /* Current pattern byte, case-folded */
  int cStr;     /* Current string byte, case-folded */

  for(;;){
    cPat = LOWERCASE(*zPattern);
    if( cPat==0 ) break;

    if( cPat=='%' ){
      /* Absorb every '%' and '_' that directly follows. Each '_' still
      ** consumes one character of the string.
      */
      for(;;){
        cPat = zPattern[1];
        if( cPat!='%' && cPat!='_' ) break;
        if( cPat=='_' ){
          if( *zString==0 ) return 0;
          SKIP_UTF8(zString);
        }
        zPattern++;
      }

      /* A '%' at the very end of the pattern matches whatever is left */
      if( cPat==0 ) return 1;

      /* Otherwise try the rest of the pattern at every position of the
      ** string whose byte equals the byte that follows the '%'.
      */
      cPat = LOWERCASE(cPat);
      for(;;){
        cStr = LOWERCASE(*zString);
        if( cStr==0 ) break;
        while( cStr!=0 && cStr!=cPat ){
          zString++;
          cStr = LOWERCASE(*zString);
        }
        if( cStr==0 ) return 0;
        if( sqlite3utf8LikeCompare(&zPattern[1],zString) ) return 1;
        SKIP_UTF8(zString);
      }
      return 0;
    }

    if( cPat=='_' ){
      /* Any single character, but there has to be one */
      if( *zString==0 ) return 0;
      SKIP_UTF8(zString);
      zPattern++;
      continue;
    }

    /* Ordinary byte: must be equal after case folding */
    if( cPat!=LOWERCASE(*zString) ) return 0;
    zPattern++;
    zString++;
  }

  /* Pattern exhausted: it is a match only if the string is exhausted too */
  return *zString==0;
}
danielk1977bfd6cce2004-06-18 04:24:54 +0000501
#if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
** It checks that the primitives for serializing and deserializing
** characters in each encoding are inverses of each other: every code
** point is written to a scratch buffer, read back, and both the decoded
** value and the number of bytes consumed are asserted to match.
**
** The UTF-8 pass is switched off by the constant 0 in its loop condition.
** The UTF-16 passes skip the values 0xD800 through 0xE000.
*/
void sqlite3utfSelfTest(){
  unsigned char aScratch[20];   /* Holds one encoded character */
  unsigned char *pCur;          /* Write cursor, then read cursor */
  int cp;                       /* Code point under test */
  int nWritten;                 /* Bytes produced by the writer */
  int decoded;                  /* Value produced by the reader */

  /* UTF-8 (disabled) */
  for(cp=0; 0 && cp<0x00110000; cp++){
    pCur = aScratch;
    WRITE_UTF8(pCur, cp);
    nWritten = pCur-aScratch;
    pCur = aScratch;
    READ_UTF8(pCur, decoded);
    assert( decoded==cp );
    assert( (pCur-aScratch)==nWritten );
  }

  /* UTF-16 little-endian */
  cp = 0;
  while( cp<0x00110000 ){
    if( cp<0xD800 || cp>0xE000 ){
      pCur = aScratch;
      WRITE_UTF16LE(pCur, cp);
      nWritten = pCur-aScratch;
      pCur = aScratch;
      READ_UTF16LE(pCur, decoded);
      assert( decoded==cp );
      assert( (pCur-aScratch)==nWritten );
    }
    cp++;
  }

  /* UTF-16 big-endian */
  cp = 0;
  while( cp<0x00110000 ){
    if( cp<0xD800 || cp>0xE000 ){
      pCur = aScratch;
      WRITE_UTF16BE(pCur, cp);
      nWritten = pCur-aScratch;
      pCur = aScratch;
      READ_UTF16BE(pCur, decoded);
      assert( decoded==cp );
      assert( (pCur-aScratch)==nWritten );
    }
    cp++;
  }
}
#endif