/*
** 2004 April 13
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains routines used to translate between UTF-8,
** UTF-16, UTF-16BE, and UTF-16LE.
**
** $Id: utf.c,v 1.54 2007/08/16 04:30:40 drh Exp $
**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
**
**
** Notes on UTF-16:  (with wwww+1==uuuuu)
**
**      Word-0               Word-1          Value
**  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
**
**
** BOM or Byte Order Mark:
**     0xff 0xfe   little-endian utf-16 follows
**     0xfe 0xff   big-endian utf-16 follows
**
*/
#include "sqliteInt.h"
#include <assert.h>
#include "vdbeInt.h"

/*
** The following constant value is used by the SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros.
*/
const int sqlite3one = 1;

/*
** This lookup table is used to help decode the first byte of
** a multi-byte UTF8 character.
**
** The table has 64 entries and is indexed by (first byte - 0xc0).  Each
** entry holds the payload bits carried by that lead byte:
**
**    0xc0..0xdf   low 5 bits of the lead byte
**    0xe0..0xef   low 4 bits
**    0xf0..0xf7   low 3 bits
**    0xf8..0xfb   low 2 bits
**    0xfc..0xfd   low 1 bit
**    0xfe..0xff   zero
*/
static const unsigned char sqlite3UtfTrans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* 0xc0..0xc7 */
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,   /* 0xc8..0xcf */
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,   /* 0xd0..0xd7 */
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,   /* 0xd8..0xdf */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* 0xe0..0xe7 */
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,   /* 0xe8..0xef */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* 0xf0..0xf7 */
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,   /* 0xf8..0xff */
};


/*
** Append the UTF-8 encoding of the value c to the buffer at zOut and
** advance zOut past the bytes written.  Between 1 and 4 bytes are
** written depending on the magnitude of c; values of 0x10000 and above
** always produce 4 bytes built from the low 21 bits of c.
**
** zOut must be a modifiable byte pointer.  c is evaluated several times
** and therefore must not have side effects.  The macro expands to a
** single statement, so it is safe inside an unbraced if/else.
*/
#define WRITE_UTF8(zOut, c) do {                         \
  if( (c)<0x00080 ){                                     \
    *(zOut)++ = ((c)&0xFF);                              \
  }                                                      \
  else if( (c)<0x00800 ){                                \
    *(zOut)++ = 0xC0 + (((c)>>6)&0x1F);                  \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                     \
  }                                                      \
  else if( (c)<0x10000 ){                                \
    *(zOut)++ = 0xE0 + (((c)>>12)&0x0F);                 \
    *(zOut)++ = 0x80 + (((c)>>6) & 0x3F);                \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                     \
  }else{                                                 \
    *(zOut)++ = 0xF0 + (((c)>>18) & 0x07);               \
    *(zOut)++ = 0x80 + (((c)>>12) & 0x3F);               \
    *(zOut)++ = 0x80 + (((c)>>6) & 0x3F);                \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                     \
  }                                                      \
} while(0)

/*
** Append the little-endian UTF-16 encoding of the value c to the buffer
** at zOut and advance zOut past the bytes written.  Values up to 0xFFFF
** are written as one 16-bit unit (2 bytes); larger values are written as
** a surrogate pair (4 bytes).
**
** zOut must be a modifiable byte pointer.  c is evaluated several times
** and therefore must not have side effects.  The macro expands to a
** single statement, so it is safe inside an unbraced if/else.
*/
#define WRITE_UTF16LE(zOut, c) do {                                       \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = ((c)&0x00FF);                                             \
    *(zOut)++ = (((c)>>8)&0x00FF);                                        \
  }else{                                                                  \
    *(zOut)++ = ((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));      \
    *(zOut)++ = (0x00D8 + ((((c)-0x10000)>>18)&0x03));                    \
    *(zOut)++ = ((c)&0x00FF);                                             \
    *(zOut)++ = (0x00DC + (((c)>>8)&0x03));                               \
  }                                                                       \
} while(0)

/*
** Append the big-endian UTF-16 encoding of the value c to the buffer
** at zOut and advance zOut past the bytes written.  Values up to 0xFFFF
** are written as one 16-bit unit (2 bytes); larger values are written as
** a surrogate pair (4 bytes).
**
** zOut must be a modifiable byte pointer.  c is evaluated several times
** and therefore must not have side effects.  The macro expands to a
** single statement, so it is safe inside an unbraced if/else.
*/
#define WRITE_UTF16BE(zOut, c) do {                                       \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (((c)>>8)&0x00FF);                                        \
    *(zOut)++ = ((c)&0x00FF);                                             \
  }else{                                                                  \
    *(zOut)++ = (0x00D8 + ((((c)-0x10000)>>18)&0x03));                    \
    *(zOut)++ = ((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));      \
    *(zOut)++ = (0x00DC + (((c)>>8)&0x03));                               \
    *(zOut)++ = ((c)&0x00FF);                                             \
  }                                                                       \
} while(0)

/*
** Read one character from the little-endian UTF-16 text at zIn, store
** its value in c and advance zIn past the bytes consumed (2 or 4).
**
** If the first 16-bit unit lies in 0xD800..0xDFFF it is treated as the
** first half of a surrogate pair and a second unit is read
** unconditionally; neither the value of the second unit nor the amount
** of remaining input is checked, so the caller must guarantee that
** 4 bytes are readable in that case.
**
** Every byte is converted to unsigned char before it is combined, so the
** result does not depend on whether zIn points at char, signed char or
** unsigned char.  c must be an lvalue; it is evaluated several times.
** The macro expands to a single statement.
*/
#define READ_UTF16LE(zIn, c) do {                                           \
  (c) = (unsigned char)(*(zIn)++);                                          \
  (c) += ((unsigned char)(*(zIn)++))<<8;                                    \
  if( (c)>=0xD800 && (c)<0xE000 ){                                          \
    int c2 = (unsigned char)(*(zIn)++);                                     \
    c2 += ((unsigned char)(*(zIn)++))<<8;                                   \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);   \
    if( ((c) & 0xFFFF0000)==0 ) (c) = 0xFFFD;                               \
  }                                                                         \
} while(0)

/*
** Read one character from the big-endian UTF-16 text at zIn, store
** its value in c and advance zIn past the bytes consumed (2 or 4).
**
** If the first 16-bit unit lies in 0xD800..0xDFFF it is treated as the
** first half of a surrogate pair and a second unit is read
** unconditionally; neither the value of the second unit nor the amount
** of remaining input is checked, so the caller must guarantee that
** 4 bytes are readable in that case.
**
** Every byte is converted to unsigned char before it is combined, so the
** result does not depend on whether zIn points at char, signed char or
** unsigned char.  c must be an lvalue; it is evaluated several times.
** The macro expands to a single statement.
*/
#define READ_UTF16BE(zIn, c) do {                                           \
  (c) = ((unsigned char)(*(zIn)++))<<8;                                     \
  (c) += (unsigned char)(*(zIn)++);                                         \
  if( (c)>=0xD800 && (c)<0xE000 ){                                          \
    int c2 = ((unsigned char)(*(zIn)++))<<8;                                \
    c2 += (unsigned char)(*(zIn)++);                                        \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);   \
    if( ((c) & 0xFFFF0000)==0 ) (c) = 0xFFFD;                               \
  }                                                                         \
} while(0)

130/*
drh66150952007-07-23 19:12:41 +0000131** Translate a single UTF-8 character. Return the unicode value.
132**
133** During translation, assume that the byte that zTerm points
134** is a 0x00.
135**
136** Write a pointer to the next unread byte back into *pzNext.
137**
138** Notes On Invalid UTF-8:
139**
140** * This routine never allows a 7-bit character (0x00 through 0x7f) to
141** be encoded as a multi-byte character. Any multi-byte character that
142** attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
143**
144** * This routine never allows a UTF16 surrogate value to be encoded.
145** If a multi-byte character attempts to encode a value between
146** 0xd800 and 0xe000 then it is rendered as 0xfffd.
147**
148** * Bytes in the range of 0x80 through 0xbf which occur as the first
149** byte of a character are interpreted as single-byte characters
150** and rendered as themselves even though they are technically
151** invalid characters.
152**
153** * This routine accepts an infinite number of different UTF8 encodings
154** for unicode values 0x80 and greater. It do not change over-length
155** encodings to 0xfffd as some systems recommend.
156*/
157int sqlite3Utf8Read(
158 const unsigned char *z, /* First byte of UTF-8 character */
159 const unsigned char *zTerm, /* Pretend this byte is 0x00 */
160 const unsigned char **pzNext /* Write first byte past UTF-8 char here */
161){
162 int c = *(z++);
163 if( c>=0xc0 ){
164 c = sqlite3UtfTrans1[c-0xc0];
165 while( z!=zTerm && (*z & 0xc0)==0x80 ){
166 c = (c<<6) + (0x3f & *(z++));
167 }
168 if( c<0x80
169 || (c&0xFFFFF800)==0xD800
170 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; }
171 }
172 *pzNext = z;
173 return c;
174}
175
176
177
/*
** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
*/
/* #define TRANSLATE_TRACE 1 */

#ifndef SQLITE_OMIT_UTF16
danielk1977bfd6cce2004-06-18 04:24:54 +0000185/*
186** This routine transforms the internal text encoding used by pMem to
187** desiredEnc. It is an error if the string is already of the desired
188** encoding, or if *pMem does not contain a string value.
189*/
drh17435752007-08-16 04:30:38 +0000190int sqlite3VdbeMemTranslate(sqlite3 *db, Mem *pMem, u8 desiredEnc){
danielk1977bfd6cce2004-06-18 04:24:54 +0000191 unsigned char zShort[NBFS]; /* Temporary short output buffer */
192 int len; /* Maximum length of output string in bytes */
193 unsigned char *zOut; /* Output buffer */
194 unsigned char *zIn; /* Input iterator */
195 unsigned char *zTerm; /* End of input */
196 unsigned char *z; /* Output iterator */
drha39f4c52006-10-04 15:23:21 +0000197 unsigned int c;
danielk1977bfd6cce2004-06-18 04:24:54 +0000198
199 assert( pMem->flags&MEM_Str );
200 assert( pMem->enc!=desiredEnc );
201 assert( pMem->enc!=0 );
202 assert( pMem->n>=0 );
203
danielk1977b5402fb2005-01-12 07:15:04 +0000204#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
danielk1977bfd6cce2004-06-18 04:24:54 +0000205 {
206 char zBuf[100];
drh74161702006-02-24 02:53:49 +0000207 sqlite3VdbeMemPrettyPrint(pMem, zBuf);
danielk1977bfd6cce2004-06-18 04:24:54 +0000208 fprintf(stderr, "INPUT: %s\n", zBuf);
danielk1977ad7dd422004-06-06 12:41:49 +0000209 }
210#endif
211
danielk1977bfd6cce2004-06-18 04:24:54 +0000212 /* If the translation is between UTF-16 little and big endian, then
213 ** all that is required is to swap the byte order. This case is handled
214 ** differently from the others.
danielk1977998b56c2004-05-06 23:37:52 +0000215 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000216 if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
217 u8 temp;
drh71c697e2004-08-08 23:39:19 +0000218 int rc;
219 rc = sqlite3VdbeMemMakeWriteable(pMem);
220 if( rc!=SQLITE_OK ){
221 assert( rc==SQLITE_NOMEM );
222 return SQLITE_NOMEM;
223 }
drh2646da72005-12-09 20:02:05 +0000224 zIn = (u8*)pMem->z;
danielk1977bfd6cce2004-06-18 04:24:54 +0000225 zTerm = &zIn[pMem->n];
226 while( zIn<zTerm ){
227 temp = *zIn;
228 *zIn = *(zIn+1);
229 zIn++;
230 *zIn++ = temp;
231 }
232 pMem->enc = desiredEnc;
233 goto translate_out;
234 }
235
danielk1977d7e69642004-06-23 00:23:49 +0000236 /* Set len to the maximum number of bytes required in the output buffer. */
237 if( desiredEnc==SQLITE_UTF8 ){
238 /* When converting from UTF-16, the maximum growth results from
drha49b8612006-04-16 12:05:03 +0000239 ** translating a 2-byte character to a 4-byte UTF-8 character.
240 ** A single byte is required for the output string
danielk1977d7e69642004-06-23 00:23:49 +0000241 ** nul-terminator.
242 */
drha49b8612006-04-16 12:05:03 +0000243 len = pMem->n * 2 + 1;
danielk1977d7e69642004-06-23 00:23:49 +0000244 }else{
245 /* When converting from UTF-8 to UTF-16 the maximum growth is caused
246 ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
247 ** character. Two bytes are required in the output buffer for the
248 ** nul-terminator.
249 */
250 len = pMem->n * 2 + 2;
251 }
252
danielk1977bfd6cce2004-06-18 04:24:54 +0000253 /* Set zIn to point at the start of the input buffer and zTerm to point 1
254 ** byte past the end.
255 **
256 ** Variable zOut is set to point at the output buffer. This may be space
drh17435752007-08-16 04:30:38 +0000257 ** obtained from sqlite3_malloc(), or Mem.zShort, if it large enough and
258 ** not in use, or the zShort array on the stack (see above).
danielk1977bfd6cce2004-06-18 04:24:54 +0000259 */
drh2646da72005-12-09 20:02:05 +0000260 zIn = (u8*)pMem->z;
danielk1977bfd6cce2004-06-18 04:24:54 +0000261 zTerm = &zIn[pMem->n];
danielk1977bfd6cce2004-06-18 04:24:54 +0000262 if( len>NBFS ){
drh17435752007-08-16 04:30:38 +0000263 zOut = sqlite3DbMallocRaw(db, len);
danielk1977bfd6cce2004-06-18 04:24:54 +0000264 if( !zOut ) return SQLITE_NOMEM;
265 }else{
danielk19771ba1b552004-06-23 13:46:32 +0000266 zOut = zShort;
danielk1977bfd6cce2004-06-18 04:24:54 +0000267 }
268 z = zOut;
269
270 if( pMem->enc==SQLITE_UTF8 ){
271 if( desiredEnc==SQLITE_UTF16LE ){
272 /* UTF-8 -> UTF-16 Little-endian */
273 while( zIn<zTerm ){
drh66150952007-07-23 19:12:41 +0000274 c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
danielk1977bfd6cce2004-06-18 04:24:54 +0000275 WRITE_UTF16LE(z, c);
276 }
drhb8dd3152004-09-24 23:20:51 +0000277 }else{
278 assert( desiredEnc==SQLITE_UTF16BE );
danielk1977bfd6cce2004-06-18 04:24:54 +0000279 /* UTF-8 -> UTF-16 Big-endian */
280 while( zIn<zTerm ){
drh66150952007-07-23 19:12:41 +0000281 c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
danielk1977bfd6cce2004-06-18 04:24:54 +0000282 WRITE_UTF16BE(z, c);
283 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000284 }
drhb8dd3152004-09-24 23:20:51 +0000285 pMem->n = z - zOut;
286 *z++ = 0;
danielk1977bfd6cce2004-06-18 04:24:54 +0000287 }else{
288 assert( desiredEnc==SQLITE_UTF8 );
289 if( pMem->enc==SQLITE_UTF16LE ){
290 /* UTF-16 Little-endian -> UTF-8 */
291 while( zIn<zTerm ){
292 READ_UTF16LE(zIn, c);
293 WRITE_UTF8(z, c);
294 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000295 }else{
296 /* UTF-16 Little-endian -> UTF-8 */
297 while( zIn<zTerm ){
298 READ_UTF16BE(zIn, c);
299 WRITE_UTF8(z, c);
300 }
danielk1977998b56c2004-05-06 23:37:52 +0000301 }
drhb8dd3152004-09-24 23:20:51 +0000302 pMem->n = z - zOut;
danielk1977998b56c2004-05-06 23:37:52 +0000303 }
drhb8dd3152004-09-24 23:20:51 +0000304 *z = 0;
danielk1977d7e69642004-06-23 00:23:49 +0000305 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
danielk1977998b56c2004-05-06 23:37:52 +0000306
danielk1977bfd6cce2004-06-18 04:24:54 +0000307 sqlite3VdbeMemRelease(pMem);
308 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short);
309 pMem->enc = desiredEnc;
danielk19771ba1b552004-06-23 13:46:32 +0000310 if( zOut==zShort ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000311 memcpy(pMem->zShort, zOut, len);
drh2646da72005-12-09 20:02:05 +0000312 zOut = (u8*)pMem->zShort;
danielk1977bfd6cce2004-06-18 04:24:54 +0000313 pMem->flags |= (MEM_Term|MEM_Short);
314 }else{
315 pMem->flags |= (MEM_Term|MEM_Dyn);
316 }
drh2646da72005-12-09 20:02:05 +0000317 pMem->z = (char*)zOut;
danielk1977bfd6cce2004-06-18 04:24:54 +0000318
319translate_out:
danielk1977b5402fb2005-01-12 07:15:04 +0000320#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
danielk1977bfd6cce2004-06-18 04:24:54 +0000321 {
322 char zBuf[100];
drh74161702006-02-24 02:53:49 +0000323 sqlite3VdbeMemPrettyPrint(pMem, zBuf);
danielk1977bfd6cce2004-06-18 04:24:54 +0000324 fprintf(stderr, "OUTPUT: %s\n", zBuf);
325 }
326#endif
327 return SQLITE_OK;
danielk1977998b56c2004-05-06 23:37:52 +0000328}
329
danielk197793d46752004-05-23 13:30:58 +0000330/*
danielk1977bfd6cce2004-06-18 04:24:54 +0000331** This routine checks for a byte-order mark at the beginning of the
332** UTF-16 string stored in *pMem. If one is present, it is removed and
333** the encoding of the Mem adjusted. This routine does not do any
334** byte-swapping, it just sets Mem.enc appropriately.
335**
336** The allocation (static, dynamic etc.) and encoding of the Mem may be
337** changed by this function.
danielk197793d46752004-05-23 13:30:58 +0000338*/
danielk1977bfd6cce2004-06-18 04:24:54 +0000339int sqlite3VdbeMemHandleBom(Mem *pMem){
340 int rc = SQLITE_OK;
341 u8 bom = 0;
342
343 if( pMem->n<0 || pMem->n>1 ){
344 u8 b1 = *(u8 *)pMem->z;
345 u8 b2 = *(((u8 *)pMem->z) + 1);
danielk197793d46752004-05-23 13:30:58 +0000346 if( b1==0xFE && b2==0xFF ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000347 bom = SQLITE_UTF16BE;
danielk197793d46752004-05-23 13:30:58 +0000348 }
349 if( b1==0xFF && b2==0xFE ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000350 bom = SQLITE_UTF16LE;
danielk197793d46752004-05-23 13:30:58 +0000351 }
352 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000353
354 if( bom ){
danielk19771ba1b552004-06-23 13:46:32 +0000355 /* This function is called as soon as a string is stored in a Mem*,
356 ** from within sqlite3VdbeMemSetStr(). At that point it is not possible
357 ** for the string to be stored in Mem.zShort, or for it to be stored
358 ** in dynamic memory with no destructor.
359 */
360 assert( !(pMem->flags&MEM_Short) );
361 assert( !(pMem->flags&MEM_Dyn) || pMem->xDel );
362 if( pMem->flags & MEM_Dyn ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000363 void (*xDel)(void*) = pMem->xDel;
364 char *z = pMem->z;
365 pMem->z = 0;
366 pMem->xDel = 0;
367 rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT);
danielk19771ba1b552004-06-23 13:46:32 +0000368 xDel(z);
danielk1977bfd6cce2004-06-18 04:24:54 +0000369 }else{
370 rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom,
371 SQLITE_TRANSIENT);
372 }
danielk1977998b56c2004-05-06 23:37:52 +0000373 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000374 return rc;
danielk1977998b56c2004-05-06 23:37:52 +0000375}
#endif /* SQLITE_OMIT_UTF16 */

378/*
danielk19776622cce2004-05-20 11:00:52 +0000379** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
380** return the number of unicode characters in pZ up to (but not including)
381** the first 0x00 byte. If nByte is not less than zero, return the
382** number of unicode characters in the first nByte of pZ (or up to
383** the first 0x00, whichever comes first).
danielk1977998b56c2004-05-06 23:37:52 +0000384*/
drh4a919112007-05-15 11:55:09 +0000385int sqlite3Utf8CharLen(const char *zIn, int nByte){
danielk1977bfd6cce2004-06-18 04:24:54 +0000386 int r = 0;
drh4a919112007-05-15 11:55:09 +0000387 const u8 *z = (const u8*)zIn;
388 const u8 *zTerm;
danielk19771ba1b552004-06-23 13:46:32 +0000389 if( nByte>=0 ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000390 zTerm = &z[nByte];
391 }else{
drh4a919112007-05-15 11:55:09 +0000392 zTerm = (const u8*)(-1);
danielk1977998b56c2004-05-06 23:37:52 +0000393 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000394 assert( z<=zTerm );
395 while( *z!=0 && z<zTerm ){
drh4a919112007-05-15 11:55:09 +0000396 SQLITE_SKIP_UTF8(z);
danielk1977bfd6cce2004-06-18 04:24:54 +0000397 r++;
398 }
399 return r;
danielk19776622cce2004-05-20 11:00:52 +0000400}
401
#ifndef SQLITE_OMIT_UTF16
danielk19776622cce2004-05-20 11:00:52 +0000403/*
drhaf9a7c22005-12-15 03:04:10 +0000404** Convert a UTF-16 string in the native encoding into a UTF-8 string.
drh17435752007-08-16 04:30:38 +0000405** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must
406** be freed by the calling function.
drhaf9a7c22005-12-15 03:04:10 +0000407**
408** NULL is returned if there is an allocation error.
409*/
drh17435752007-08-16 04:30:38 +0000410char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte){
drhaf9a7c22005-12-15 03:04:10 +0000411 Mem m;
412 memset(&m, 0, sizeof(m));
drh17435752007-08-16 04:30:38 +0000413 sqlite3VdbeMemSetStr(db, &m, z, nByte, SQLITE_UTF16NATIVE, SQLITE_STATIC);
414 sqlite3VdbeChangeEncoding(db, &m, SQLITE_UTF8);
415 assert( (m.flags & MEM_Term)!=0 || db->mallocFailed );
416 assert( (m.flags & MEM_Str)!=0 || db->mallocFailed );
417 return (m.flags & MEM_Dyn)!=0 ? m.z : sqlite3DbStrDup(db, m.z);
drhaf9a7c22005-12-15 03:04:10 +0000418}
419
420/*
danielk19776622cce2004-05-20 11:00:52 +0000421** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
422** return the number of bytes up to (but not including), the first pair
423** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
424** then return the number of bytes in the first nChar unicode characters
425** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
426*/
drhee858132007-05-08 20:37:38 +0000427int sqlite3Utf16ByteLen(const void *zIn, int nChar){
drha39f4c52006-10-04 15:23:21 +0000428 unsigned int c = 1;
danielk1977bfd6cce2004-06-18 04:24:54 +0000429 char const *z = zIn;
430 int n = 0;
431 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
danielk1977161fb792006-01-24 10:58:21 +0000432 /* Using an "if (SQLITE_UTF16NATIVE==SQLITE_UTF16BE)" construct here
433 ** and in other parts of this file means that at one branch will
434 ** not be covered by coverage testing on any single host. But coverage
435 ** will be complete if the tests are run on both a little-endian and
436 ** big-endian host. Because both the UTF16NATIVE and SQLITE_UTF16BE
437 ** macros are constant at compile time the compiler can determine
438 ** which branch will be followed. It is therefore assumed that no runtime
439 ** penalty is paid for this "if" statement.
440 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000441 while( c && ((nChar<0) || n<nChar) ){
442 READ_UTF16BE(z, c);
443 n++;
danielk19776622cce2004-05-20 11:00:52 +0000444 }
danielk19776622cce2004-05-20 11:00:52 +0000445 }else{
danielk1977bfd6cce2004-06-18 04:24:54 +0000446 while( c && ((nChar<0) || n<nChar) ){
447 READ_UTF16LE(z, c);
448 n++;
danielk19776622cce2004-05-20 11:00:52 +0000449 }
danielk19776622cce2004-05-20 11:00:52 +0000450 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000451 return (z-(char const *)zIn)-((c==0)?2:0);
danielk1977998b56c2004-05-06 23:37:52 +0000452}
453
#if defined(SQLITE_TEST)
/*
** Translate UTF-8 to UTF-8.
**
** This has the effect of making sure that the string is well-formed
** UTF-8.  Miscoded characters are removed.
**
** zIn must be nul-terminated.  The translation is done in-place and the
** length of the result in bytes, not counting the nul terminator, is
** returned.
**
** A character is removed when sqlite3Utf8Read() reports it as 0xfffd, or
** when its canonical encoding would occupy more bytes than the input
** sequence it was read from (for example a stray byte in the range
** 0x80..0xbf, which is read as a 1-byte character but would be written
** as 2 bytes).  The second rule guarantees that the output never
** overtakes the input, so unread input is never clobbered and the final
** terminator is never written past the original one.
*/
int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char *zOut = zIn;
  unsigned char *zStart = zIn;
  unsigned char *zTerm = zIn;    /* Will point at the nul terminator */
  unsigned char *zChar;          /* Start of the character being read */
  int nOut;                      /* Bytes needed to re-encode c */
  u32 c;

  while( zTerm[0] ) zTerm++;
  while( zIn[0] ){
    zChar = zIn;
    c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
    if( c==0xfffd ) continue;
    nOut = c<0x80 ? 1 : (c<0x800 ? 2 : (c<0x10000 ? 3 : 4));
    if( nOut>(zIn - zChar) ) continue;
    WRITE_UTF8(zOut, c);
  }
  *zOut = 0;
  return (int)(zOut - zStart);
}
#endif

#if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
** It checks that the primitives for serializing and deserializing
** characters in each encoding are inverses of each other.
**
** Every value from 0 up to (but not including) 0x110000 is written with
** WRITE_UTF8 and read back with sqlite3Utf8Read(); surrogate values and
** 0xFFFE/0xFFFF are expected to read back as 0xFFFD.  The same round trip
** is then done for UTF-16LE and UTF-16BE, skipping the surrogate range.
** All checks are assert()s, so nothing is verified when assert() is
** compiled out.
*/
void sqlite3UtfSelfTest(void){
  unsigned int i, t;
  unsigned char zBuf[20];
  unsigned char *z;
  unsigned char *zTerm;
  int n;
  unsigned int c;

  /* UTF-8 round trip */
  for(i=0; i<0x00110000; i++){
    z = zBuf;
    WRITE_UTF8(z, i);
    n = z-zBuf;
    z[0] = 0;
    zTerm = z;
    z = zBuf;
    c = sqlite3Utf8Read(z, zTerm, (const u8**)&z);
    t = i;
    if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
    if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
    assert( c==t );
    assert( (z-zBuf)==n );
  }

  /* UTF-16 little-endian round trip */
  for(i=0; i<0x00110000; i++){
    if( i>=0xD800 && i<0xE000 ) continue;
    z = zBuf;
    WRITE_UTF16LE(z, i);
    n = z-zBuf;
    z[0] = 0;
    z = zBuf;
    READ_UTF16LE(z, c);
    assert( c==i );
    assert( (z-zBuf)==n );
  }

  /* UTF-16 big-endian round trip */
  for(i=0; i<0x00110000; i++){
    if( i>=0xD800 && i<0xE000 ) continue;
    z = zBuf;
    WRITE_UTF16BE(z, i);
    n = z-zBuf;
    z[0] = 0;
    z = zBuf;
    READ_UTF16BE(z, c);
    assert( c==i );
    assert( (z-zBuf)==n );
  }
}
#endif /* SQLITE_TEST */
#endif /* SQLITE_OMIT_UTF16 */