blob: 5b23d37fa5ceafe1c14f71aef9e19a0ec2d3e19c [file] [log] [blame]
drha5d14fe2004-05-04 15:00:46 +00001/*
2** 2004 April 13
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This file contains routines used to translate between UTF-8,
13** UTF-16, UTF-16BE, and UTF-16LE.
14**
drhbbf695d2008-11-07 03:29:33 +000015** $Id: utf.c,v 1.66 2008/11/07 03:29:34 drh Exp $
drha5d14fe2004-05-04 15:00:46 +000016**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
**
**
** Notes on UTF-16:  (with wwww+1==uuuuu)
**
**       Word-0              Word-1          Value
**  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
drha5d14fe2004-05-04 15:00:46 +000031**
danielk1977998b56c2004-05-06 23:37:52 +000032**
drha5d14fe2004-05-04 15:00:46 +000033** BOM or Byte Order Mark:
34** 0xff 0xfe little-endian utf-16 follows
35** 0xfe 0xff big-endian utf-16 follows
danielk1977998b56c2004-05-06 23:37:52 +000036**
drha5d14fe2004-05-04 15:00:46 +000037*/
danielk1977998b56c2004-05-06 23:37:52 +000038#include "sqliteInt.h"
drhb659e9b2005-01-28 01:29:08 +000039#include <assert.h>
danielk1977bfd6cce2004-06-18 04:24:54 +000040#include "vdbeInt.h"
danielk1977998b56c2004-05-06 23:37:52 +000041
/*
** An integer constant holding the value 1.  The SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros make use of this object.
*/
const int sqlite3one = 1;
47
/*
** Lookup table used by READ_UTF8 to decode the first byte of a
** multi-byte UTF-8 character.  The table is indexed by (byte - 0xc0)
** and yields the payload bits carried by that lead byte:
**
**    0xc0..0xdf   low 5 bits
**    0xe0..0xef   low 4 bits
**    0xf0..0xf7   low 3 bits
**    0xf8..0xfb   low 2 bits
**    0xfc..0xfd   low 1 bit
**    0xfe..0xff   zero
*/
static const unsigned char sqlite3UtfTrans1[] = {
  /* 0xc0..0xcf */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0xd0..0xdf */
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  /* 0xe0..0xef */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0xf0..0xf7 */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* 0xf8..0xfb, 0xfc..0xfd, 0xfe..0xff */
  0x00, 0x01, 0x02, 0x03,   0x00, 0x01,   0x00, 0x00,
};
62
drh66150952007-07-23 19:12:41 +000063
/*
** Append the UTF-8 encoding of the unicode value c to the buffer that
** zOut points into, advancing zOut past the bytes written.
**
**   c <  0x80      1 byte
**   c <  0x800     2 bytes
**   c <  0x10000   3 bytes
**   otherwise      4 bytes (only the low 21 bits of c are encoded)
**
** zOut must be a modifiable pointer lvalue.  c is evaluated more than
** once and therefore must not have side effects.  Both arguments are
** parenthesized in the expansion so that an expression such as (a|b)
** may be passed for c, and the body is wrapped in do{}while(0) so the
** macro behaves as a single statement.
*/
#define WRITE_UTF8(zOut, c) do {                       \
  if( (c)<0x00080 ){                                   \
    *(zOut)++ = ((c)&0xFF);                            \
  }                                                    \
  else if( (c)<0x00800 ){                              \
    *(zOut)++ = 0xC0 + (((c)>>6)&0x1F);                \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                   \
  }                                                    \
  else if( (c)<0x10000 ){                              \
    *(zOut)++ = 0xE0 + (((c)>>12)&0x0F);               \
    *(zOut)++ = 0x80 + (((c)>>6) & 0x3F);              \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                   \
  }else{                                               \
    *(zOut)++ = 0xF0 + (((c)>>18) & 0x07);             \
    *(zOut)++ = 0x80 + (((c)>>12) & 0x3F);             \
    *(zOut)++ = 0x80 + (((c)>>6) & 0x3F);              \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                   \
  }                                                    \
} while(0)
83
/*
** Append the little-endian UTF-16 encoding of the unicode value c to the
** buffer that zOut points into, advancing zOut past the bytes written.
** Values up to 0xFFFF are written as one 16-bit word (2 bytes); larger
** values are written as a surrogate pair (4 bytes).
**
** zOut must be a modifiable pointer lvalue.  c is evaluated more than
** once and must not have side effects.  Arguments are parenthesized and
** the body is wrapped in do{}while(0) so the macro is a single statement.
*/
#define WRITE_UTF16LE(zOut, c) do {                                      \
  if( (c)<=0xFFFF ){                                                     \
    *(zOut)++ = ((c)&0x00FF);                                            \
    *(zOut)++ = (((c)>>8)&0x00FF);                                       \
  }else{                                                                 \
    *(zOut)++ = ((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));     \
    *(zOut)++ = (0x00D8 + ((((c)-0x10000)>>18)&0x03));                   \
    *(zOut)++ = ((c)&0x00FF);                                            \
    *(zOut)++ = (0x00DC + (((c)>>8)&0x03));                              \
  }                                                                      \
} while(0)
95
/*
** Append the big-endian UTF-16 encoding of the unicode value c to the
** buffer that zOut points into, advancing zOut past the bytes written.
** Values up to 0xFFFF are written as one 16-bit word (2 bytes); larger
** values are written as a surrogate pair (4 bytes).
**
** zOut must be a modifiable pointer lvalue.  c is evaluated more than
** once and must not have side effects.  Arguments are parenthesized and
** the body is wrapped in do{}while(0) so the macro is a single statement.
*/
#define WRITE_UTF16BE(zOut, c) do {                                      \
  if( (c)<=0xFFFF ){                                                     \
    *(zOut)++ = (((c)>>8)&0x00FF);                                       \
    *(zOut)++ = ((c)&0x00FF);                                            \
  }else{                                                                 \
    *(zOut)++ = (0x00D8 + ((((c)-0x10000)>>18)&0x03));                   \
    *(zOut)++ = ((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));     \
    *(zOut)++ = (0x00DC + (((c)>>8)&0x03));                              \
    *(zOut)++ = ((c)&0x00FF);                                            \
  }                                                                      \
} while(0)
107
/*
** Read one little-endian UTF-16 character starting at zIn, store its
** unicode value in c, and advance zIn past the bytes consumed.
**
** If the first 16-bit word lies in the range 0xD800..0xDFFF it is taken
** to be the first half of a surrogate pair and a second word is read
** unconditionally; the caller must make sure those two extra bytes are
** readable.  As a defensive measure the combined value is replaced by
** 0xFFFD if it turns out to be below 0x10000.
**
** Every byte is masked with 0xFF so that the result is the same whether
** zIn points to signed or unsigned char.  zIn must be a modifiable
** pointer lvalue and c an integer lvalue; the body is wrapped in
** do{}while(0) so the macro behaves as a single statement.
*/
#define READ_UTF16LE(zIn, c) do {                                        \
  (c) = (*(zIn)++) & 0xFF;                                               \
  (c) += ((*(zIn)++) & 0xFF)<<8;                                         \
  if( (c)>=0xD800 && (c)<0xE000 ){                                       \
    int iWord2_ = (*(zIn)++) & 0xFF;                                     \
    iWord2_ += ((*(zIn)++) & 0xFF)<<8;                                   \
    (c) = (iWord2_&0x03FF) + (((c)&0x003F)<<10)                          \
              + ((((c)&0x03C0)+0x0040)<<10);                             \
    if( ((c) & 0xFFFF0000)==0 ) (c) = 0xFFFD;                            \
  }                                                                      \
} while(0)
118
/*
** Read one big-endian UTF-16 character starting at zIn, store its
** unicode value in c, and advance zIn past the bytes consumed.
**
** If the first 16-bit word lies in the range 0xD800..0xDFFF it is taken
** to be the first half of a surrogate pair and a second word is read
** unconditionally; the caller must make sure those two extra bytes are
** readable.  As a defensive measure the combined value is replaced by
** 0xFFFD if it turns out to be below 0x10000.
**
** Every byte is masked with 0xFF so that the result is the same whether
** zIn points to signed or unsigned char.  zIn must be a modifiable
** pointer lvalue and c an integer lvalue; the body is wrapped in
** do{}while(0) so the macro behaves as a single statement.
*/
#define READ_UTF16BE(zIn, c) do {                                        \
  (c) = ((*(zIn)++) & 0xFF)<<8;                                          \
  (c) += (*(zIn)++) & 0xFF;                                              \
  if( (c)>=0xD800 && (c)<0xE000 ){                                       \
    int iWord2_ = ((*(zIn)++) & 0xFF)<<8;                                \
    iWord2_ += (*(zIn)++) & 0xFF;                                        \
    (c) = (iWord2_&0x03FF) + (((c)&0x003F)<<10)                          \
              + ((((c)&0x03C0)+0x0040)<<10);                             \
    if( ((c) & 0xFFFF0000)==0 ) (c) = 0xFFFD;                            \
  }                                                                      \
} while(0)
129
130/*
drh66150952007-07-23 19:12:41 +0000131** Translate a single UTF-8 character. Return the unicode value.
132**
133** During translation, assume that the byte that zTerm points
134** is a 0x00.
135**
136** Write a pointer to the next unread byte back into *pzNext.
137**
138** Notes On Invalid UTF-8:
139**
140** * This routine never allows a 7-bit character (0x00 through 0x7f) to
141** be encoded as a multi-byte character. Any multi-byte character that
142** attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
143**
144** * This routine never allows a UTF16 surrogate value to be encoded.
145** If a multi-byte character attempts to encode a value between
146** 0xd800 and 0xe000 then it is rendered as 0xfffd.
147**
148** * Bytes in the range of 0x80 through 0xbf which occur as the first
149** byte of a character are interpreted as single-byte characters
150** and rendered as themselves even though they are technically
151** invalid characters.
152**
153** * This routine accepts an infinite number of different UTF8 encodings
154** for unicode values 0x80 and greater. It do not change over-length
155** encodings to 0xfffd as some systems recommend.
156*/
danielk1977ad76a81e2008-07-29 11:25:14 +0000157#define READ_UTF8(zIn, zTerm, c) \
158 c = *(zIn++); \
159 if( c>=0xc0 ){ \
160 c = sqlite3UtfTrans1[c-0xc0]; \
161 while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){ \
162 c = (c<<6) + (0x3f & *(zIn++)); \
163 } \
164 if( c<0x80 \
165 || (c&0xFFFFF800)==0xD800 \
166 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \
167 }
/*
** Function wrapper around the READ_UTF8 macro: decode the single UTF-8
** character that starts at z, write a pointer to the first byte past
** that character into *pzNext, and return the unicode value.
**
** The value is accumulated in an unsigned integer.  READ_UTF8 accepts an
** unbounded run of continuation bytes, so accumulating into a signed int
** could overflow on the left shift (undefined behavior) for malformed
** input; unsigned arithmetic simply wraps.
*/
int sqlite3Utf8Read(
  const unsigned char *z,         /* First byte of UTF-8 character */
  const unsigned char *zTerm,     /* Pretend this byte is 0x00 */
  const unsigned char **pzNext    /* Write first byte past UTF-8 char here */
){
  unsigned int c;
  READ_UTF8(z, zTerm, c);
  *pzNext = z;
  return (int)c;
}
178
179
180
danielk1977ad76a81e2008-07-29 11:25:14 +0000181
drh66150952007-07-23 19:12:41 +0000182/*
danielk1977bfd6cce2004-06-18 04:24:54 +0000183** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
184** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
185*/
186/* #define TRANSLATE_TRACE 1 */
187
drh6c626082004-11-14 21:56:29 +0000188#ifndef SQLITE_OMIT_UTF16
danielk1977bfd6cce2004-06-18 04:24:54 +0000189/*
190** This routine transforms the internal text encoding used by pMem to
191** desiredEnc. It is an error if the string is already of the desired
192** encoding, or if *pMem does not contain a string value.
193*/
drhb21c8cd2007-08-21 19:33:56 +0000194int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
danielk1977bfd6cce2004-06-18 04:24:54 +0000195 int len; /* Maximum length of output string in bytes */
196 unsigned char *zOut; /* Output buffer */
197 unsigned char *zIn; /* Input iterator */
198 unsigned char *zTerm; /* End of input */
199 unsigned char *z; /* Output iterator */
drha39f4c52006-10-04 15:23:21 +0000200 unsigned int c;
danielk1977bfd6cce2004-06-18 04:24:54 +0000201
drhb21c8cd2007-08-21 19:33:56 +0000202 assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
danielk1977bfd6cce2004-06-18 04:24:54 +0000203 assert( pMem->flags&MEM_Str );
204 assert( pMem->enc!=desiredEnc );
205 assert( pMem->enc!=0 );
206 assert( pMem->n>=0 );
207
danielk1977b5402fb2005-01-12 07:15:04 +0000208#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
danielk1977bfd6cce2004-06-18 04:24:54 +0000209 {
210 char zBuf[100];
drh74161702006-02-24 02:53:49 +0000211 sqlite3VdbeMemPrettyPrint(pMem, zBuf);
danielk1977bfd6cce2004-06-18 04:24:54 +0000212 fprintf(stderr, "INPUT: %s\n", zBuf);
danielk1977ad7dd422004-06-06 12:41:49 +0000213 }
214#endif
215
danielk1977bfd6cce2004-06-18 04:24:54 +0000216 /* If the translation is between UTF-16 little and big endian, then
217 ** all that is required is to swap the byte order. This case is handled
218 ** differently from the others.
danielk1977998b56c2004-05-06 23:37:52 +0000219 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000220 if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
221 u8 temp;
drh71c697e2004-08-08 23:39:19 +0000222 int rc;
drhb21c8cd2007-08-21 19:33:56 +0000223 rc = sqlite3VdbeMemMakeWriteable(pMem);
drh71c697e2004-08-08 23:39:19 +0000224 if( rc!=SQLITE_OK ){
225 assert( rc==SQLITE_NOMEM );
226 return SQLITE_NOMEM;
227 }
drh2646da72005-12-09 20:02:05 +0000228 zIn = (u8*)pMem->z;
drhbbf695d2008-11-07 03:29:33 +0000229 zTerm = &zIn[pMem->n&~1];
danielk1977bfd6cce2004-06-18 04:24:54 +0000230 while( zIn<zTerm ){
231 temp = *zIn;
232 *zIn = *(zIn+1);
233 zIn++;
234 *zIn++ = temp;
235 }
236 pMem->enc = desiredEnc;
237 goto translate_out;
238 }
239
danielk1977d7e69642004-06-23 00:23:49 +0000240 /* Set len to the maximum number of bytes required in the output buffer. */
241 if( desiredEnc==SQLITE_UTF8 ){
242 /* When converting from UTF-16, the maximum growth results from
drha49b8612006-04-16 12:05:03 +0000243 ** translating a 2-byte character to a 4-byte UTF-8 character.
244 ** A single byte is required for the output string
danielk1977d7e69642004-06-23 00:23:49 +0000245 ** nul-terminator.
246 */
drhbbf695d2008-11-07 03:29:33 +0000247 pMem->n &= ~1;
drha49b8612006-04-16 12:05:03 +0000248 len = pMem->n * 2 + 1;
danielk1977d7e69642004-06-23 00:23:49 +0000249 }else{
250 /* When converting from UTF-8 to UTF-16 the maximum growth is caused
251 ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
252 ** character. Two bytes are required in the output buffer for the
253 ** nul-terminator.
254 */
255 len = pMem->n * 2 + 2;
256 }
257
danielk1977bfd6cce2004-06-18 04:24:54 +0000258 /* Set zIn to point at the start of the input buffer and zTerm to point 1
259 ** byte past the end.
260 **
danielk1977a7a8e142008-02-13 18:25:27 +0000261 ** Variable zOut is set to point at the output buffer, space obtained
262 ** from sqlite3_malloc().
danielk1977bfd6cce2004-06-18 04:24:54 +0000263 */
drh2646da72005-12-09 20:02:05 +0000264 zIn = (u8*)pMem->z;
danielk1977bfd6cce2004-06-18 04:24:54 +0000265 zTerm = &zIn[pMem->n];
danielk1977a7a8e142008-02-13 18:25:27 +0000266 zOut = sqlite3DbMallocRaw(pMem->db, len);
267 if( !zOut ){
268 return SQLITE_NOMEM;
danielk1977bfd6cce2004-06-18 04:24:54 +0000269 }
270 z = zOut;
271
272 if( pMem->enc==SQLITE_UTF8 ){
273 if( desiredEnc==SQLITE_UTF16LE ){
274 /* UTF-8 -> UTF-16 Little-endian */
275 while( zIn<zTerm ){
danielk1977ad76a81e2008-07-29 11:25:14 +0000276 /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */
277 READ_UTF8(zIn, zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000278 WRITE_UTF16LE(z, c);
279 }
drhb8dd3152004-09-24 23:20:51 +0000280 }else{
281 assert( desiredEnc==SQLITE_UTF16BE );
danielk1977bfd6cce2004-06-18 04:24:54 +0000282 /* UTF-8 -> UTF-16 Big-endian */
283 while( zIn<zTerm ){
danielk1977ad76a81e2008-07-29 11:25:14 +0000284 /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */
285 READ_UTF8(zIn, zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000286 WRITE_UTF16BE(z, c);
287 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000288 }
drhb8dd3152004-09-24 23:20:51 +0000289 pMem->n = z - zOut;
290 *z++ = 0;
danielk1977bfd6cce2004-06-18 04:24:54 +0000291 }else{
292 assert( desiredEnc==SQLITE_UTF8 );
293 if( pMem->enc==SQLITE_UTF16LE ){
294 /* UTF-16 Little-endian -> UTF-8 */
295 while( zIn<zTerm ){
296 READ_UTF16LE(zIn, c);
297 WRITE_UTF8(z, c);
298 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000299 }else{
mihailim7ffb2b52008-06-27 18:59:44 +0000300 /* UTF-16 Big-endian -> UTF-8 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000301 while( zIn<zTerm ){
302 READ_UTF16BE(zIn, c);
303 WRITE_UTF8(z, c);
304 }
danielk1977998b56c2004-05-06 23:37:52 +0000305 }
drhb8dd3152004-09-24 23:20:51 +0000306 pMem->n = z - zOut;
danielk1977998b56c2004-05-06 23:37:52 +0000307 }
drhb8dd3152004-09-24 23:20:51 +0000308 *z = 0;
danielk1977d7e69642004-06-23 00:23:49 +0000309 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
danielk1977998b56c2004-05-06 23:37:52 +0000310
danielk1977bfd6cce2004-06-18 04:24:54 +0000311 sqlite3VdbeMemRelease(pMem);
danielk1977a7a8e142008-02-13 18:25:27 +0000312 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem);
danielk1977bfd6cce2004-06-18 04:24:54 +0000313 pMem->enc = desiredEnc;
danielk1977a7a8e142008-02-13 18:25:27 +0000314 pMem->flags |= (MEM_Term|MEM_Dyn);
drh2646da72005-12-09 20:02:05 +0000315 pMem->z = (char*)zOut;
danielk19775f096132008-03-28 15:44:09 +0000316 pMem->zMalloc = pMem->z;
danielk1977bfd6cce2004-06-18 04:24:54 +0000317
318translate_out:
danielk1977b5402fb2005-01-12 07:15:04 +0000319#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
danielk1977bfd6cce2004-06-18 04:24:54 +0000320 {
321 char zBuf[100];
drh74161702006-02-24 02:53:49 +0000322 sqlite3VdbeMemPrettyPrint(pMem, zBuf);
danielk1977bfd6cce2004-06-18 04:24:54 +0000323 fprintf(stderr, "OUTPUT: %s\n", zBuf);
324 }
325#endif
326 return SQLITE_OK;
danielk1977998b56c2004-05-06 23:37:52 +0000327}
328
danielk197793d46752004-05-23 13:30:58 +0000329/*
danielk1977bfd6cce2004-06-18 04:24:54 +0000330** This routine checks for a byte-order mark at the beginning of the
331** UTF-16 string stored in *pMem. If one is present, it is removed and
332** the encoding of the Mem adjusted. This routine does not do any
333** byte-swapping, it just sets Mem.enc appropriately.
334**
335** The allocation (static, dynamic etc.) and encoding of the Mem may be
336** changed by this function.
danielk197793d46752004-05-23 13:30:58 +0000337*/
drhb21c8cd2007-08-21 19:33:56 +0000338int sqlite3VdbeMemHandleBom(Mem *pMem){
danielk1977bfd6cce2004-06-18 04:24:54 +0000339 int rc = SQLITE_OK;
340 u8 bom = 0;
341
342 if( pMem->n<0 || pMem->n>1 ){
343 u8 b1 = *(u8 *)pMem->z;
344 u8 b2 = *(((u8 *)pMem->z) + 1);
danielk197793d46752004-05-23 13:30:58 +0000345 if( b1==0xFE && b2==0xFF ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000346 bom = SQLITE_UTF16BE;
danielk197793d46752004-05-23 13:30:58 +0000347 }
348 if( b1==0xFF && b2==0xFE ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000349 bom = SQLITE_UTF16LE;
danielk197793d46752004-05-23 13:30:58 +0000350 }
351 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000352
353 if( bom ){
danielk1977a7a8e142008-02-13 18:25:27 +0000354 rc = sqlite3VdbeMemMakeWriteable(pMem);
355 if( rc==SQLITE_OK ){
356 pMem->n -= 2;
357 memmove(pMem->z, &pMem->z[2], pMem->n);
358 pMem->z[pMem->n] = '\0';
359 pMem->z[pMem->n+1] = '\0';
360 pMem->flags |= MEM_Term;
361 pMem->enc = bom;
danielk1977bfd6cce2004-06-18 04:24:54 +0000362 }
danielk1977998b56c2004-05-06 23:37:52 +0000363 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000364 return rc;
danielk1977998b56c2004-05-06 23:37:52 +0000365}
drh6c626082004-11-14 21:56:29 +0000366#endif /* SQLITE_OMIT_UTF16 */
danielk1977998b56c2004-05-06 23:37:52 +0000367
368/*
danielk19776622cce2004-05-20 11:00:52 +0000369** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
370** return the number of unicode characters in pZ up to (but not including)
371** the first 0x00 byte. If nByte is not less than zero, return the
372** number of unicode characters in the first nByte of pZ (or up to
373** the first 0x00, whichever comes first).
danielk1977998b56c2004-05-06 23:37:52 +0000374*/
drh4a919112007-05-15 11:55:09 +0000375int sqlite3Utf8CharLen(const char *zIn, int nByte){
danielk1977bfd6cce2004-06-18 04:24:54 +0000376 int r = 0;
drh4a919112007-05-15 11:55:09 +0000377 const u8 *z = (const u8*)zIn;
378 const u8 *zTerm;
danielk19771ba1b552004-06-23 13:46:32 +0000379 if( nByte>=0 ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000380 zTerm = &z[nByte];
381 }else{
drh4a919112007-05-15 11:55:09 +0000382 zTerm = (const u8*)(-1);
danielk1977998b56c2004-05-06 23:37:52 +0000383 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000384 assert( z<=zTerm );
385 while( *z!=0 && z<zTerm ){
drh4a919112007-05-15 11:55:09 +0000386 SQLITE_SKIP_UTF8(z);
danielk1977bfd6cce2004-06-18 04:24:54 +0000387 r++;
388 }
389 return r;
danielk19776622cce2004-05-20 11:00:52 +0000390}
391
danielk19774152e672007-09-12 17:01:45 +0000392/* This test function is not currently used by the automated test-suite.
393** Hence it is only available in debug builds.
394*/
395#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
396/*
397** Translate UTF-8 to UTF-8.
398**
399** This has the effect of making sure that the string is well-formed
400** UTF-8. Miscoded characters are removed.
401**
402** The translation is done in-place (since it is impossible for the
403** correct UTF-8 encoding to be longer than a malformed encoding).
404*/
405int sqlite3Utf8To8(unsigned char *zIn){
406 unsigned char *zOut = zIn;
407 unsigned char *zStart = zIn;
danielk19776e891622008-08-12 14:48:40 +0000408 unsigned char *zTerm = &zIn[strlen((char *)zIn)];
danielk19774152e672007-09-12 17:01:45 +0000409 u32 c;
410
411 while( zIn[0] ){
412 c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
413 if( c!=0xfffd ){
414 WRITE_UTF8(zOut, c);
415 }
416 }
417 *zOut = 0;
418 return zOut - zStart;
419}
420#endif
421
drh6c626082004-11-14 21:56:29 +0000422#ifndef SQLITE_OMIT_UTF16
danielk19776622cce2004-05-20 11:00:52 +0000423/*
drhaf9a7c22005-12-15 03:04:10 +0000424** Convert a UTF-16 string in the native encoding into a UTF-8 string.
drh17435752007-08-16 04:30:38 +0000425** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must
426** be freed by the calling function.
drhaf9a7c22005-12-15 03:04:10 +0000427**
428** NULL is returned if there is an allocation error.
429*/
drh17435752007-08-16 04:30:38 +0000430char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte){
drhaf9a7c22005-12-15 03:04:10 +0000431 Mem m;
432 memset(&m, 0, sizeof(m));
drhb21c8cd2007-08-21 19:33:56 +0000433 m.db = db;
434 sqlite3VdbeMemSetStr(&m, z, nByte, SQLITE_UTF16NATIVE, SQLITE_STATIC);
435 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);
danielk1977ae72d982007-10-03 08:46:44 +0000436 if( db->mallocFailed ){
437 sqlite3VdbeMemRelease(&m);
438 m.z = 0;
439 }
drh17435752007-08-16 04:30:38 +0000440 assert( (m.flags & MEM_Term)!=0 || db->mallocFailed );
441 assert( (m.flags & MEM_Str)!=0 || db->mallocFailed );
442 return (m.flags & MEM_Dyn)!=0 ? m.z : sqlite3DbStrDup(db, m.z);
drhaf9a7c22005-12-15 03:04:10 +0000443}
444
445/*
danielk19776622cce2004-05-20 11:00:52 +0000446** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
447** return the number of bytes up to (but not including), the first pair
448** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
449** then return the number of bytes in the first nChar unicode characters
450** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
451*/
drhee858132007-05-08 20:37:38 +0000452int sqlite3Utf16ByteLen(const void *zIn, int nChar){
drha39f4c52006-10-04 15:23:21 +0000453 unsigned int c = 1;
danielk1977bfd6cce2004-06-18 04:24:54 +0000454 char const *z = zIn;
455 int n = 0;
456 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
danielk1977161fb792006-01-24 10:58:21 +0000457 /* Using an "if (SQLITE_UTF16NATIVE==SQLITE_UTF16BE)" construct here
458 ** and in other parts of this file means that at one branch will
459 ** not be covered by coverage testing on any single host. But coverage
460 ** will be complete if the tests are run on both a little-endian and
461 ** big-endian host. Because both the UTF16NATIVE and SQLITE_UTF16BE
462 ** macros are constant at compile time the compiler can determine
463 ** which branch will be followed. It is therefore assumed that no runtime
464 ** penalty is paid for this "if" statement.
465 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000466 while( c && ((nChar<0) || n<nChar) ){
467 READ_UTF16BE(z, c);
468 n++;
danielk19776622cce2004-05-20 11:00:52 +0000469 }
danielk19776622cce2004-05-20 11:00:52 +0000470 }else{
danielk1977bfd6cce2004-06-18 04:24:54 +0000471 while( c && ((nChar<0) || n<nChar) ){
472 READ_UTF16LE(z, c);
473 n++;
danielk19776622cce2004-05-20 11:00:52 +0000474 }
danielk19776622cce2004-05-20 11:00:52 +0000475 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000476 return (z-(char const *)zIn)-((c==0)?2:0);
danielk1977998b56c2004-05-06 23:37:52 +0000477}
478
drh53c14022007-05-10 17:23:11 +0000479#if defined(SQLITE_TEST)
480/*
danielk1977bfd6cce2004-06-18 04:24:54 +0000481** This routine is called from the TCL test function "translate_selftest".
482** It checks that the primitives for serializing and deserializing
483** characters in each encoding are inverses of each other.
484*/
danielk197744a376f2008-08-12 15:04:58 +0000485void sqlite3UtfSelfTest(void){
drhb3fa0e02006-10-19 01:58:43 +0000486 unsigned int i, t;
danielk1977bfd6cce2004-06-18 04:24:54 +0000487 unsigned char zBuf[20];
488 unsigned char *z;
drh66150952007-07-23 19:12:41 +0000489 unsigned char *zTerm;
danielk1977bfd6cce2004-06-18 04:24:54 +0000490 int n;
drha39f4c52006-10-04 15:23:21 +0000491 unsigned int c;
danielk1977bfd6cce2004-06-18 04:24:54 +0000492
danielk19771ba1b552004-06-23 13:46:32 +0000493 for(i=0; i<0x00110000; i++){
danielk1977bfd6cce2004-06-18 04:24:54 +0000494 z = zBuf;
495 WRITE_UTF8(z, i);
496 n = z-zBuf;
drh4a919112007-05-15 11:55:09 +0000497 z[0] = 0;
drh66150952007-07-23 19:12:41 +0000498 zTerm = z;
danielk1977bfd6cce2004-06-18 04:24:54 +0000499 z = zBuf;
drh66150952007-07-23 19:12:41 +0000500 c = sqlite3Utf8Read(z, zTerm, (const u8**)&z);
drhb3fa0e02006-10-19 01:58:43 +0000501 t = i;
502 if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
503 if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
504 assert( c==t );
danielk1977bfd6cce2004-06-18 04:24:54 +0000505 assert( (z-zBuf)==n );
506 }
507 for(i=0; i<0x00110000; i++){
danielk1977a9c16b02007-05-16 18:11:41 +0000508 if( i>=0xD800 && i<0xE000 ) continue;
danielk1977bfd6cce2004-06-18 04:24:54 +0000509 z = zBuf;
510 WRITE_UTF16LE(z, i);
511 n = z-zBuf;
drh4a919112007-05-15 11:55:09 +0000512 z[0] = 0;
danielk1977bfd6cce2004-06-18 04:24:54 +0000513 z = zBuf;
514 READ_UTF16LE(z, c);
515 assert( c==i );
516 assert( (z-zBuf)==n );
517 }
518 for(i=0; i<0x00110000; i++){
danielk1977a9c16b02007-05-16 18:11:41 +0000519 if( i>=0xD800 && i<0xE000 ) continue;
danielk1977bfd6cce2004-06-18 04:24:54 +0000520 z = zBuf;
521 WRITE_UTF16BE(z, i);
522 n = z-zBuf;
drh4a919112007-05-15 11:55:09 +0000523 z[0] = 0;
danielk1977bfd6cce2004-06-18 04:24:54 +0000524 z = zBuf;
525 READ_UTF16BE(z, c);
526 assert( c==i );
527 assert( (z-zBuf)==n );
528 }
529}
drh6c626082004-11-14 21:56:29 +0000530#endif /* SQLITE_TEST */
531#endif /* SQLITE_OMIT_UTF16 */