blob: d17f532cb6ce3d9c0e7e6a7ad5ff6c60b0df7062 [file] [log] [blame]
drha5d14fe2004-05-04 15:00:46 +00001/*
2** 2004 April 13
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This file contains routines used to translate between UTF-8,
13** UTF-16, UTF-16BE, and UTF-16LE.
14**
drhaa78bec2008-12-09 03:55:14 +000015** $Id: utf.c,v 1.68 2008/12/09 03:55:14 drh Exp $
drha5d14fe2004-05-04 15:00:46 +000016**
17** Notes on UTF-8:
18**
19** Byte-0 Byte-1 Byte-2 Byte-3 Value
20** 0xxxxxxx 00000000 00000000 0xxxxxxx
21** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx
22** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx
23** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx
24**
25**
26** Notes on UTF-16: (with wwww+1==uuuuu)
27**
drh51846b52004-05-28 16:00:21 +000028** Word-0 Word-1 Value
29** 110110ww wwzzzzyy 110111yy yyxxxxxx 000uuuuu zzzzyyyy yyxxxxxx
30** zzzzyyyy yyxxxxxx 00000000 zzzzyyyy yyxxxxxx
drha5d14fe2004-05-04 15:00:46 +000031**
danielk1977998b56c2004-05-06 23:37:52 +000032**
drha5d14fe2004-05-04 15:00:46 +000033** BOM or Byte Order Mark:
34** 0xff 0xfe little-endian utf-16 follows
35** 0xfe 0xff big-endian utf-16 follows
danielk1977998b56c2004-05-06 23:37:52 +000036**
drha5d14fe2004-05-04 15:00:46 +000037*/
danielk1977998b56c2004-05-06 23:37:52 +000038#include "sqliteInt.h"
drhb659e9b2005-01-28 01:29:08 +000039#include <assert.h>
danielk1977bfd6cce2004-06-18 04:24:54 +000040#include "vdbeInt.h"
danielk1977998b56c2004-05-06 23:37:52 +000041
drhb3190c12008-12-08 21:37:14 +000042#ifndef SQLITE_AMALGAMATION
/*
** sqlite3one always holds the integer value 1.  The SQLITE_BIGENDIAN
** and SQLITE_LITTLEENDIAN macros refer to this constant.
*/
const int sqlite3one = 1;
drhb3190c12008-12-08 21:37:14 +000048#endif /* SQLITE_AMALGAMATION */
drh38def052007-03-31 15:27:59 +000049
/*
** Lookup table used while decoding the first byte of a multi-byte UTF-8
** character.  The table has one entry for each lead byte 0xc0 through
** 0xff; entry [b-0xc0] is the initial character value contributed by
** lead byte b.
*/
static const unsigned char sqlite3UtfTrans1[] = {
  /* 0xc0..0xc7 */  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* 0xc8..0xcf */  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0xd0..0xd7 */  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  /* 0xd8..0xdf */  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  /* 0xe0..0xe7 */  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* 0xe8..0xef */  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0xf0..0xf7 */  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* 0xf8..0xff */  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
64
drh66150952007-07-23 19:12:41 +000065
/*
** Append the UTF-8 encoding of character c at zOut and advance zOut past
** the bytes written (between 1 and 4).
**
** zOut must be a modifiable pointer to unsigned bytes.  The c argument
** is evaluated several times and so must be free of side effects.  Any
** value of 0x10000 or more is written as four bytes, of which the lead
** byte carries only bits 18..20 of c.
**
** Both arguments are parenthesized in the expansion so that an
** expression such as "a|b" may safely be passed for c.
*/
#define WRITE_UTF8(zOut, c) {                          \
  if( (c)<0x00080 ){                                   \
    *(zOut)++ = (u8)((c)&0xFF);                        \
  }                                                    \
  else if( (c)<0x00800 ){                              \
    *(zOut)++ = 0xC0 + (u8)(((c)>>6)&0x1F);            \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);               \
  }                                                    \
  else if( (c)<0x10000 ){                              \
    *(zOut)++ = 0xE0 + (u8)(((c)>>12)&0x0F);           \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);          \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);               \
  }else{                                               \
    *(zOut)++ = 0xF0 + (u8)(((c)>>18) & 0x07);         \
    *(zOut)++ = 0x80 + (u8)(((c)>>12) & 0x3F);         \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);          \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);               \
  }                                                    \
}
85
/*
** Append the little-endian UTF-16 encoding of character c at zOut and
** advance zOut past the bytes written: two bytes when c<=0xFFFF,
** otherwise a four-byte surrogate pair.
**
** zOut must be a modifiable pointer to unsigned bytes.  The c argument
** is evaluated several times and so must be free of side effects.  Both
** arguments are parenthesized in the expansion so that an expression
** may safely be passed for c.
*/
#define WRITE_UTF16LE(zOut, c) {                                          \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                    \
  }else{                                                                  \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));  \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                           \
  }                                                                       \
}
97
/*
** Append the big-endian UTF-16 encoding of character c at zOut and
** advance zOut past the bytes written: two bytes when c<=0xFFFF,
** otherwise a four-byte surrogate pair.
**
** zOut must be a modifiable pointer to unsigned bytes.  The c argument
** is evaluated several times and so must be free of side effects.  Both
** arguments are parenthesized in the expansion so that an expression
** may safely be passed for c.
*/
#define WRITE_UTF16BE(zOut, c) {                                          \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                    \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
  }else{                                                                  \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));  \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                           \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
  }                                                                       \
}
109
/*
** Read one character of little-endian UTF-16 from zIn into c and
** advance zIn past the bytes consumed.
**
** If the first 16-bit word lies in the range 0xD800..0xDFFF, a second
** word is read unconditionally and the two are combined into a single
** value.  The macro has no knowledge of where the input ends, so the
** caller must make sure that two further bytes are readable whenever
** the first word is in that range.
**
** zIn must be a modifiable pointer to unsigned bytes and c must be an
** lvalue.  Both are parenthesized in the expansion so that arguments
** such as "*pz" are handled correctly.
*/
#define READ_UTF16LE(zIn, c){                                               \
  (c) = (*(zIn)++);                                                         \
  (c) += ((*(zIn)++)<<8);                                                   \
  if( (c)>=0xD800 && (c)<0xE000 ){                                          \
    int c2 = (*(zIn)++);                                                    \
    c2 += ((*(zIn)++)<<8);                                                  \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);   \
    if( ((c) & 0xFFFF0000)==0 ) (c) = 0xFFFD;                               \
  }                                                                         \
}
120
/*
** Read one character of big-endian UTF-16 from zIn into c and advance
** zIn past the bytes consumed.
**
** If the first 16-bit word lies in the range 0xD800..0xDFFF, a second
** word is read unconditionally and the two are combined into a single
** value.  The macro has no knowledge of where the input ends, so the
** caller must make sure that two further bytes are readable whenever
** the first word is in that range.
**
** zIn must be a modifiable pointer to unsigned bytes and c must be an
** lvalue.  Both are parenthesized in the expansion so that arguments
** such as "*pz" are handled correctly.
*/
#define READ_UTF16BE(zIn, c){                                               \
  (c) = ((*(zIn)++)<<8);                                                    \
  (c) += (*(zIn)++);                                                        \
  if( (c)>=0xD800 && (c)<0xE000 ){                                          \
    int c2 = ((*(zIn)++)<<8);                                               \
    c2 += (*(zIn)++);                                                       \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);   \
    if( ((c) & 0xFFFF0000)==0 ) (c) = 0xFFFD;                               \
  }                                                                         \
}
131
132/*
drh66150952007-07-23 19:12:41 +0000133** Translate a single UTF-8 character. Return the unicode value.
134**
** During translation, assume that the byte that zTerm points to
** is a 0x00.
137**
138** Write a pointer to the next unread byte back into *pzNext.
139**
140** Notes On Invalid UTF-8:
141**
142** * This routine never allows a 7-bit character (0x00 through 0x7f) to
143** be encoded as a multi-byte character. Any multi-byte character that
144** attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
145**
146** * This routine never allows a UTF16 surrogate value to be encoded.
147** If a multi-byte character attempts to encode a value between
148** 0xd800 and 0xe000 then it is rendered as 0xfffd.
149**
150** * Bytes in the range of 0x80 through 0xbf which occur as the first
151** byte of a character are interpreted as single-byte characters
152** and rendered as themselves even though they are technically
153** invalid characters.
154**
**  *  This routine accepts an infinite number of different UTF8 encodings
**     for unicode values 0x80 and greater.  It does not change over-length
**     encodings to 0xfffd as some systems recommend.
158*/
/*
** Macro form of the UTF-8 decoder.  Reads one character from zIn into c,
** advancing zIn past every byte consumed, and never reads a continuation
** byte at or beyond zTerm.
**
** zIn must be a modifiable pointer to unsigned bytes and c must be an
** lvalue.  The expansion is a single brace-enclosed statement with every
** argument parenthesized, so it can be used as the body of an "if" or
** loop and can be given arguments such as "*pz".
*/
#define READ_UTF8(zIn, zTerm, c) {                         \
  (c) = *((zIn)++);                                        \
  if( (c)>=0xc0 ){                                         \
    (c) = sqlite3UtfTrans1[(c)-0xc0];                      \
    while( (zIn)!=(zTerm) && (*(zIn) & 0xc0)==0x80 ){      \
      (c) = ((c)<<6) + (0x3f & *((zIn)++));                \
    }                                                      \
    if( (c)<0x80                                           \
        || ((c)&0xFFFFF800)==0xD800                        \
        || ((c)&0xFFFFFFFE)==0xFFFE ){  (c) = 0xFFFD; }    \
  }                                                        \
}
/*
** Function wrapper around READ_UTF8.  Decode the single UTF-8 character
** that begins at z, never consuming a continuation byte at or beyond
** zTerm.  Store a pointer to the first unconsumed byte in *pzNext and
** return the decoded value.
**
** The value is accumulated in an unsigned variable because malformed
** input may carry an arbitrary number of continuation bytes, each of
** which shifts the accumulator left by six bits.  A result too large to
** be represented as a non-negative int is reported as 0xFFFD.
*/
int sqlite3Utf8Read(
  const unsigned char *z,         /* First byte of UTF-8 character */
  const unsigned char *zTerm,     /* Pretend this byte is 0x00 */
  const unsigned char **pzNext    /* Write first byte past UTF-8 char here */
){
  unsigned int c;
  READ_UTF8(z, zTerm, c);
  if( c>0x7FFFFFFF ){
    c = 0xFFFD;
  }
  *pzNext = z;
  return (int)c;
}
180
181
182
danielk1977ad76a81e2008-07-29 11:25:14 +0000183
drh66150952007-07-23 19:12:41 +0000184/*
danielk1977bfd6cce2004-06-18 04:24:54 +0000185** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
186** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
187*/
188/* #define TRANSLATE_TRACE 1 */
189
drh6c626082004-11-14 21:56:29 +0000190#ifndef SQLITE_OMIT_UTF16
danielk1977bfd6cce2004-06-18 04:24:54 +0000191/*
192** This routine transforms the internal text encoding used by pMem to
193** desiredEnc. It is an error if the string is already of the desired
194** encoding, or if *pMem does not contain a string value.
195*/
drhb21c8cd2007-08-21 19:33:56 +0000196int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
danielk1977bfd6cce2004-06-18 04:24:54 +0000197 int len; /* Maximum length of output string in bytes */
198 unsigned char *zOut; /* Output buffer */
199 unsigned char *zIn; /* Input iterator */
200 unsigned char *zTerm; /* End of input */
201 unsigned char *z; /* Output iterator */
drha39f4c52006-10-04 15:23:21 +0000202 unsigned int c;
danielk1977bfd6cce2004-06-18 04:24:54 +0000203
drhb21c8cd2007-08-21 19:33:56 +0000204 assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
danielk1977bfd6cce2004-06-18 04:24:54 +0000205 assert( pMem->flags&MEM_Str );
206 assert( pMem->enc!=desiredEnc );
207 assert( pMem->enc!=0 );
208 assert( pMem->n>=0 );
209
danielk1977b5402fb2005-01-12 07:15:04 +0000210#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
danielk1977bfd6cce2004-06-18 04:24:54 +0000211 {
212 char zBuf[100];
drh74161702006-02-24 02:53:49 +0000213 sqlite3VdbeMemPrettyPrint(pMem, zBuf);
danielk1977bfd6cce2004-06-18 04:24:54 +0000214 fprintf(stderr, "INPUT: %s\n", zBuf);
danielk1977ad7dd422004-06-06 12:41:49 +0000215 }
216#endif
217
danielk1977bfd6cce2004-06-18 04:24:54 +0000218 /* If the translation is between UTF-16 little and big endian, then
219 ** all that is required is to swap the byte order. This case is handled
220 ** differently from the others.
danielk1977998b56c2004-05-06 23:37:52 +0000221 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000222 if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
223 u8 temp;
drh71c697e2004-08-08 23:39:19 +0000224 int rc;
drhb21c8cd2007-08-21 19:33:56 +0000225 rc = sqlite3VdbeMemMakeWriteable(pMem);
drh71c697e2004-08-08 23:39:19 +0000226 if( rc!=SQLITE_OK ){
227 assert( rc==SQLITE_NOMEM );
228 return SQLITE_NOMEM;
229 }
drh2646da72005-12-09 20:02:05 +0000230 zIn = (u8*)pMem->z;
drhbbf695d2008-11-07 03:29:33 +0000231 zTerm = &zIn[pMem->n&~1];
danielk1977bfd6cce2004-06-18 04:24:54 +0000232 while( zIn<zTerm ){
233 temp = *zIn;
234 *zIn = *(zIn+1);
235 zIn++;
236 *zIn++ = temp;
237 }
238 pMem->enc = desiredEnc;
239 goto translate_out;
240 }
241
danielk1977d7e69642004-06-23 00:23:49 +0000242 /* Set len to the maximum number of bytes required in the output buffer. */
243 if( desiredEnc==SQLITE_UTF8 ){
244 /* When converting from UTF-16, the maximum growth results from
drha49b8612006-04-16 12:05:03 +0000245 ** translating a 2-byte character to a 4-byte UTF-8 character.
246 ** A single byte is required for the output string
danielk1977d7e69642004-06-23 00:23:49 +0000247 ** nul-terminator.
248 */
drhbbf695d2008-11-07 03:29:33 +0000249 pMem->n &= ~1;
drha49b8612006-04-16 12:05:03 +0000250 len = pMem->n * 2 + 1;
danielk1977d7e69642004-06-23 00:23:49 +0000251 }else{
252 /* When converting from UTF-8 to UTF-16 the maximum growth is caused
253 ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
254 ** character. Two bytes are required in the output buffer for the
255 ** nul-terminator.
256 */
257 len = pMem->n * 2 + 2;
258 }
259
danielk1977bfd6cce2004-06-18 04:24:54 +0000260 /* Set zIn to point at the start of the input buffer and zTerm to point 1
261 ** byte past the end.
262 **
danielk1977a7a8e142008-02-13 18:25:27 +0000263 ** Variable zOut is set to point at the output buffer, space obtained
264 ** from sqlite3_malloc().
danielk1977bfd6cce2004-06-18 04:24:54 +0000265 */
drh2646da72005-12-09 20:02:05 +0000266 zIn = (u8*)pMem->z;
danielk1977bfd6cce2004-06-18 04:24:54 +0000267 zTerm = &zIn[pMem->n];
danielk1977a7a8e142008-02-13 18:25:27 +0000268 zOut = sqlite3DbMallocRaw(pMem->db, len);
269 if( !zOut ){
270 return SQLITE_NOMEM;
danielk1977bfd6cce2004-06-18 04:24:54 +0000271 }
272 z = zOut;
273
274 if( pMem->enc==SQLITE_UTF8 ){
275 if( desiredEnc==SQLITE_UTF16LE ){
276 /* UTF-8 -> UTF-16 Little-endian */
277 while( zIn<zTerm ){
danielk1977ad76a81e2008-07-29 11:25:14 +0000278 /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */
279 READ_UTF8(zIn, zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000280 WRITE_UTF16LE(z, c);
281 }
drhb8dd3152004-09-24 23:20:51 +0000282 }else{
283 assert( desiredEnc==SQLITE_UTF16BE );
danielk1977bfd6cce2004-06-18 04:24:54 +0000284 /* UTF-8 -> UTF-16 Big-endian */
285 while( zIn<zTerm ){
danielk1977ad76a81e2008-07-29 11:25:14 +0000286 /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */
287 READ_UTF8(zIn, zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000288 WRITE_UTF16BE(z, c);
289 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000290 }
drhb8dd3152004-09-24 23:20:51 +0000291 pMem->n = z - zOut;
292 *z++ = 0;
danielk1977bfd6cce2004-06-18 04:24:54 +0000293 }else{
294 assert( desiredEnc==SQLITE_UTF8 );
295 if( pMem->enc==SQLITE_UTF16LE ){
296 /* UTF-16 Little-endian -> UTF-8 */
297 while( zIn<zTerm ){
298 READ_UTF16LE(zIn, c);
299 WRITE_UTF8(z, c);
300 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000301 }else{
mihailim7ffb2b52008-06-27 18:59:44 +0000302 /* UTF-16 Big-endian -> UTF-8 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000303 while( zIn<zTerm ){
304 READ_UTF16BE(zIn, c);
305 WRITE_UTF8(z, c);
306 }
danielk1977998b56c2004-05-06 23:37:52 +0000307 }
drhaa78bec2008-12-09 03:55:14 +0000308 pMem->n = (int)(z - zOut);
danielk1977998b56c2004-05-06 23:37:52 +0000309 }
drhb8dd3152004-09-24 23:20:51 +0000310 *z = 0;
danielk1977d7e69642004-06-23 00:23:49 +0000311 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
danielk1977998b56c2004-05-06 23:37:52 +0000312
danielk1977bfd6cce2004-06-18 04:24:54 +0000313 sqlite3VdbeMemRelease(pMem);
danielk1977a7a8e142008-02-13 18:25:27 +0000314 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem);
danielk1977bfd6cce2004-06-18 04:24:54 +0000315 pMem->enc = desiredEnc;
danielk1977a7a8e142008-02-13 18:25:27 +0000316 pMem->flags |= (MEM_Term|MEM_Dyn);
drh2646da72005-12-09 20:02:05 +0000317 pMem->z = (char*)zOut;
danielk19775f096132008-03-28 15:44:09 +0000318 pMem->zMalloc = pMem->z;
danielk1977bfd6cce2004-06-18 04:24:54 +0000319
320translate_out:
danielk1977b5402fb2005-01-12 07:15:04 +0000321#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
danielk1977bfd6cce2004-06-18 04:24:54 +0000322 {
323 char zBuf[100];
drh74161702006-02-24 02:53:49 +0000324 sqlite3VdbeMemPrettyPrint(pMem, zBuf);
danielk1977bfd6cce2004-06-18 04:24:54 +0000325 fprintf(stderr, "OUTPUT: %s\n", zBuf);
326 }
327#endif
328 return SQLITE_OK;
danielk1977998b56c2004-05-06 23:37:52 +0000329}
330
danielk197793d46752004-05-23 13:30:58 +0000331/*
danielk1977bfd6cce2004-06-18 04:24:54 +0000332** This routine checks for a byte-order mark at the beginning of the
333** UTF-16 string stored in *pMem. If one is present, it is removed and
334** the encoding of the Mem adjusted. This routine does not do any
335** byte-swapping, it just sets Mem.enc appropriately.
336**
337** The allocation (static, dynamic etc.) and encoding of the Mem may be
338** changed by this function.
danielk197793d46752004-05-23 13:30:58 +0000339*/
drhb21c8cd2007-08-21 19:33:56 +0000340int sqlite3VdbeMemHandleBom(Mem *pMem){
danielk1977bfd6cce2004-06-18 04:24:54 +0000341 int rc = SQLITE_OK;
342 u8 bom = 0;
343
344 if( pMem->n<0 || pMem->n>1 ){
345 u8 b1 = *(u8 *)pMem->z;
346 u8 b2 = *(((u8 *)pMem->z) + 1);
danielk197793d46752004-05-23 13:30:58 +0000347 if( b1==0xFE && b2==0xFF ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000348 bom = SQLITE_UTF16BE;
danielk197793d46752004-05-23 13:30:58 +0000349 }
350 if( b1==0xFF && b2==0xFE ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000351 bom = SQLITE_UTF16LE;
danielk197793d46752004-05-23 13:30:58 +0000352 }
353 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000354
355 if( bom ){
danielk1977a7a8e142008-02-13 18:25:27 +0000356 rc = sqlite3VdbeMemMakeWriteable(pMem);
357 if( rc==SQLITE_OK ){
358 pMem->n -= 2;
359 memmove(pMem->z, &pMem->z[2], pMem->n);
360 pMem->z[pMem->n] = '\0';
361 pMem->z[pMem->n+1] = '\0';
362 pMem->flags |= MEM_Term;
363 pMem->enc = bom;
danielk1977bfd6cce2004-06-18 04:24:54 +0000364 }
danielk1977998b56c2004-05-06 23:37:52 +0000365 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000366 return rc;
danielk1977998b56c2004-05-06 23:37:52 +0000367}
drh6c626082004-11-14 21:56:29 +0000368#endif /* SQLITE_OMIT_UTF16 */
danielk1977998b56c2004-05-06 23:37:52 +0000369
370/*
danielk19776622cce2004-05-20 11:00:52 +0000371** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
372** return the number of unicode characters in pZ up to (but not including)
373** the first 0x00 byte. If nByte is not less than zero, return the
374** number of unicode characters in the first nByte of pZ (or up to
375** the first 0x00, whichever comes first).
danielk1977998b56c2004-05-06 23:37:52 +0000376*/
drh4a919112007-05-15 11:55:09 +0000377int sqlite3Utf8CharLen(const char *zIn, int nByte){
danielk1977bfd6cce2004-06-18 04:24:54 +0000378 int r = 0;
drh4a919112007-05-15 11:55:09 +0000379 const u8 *z = (const u8*)zIn;
380 const u8 *zTerm;
danielk19771ba1b552004-06-23 13:46:32 +0000381 if( nByte>=0 ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000382 zTerm = &z[nByte];
383 }else{
drh4a919112007-05-15 11:55:09 +0000384 zTerm = (const u8*)(-1);
danielk1977998b56c2004-05-06 23:37:52 +0000385 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000386 assert( z<=zTerm );
387 while( *z!=0 && z<zTerm ){
drh4a919112007-05-15 11:55:09 +0000388 SQLITE_SKIP_UTF8(z);
danielk1977bfd6cce2004-06-18 04:24:54 +0000389 r++;
390 }
391 return r;
danielk19776622cce2004-05-20 11:00:52 +0000392}
393
danielk19774152e672007-09-12 17:01:45 +0000394/* This test function is not currently used by the automated test-suite.
395** Hence it is only available in debug builds.
396*/
397#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
398/*
399** Translate UTF-8 to UTF-8.
400**
401** This has the effect of making sure that the string is well-formed
402** UTF-8. Miscoded characters are removed.
403**
404** The translation is done in-place (since it is impossible for the
405** correct UTF-8 encoding to be longer than a malformed encoding).
406*/
407int sqlite3Utf8To8(unsigned char *zIn){
408 unsigned char *zOut = zIn;
409 unsigned char *zStart = zIn;
danielk19776e891622008-08-12 14:48:40 +0000410 unsigned char *zTerm = &zIn[strlen((char *)zIn)];
danielk19774152e672007-09-12 17:01:45 +0000411 u32 c;
412
413 while( zIn[0] ){
414 c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
415 if( c!=0xfffd ){
416 WRITE_UTF8(zOut, c);
417 }
418 }
419 *zOut = 0;
420 return zOut - zStart;
421}
422#endif
423
drh6c626082004-11-14 21:56:29 +0000424#ifndef SQLITE_OMIT_UTF16
danielk19776622cce2004-05-20 11:00:52 +0000425/*
drhaf9a7c22005-12-15 03:04:10 +0000426** Convert a UTF-16 string in the native encoding into a UTF-8 string.
drh17435752007-08-16 04:30:38 +0000427** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must
428** be freed by the calling function.
drhaf9a7c22005-12-15 03:04:10 +0000429**
430** NULL is returned if there is an allocation error.
431*/
drh17435752007-08-16 04:30:38 +0000432char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte){
drhaf9a7c22005-12-15 03:04:10 +0000433 Mem m;
434 memset(&m, 0, sizeof(m));
drhb21c8cd2007-08-21 19:33:56 +0000435 m.db = db;
436 sqlite3VdbeMemSetStr(&m, z, nByte, SQLITE_UTF16NATIVE, SQLITE_STATIC);
437 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);
danielk1977ae72d982007-10-03 08:46:44 +0000438 if( db->mallocFailed ){
439 sqlite3VdbeMemRelease(&m);
440 m.z = 0;
441 }
drh17435752007-08-16 04:30:38 +0000442 assert( (m.flags & MEM_Term)!=0 || db->mallocFailed );
443 assert( (m.flags & MEM_Str)!=0 || db->mallocFailed );
444 return (m.flags & MEM_Dyn)!=0 ? m.z : sqlite3DbStrDup(db, m.z);
drhaf9a7c22005-12-15 03:04:10 +0000445}
446
447/*
danielk19776622cce2004-05-20 11:00:52 +0000448** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
449** return the number of bytes up to (but not including), the first pair
450** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
451** then return the number of bytes in the first nChar unicode characters
452** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
453*/
drhee858132007-05-08 20:37:38 +0000454int sqlite3Utf16ByteLen(const void *zIn, int nChar){
drha39f4c52006-10-04 15:23:21 +0000455 unsigned int c = 1;
danielk1977bfd6cce2004-06-18 04:24:54 +0000456 char const *z = zIn;
457 int n = 0;
458 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
danielk1977161fb792006-01-24 10:58:21 +0000459 /* Using an "if (SQLITE_UTF16NATIVE==SQLITE_UTF16BE)" construct here
460 ** and in other parts of this file means that at one branch will
461 ** not be covered by coverage testing on any single host. But coverage
462 ** will be complete if the tests are run on both a little-endian and
463 ** big-endian host. Because both the UTF16NATIVE and SQLITE_UTF16BE
464 ** macros are constant at compile time the compiler can determine
465 ** which branch will be followed. It is therefore assumed that no runtime
466 ** penalty is paid for this "if" statement.
467 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000468 while( c && ((nChar<0) || n<nChar) ){
469 READ_UTF16BE(z, c);
470 n++;
danielk19776622cce2004-05-20 11:00:52 +0000471 }
danielk19776622cce2004-05-20 11:00:52 +0000472 }else{
danielk1977bfd6cce2004-06-18 04:24:54 +0000473 while( c && ((nChar<0) || n<nChar) ){
474 READ_UTF16LE(z, c);
475 n++;
danielk19776622cce2004-05-20 11:00:52 +0000476 }
danielk19776622cce2004-05-20 11:00:52 +0000477 }
drhaa78bec2008-12-09 03:55:14 +0000478 return (int)(z-(char const *)zIn)-((c==0)?2:0);
danielk1977998b56c2004-05-06 23:37:52 +0000479}
480
drh53c14022007-05-10 17:23:11 +0000481#if defined(SQLITE_TEST)
482/*
danielk1977bfd6cce2004-06-18 04:24:54 +0000483** This routine is called from the TCL test function "translate_selftest".
484** It checks that the primitives for serializing and deserializing
485** characters in each encoding are inverses of each other.
486*/
danielk197744a376f2008-08-12 15:04:58 +0000487void sqlite3UtfSelfTest(void){
drhb3fa0e02006-10-19 01:58:43 +0000488 unsigned int i, t;
danielk1977bfd6cce2004-06-18 04:24:54 +0000489 unsigned char zBuf[20];
490 unsigned char *z;
drh66150952007-07-23 19:12:41 +0000491 unsigned char *zTerm;
danielk1977bfd6cce2004-06-18 04:24:54 +0000492 int n;
drha39f4c52006-10-04 15:23:21 +0000493 unsigned int c;
danielk1977bfd6cce2004-06-18 04:24:54 +0000494
danielk19771ba1b552004-06-23 13:46:32 +0000495 for(i=0; i<0x00110000; i++){
danielk1977bfd6cce2004-06-18 04:24:54 +0000496 z = zBuf;
497 WRITE_UTF8(z, i);
498 n = z-zBuf;
drh4a919112007-05-15 11:55:09 +0000499 z[0] = 0;
drh66150952007-07-23 19:12:41 +0000500 zTerm = z;
danielk1977bfd6cce2004-06-18 04:24:54 +0000501 z = zBuf;
drh66150952007-07-23 19:12:41 +0000502 c = sqlite3Utf8Read(z, zTerm, (const u8**)&z);
drhb3fa0e02006-10-19 01:58:43 +0000503 t = i;
504 if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
505 if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
506 assert( c==t );
danielk1977bfd6cce2004-06-18 04:24:54 +0000507 assert( (z-zBuf)==n );
508 }
509 for(i=0; i<0x00110000; i++){
danielk1977a9c16b02007-05-16 18:11:41 +0000510 if( i>=0xD800 && i<0xE000 ) continue;
danielk1977bfd6cce2004-06-18 04:24:54 +0000511 z = zBuf;
512 WRITE_UTF16LE(z, i);
513 n = z-zBuf;
drh4a919112007-05-15 11:55:09 +0000514 z[0] = 0;
danielk1977bfd6cce2004-06-18 04:24:54 +0000515 z = zBuf;
516 READ_UTF16LE(z, c);
517 assert( c==i );
518 assert( (z-zBuf)==n );
519 }
520 for(i=0; i<0x00110000; i++){
danielk1977a9c16b02007-05-16 18:11:41 +0000521 if( i>=0xD800 && i<0xE000 ) continue;
danielk1977bfd6cce2004-06-18 04:24:54 +0000522 z = zBuf;
523 WRITE_UTF16BE(z, i);
524 n = z-zBuf;
drh4a919112007-05-15 11:55:09 +0000525 z[0] = 0;
danielk1977bfd6cce2004-06-18 04:24:54 +0000526 z = zBuf;
527 READ_UTF16BE(z, c);
528 assert( c==i );
529 assert( (z-zBuf)==n );
530 }
531}
drh6c626082004-11-14 21:56:29 +0000532#endif /* SQLITE_TEST */
533#endif /* SQLITE_OMIT_UTF16 */