blob: e94815b5ab6fb799244758c355a6030ba5cb31cb [file] [log] [blame]
/*
** 2004 April 13
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains routines used to translate between UTF-8,
** UTF-16, UTF-16BE, and UTF-16LE.
**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
**
**
** Notes on UTF-16:  (with wwww+1==uuuuu)
**
**      Word-0               Word-1          Value
**  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
**
**
** BOM or Byte Order Mark:
**     0xff 0xfe   little-endian utf-16 follows
**     0xfe 0xff   big-endian utf-16 follows
**
*/
danielk1977998b56c2004-05-06 23:37:52 +000036#include "sqliteInt.h"
drhb659e9b2005-01-28 01:29:08 +000037#include <assert.h>
danielk1977bfd6cce2004-06-18 04:24:54 +000038#include "vdbeInt.h"
danielk1977998b56c2004-05-06 23:37:52 +000039
#ifndef SQLITE_AMALGAMATION
/*
** A constant holding the integer value 1.  The SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros refer to this object.  It is only defined
** here when the library is not being built as an amalgamation.
*/
const int sqlite3one = 1;
#endif /* SQLITE_AMALGAMATION */
drh38def052007-03-31 15:27:59 +000047
/*
** Lookup table used when decoding the first byte of a multi-byte UTF-8
** character.  The table is indexed by (firstByte - 0xc0), so it covers
** first bytes 0xc0 through 0xff.  Each entry is the first byte with its
** length-marker bits removed, i.e. the initial accumulator value to which
** the 6-bit payloads of the continuation bytes are subsequently appended.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  /* 0xc0..0xdf: 110yyyyy, five payload bits */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,

  /* 0xe0..0xef: 1110zzzz, four payload bits */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,

  /* 0xf0..0xf7: 11110uuu, three payload bits */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,

  /* 0xf8..0xfb, 0xfc..0xfd, 0xfe, 0xff */
  0x00, 0x01, 0x02, 0x03,   0x00, 0x01,   0x00,   0x00,
};
62
drh66150952007-07-23 19:12:41 +000063
/*
** Append the UTF-8 encoding of the unicode value c to the buffer at zOut
** and advance zOut past the bytes written.  Between one and four bytes
** are written, depending on the magnitude of c.
**
** zOut must be a modifiable pointer to unsigned bytes.  Both arguments are
** evaluated more than once, so neither may have side effects.  Arguments
** are parenthesized in the expansion and the body is wrapped in
** do{...}while(0), so the macro behaves as a single statement (for example
** as the unbraced body of an if/else) and accepts arbitrary expressions
** for c.
*/
#define WRITE_UTF8(zOut, c) do {                         \
  if( (c)<0x00080 ){                                     \
    *(zOut)++ = (u8)((c)&0xFF);                          \
  }                                                      \
  else if( (c)<0x00800 ){                                \
    *(zOut)++ = 0xC0 + (u8)(((c)>>6)&0x1F);              \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                 \
  }                                                      \
  else if( (c)<0x10000 ){                                \
    *(zOut)++ = 0xE0 + (u8)(((c)>>12)&0x0F);             \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);            \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                 \
  }else{                                                 \
    *(zOut)++ = 0xF0 + (u8)(((c)>>18) & 0x07);           \
    *(zOut)++ = 0x80 + (u8)(((c)>>12) & 0x3F);           \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);            \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                 \
  }                                                      \
} while(0)
83
/*
** Append the little-endian UTF-16 encoding of the unicode value c to the
** buffer at zOut and advance zOut past the bytes written.  Values up to
** 0xFFFF are written as a single 16-bit word (two bytes); larger values
** are written as a surrogate pair (four bytes).
**
** zOut must be a modifiable pointer to unsigned bytes.  Both arguments are
** evaluated more than once, so neither may have side effects.  Arguments
** are parenthesized and the body is wrapped in do{...}while(0) so the
** macro can be used anywhere a single statement is allowed.
*/
#define WRITE_UTF16LE(zOut, c) do {                                        \
  if( (c)<=0xFFFF ){                                                       \
    *(zOut)++ = (u8)((c)&0x00FF);                                          \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                     \
  }else{                                                                   \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));   \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                 \
    *(zOut)++ = (u8)((c)&0x00FF);                                          \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                            \
  }                                                                        \
} while(0)
95
/*
** Append the big-endian UTF-16 encoding of the unicode value c to the
** buffer at zOut and advance zOut past the bytes written.  Values up to
** 0xFFFF are written as a single 16-bit word (two bytes); larger values
** are written as a surrogate pair (four bytes).
**
** zOut must be a modifiable pointer to unsigned bytes.  Both arguments are
** evaluated more than once, so neither may have side effects.  Arguments
** are parenthesized and the body is wrapped in do{...}while(0) so the
** macro can be used anywhere a single statement is allowed.
*/
#define WRITE_UTF16BE(zOut, c) do {                                        \
  if( (c)<=0xFFFF ){                                                       \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                     \
    *(zOut)++ = (u8)((c)&0x00FF);                                          \
  }else{                                                                   \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                 \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));   \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                            \
    *(zOut)++ = (u8)((c)&0x00FF);                                          \
  }                                                                        \
} while(0)
107
/*
** Read one little-endian UTF-16 character from zIn into c and advance zIn
** past the bytes consumed.
**
** The first 16-bit word is always read.  If that word lies in the range
** 0xD800..0xDFFF and TERM evaluates to true, a second 16-bit word is read
** and the two words are combined as a surrogate pair.  TERM is evaluated
** after the first word has been consumed, which lets a caller pass a
** "more input remains" test such as zIn<zTerm.  The second word is not
** checked for being a valid low surrogate.
**
** zIn must be a modifiable pointer to unsigned bytes and c a modifiable
** integer lvalue; both are evaluated more than once.  The expansion uses a
** local named c2, so c must not itself be named c2.  Arguments are
** parenthesized and the body is wrapped in do{...}while(0) so the macro
** can be used anywhere a single statement is allowed.
*/
#define READ_UTF16LE(zIn, TERM, c) do {                                     \
  (c) = (*(zIn)++);                                                         \
  (c) += ((*(zIn)++)<<8);                                                   \
  if( (c)>=0xD800 && (c)<0xE000 && (TERM) ){                                \
    int c2 = (*(zIn)++);                                                    \
    c2 += ((*(zIn)++)<<8);                                                  \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);   \
  }                                                                         \
} while(0)
117
/*
** Read one big-endian UTF-16 character from zIn into c and advance zIn
** past the bytes consumed.
**
** The first 16-bit word is always read.  If that word lies in the range
** 0xD800..0xDFFF and TERM evaluates to true, a second 16-bit word is read
** and the two words are combined as a surrogate pair.  TERM is evaluated
** after the first word has been consumed, which lets a caller pass a
** "more input remains" test such as zIn<zTerm.  The second word is not
** checked for being a valid low surrogate.
**
** zIn must be a modifiable pointer to unsigned bytes and c a modifiable
** integer lvalue; both are evaluated more than once.  The expansion uses a
** local named c2, so c must not itself be named c2.  Arguments are
** parenthesized and the body is wrapped in do{...}while(0) so the macro
** can be used anywhere a single statement is allowed.
*/
#define READ_UTF16BE(zIn, TERM, c) do {                                     \
  (c) = ((*(zIn)++)<<8);                                                    \
  (c) += (*(zIn)++);                                                        \
  if( (c)>=0xD800 && (c)<0xE000 && (TERM) ){                                \
    int c2 = ((*(zIn)++)<<8);                                               \
    c2 += (*(zIn)++);                                                       \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);   \
  }                                                                         \
} while(0)
127
/*
** Translate a single UTF-8 character.  The unicode value is stored in c
** and zIn is advanced to the next unread byte.
**
** Continuation bytes are consumed only while zIn!=zTerm, so translation
** behaves as if the byte that zTerm points to were 0x00.
**
** Notes On Invalid UTF-8:
**
**  *  This macro never allows a 7-bit character (0x00 through 0x7f) to
**     be encoded as a multi-byte character.  Any multi-byte character that
**     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
**
**  *  This macro never allows a UTF16 surrogate value to be encoded.
**     If a multi-byte character attempts to encode a value between
**     0xd800 and 0xdfff then it is rendered as 0xfffd.
**
**  *  A multi-byte character that encodes 0xfffe or 0xffff is rendered
**     as 0xfffd.
**
**  *  Bytes in the range of 0x80 through 0xbf which occur as the first
**     byte of a character are interpreted as single-byte characters
**     and rendered as themselves even though they are technically
**     invalid characters.
**
**  *  This macro accepts an unlimited number of different UTF8 encodings
**     for unicode values 0x80 and greater.  It does not change over-length
**     encodings to 0xfffd as some systems recommend.
**
** zIn must be a modifiable pointer to unsigned bytes and c a modifiable
** unsigned 32-bit (or wider) integer lvalue; the arguments are evaluated
** more than once.  Arguments are parenthesized and the body is wrapped in
** do{...}while(0) so the macro can be used anywhere a single statement is
** allowed.
*/
#define READ_UTF8(zIn, zTerm, c) do {                          \
  (c) = *((zIn)++);                                            \
  if( (c)>=0xc0 ){                                             \
    (c) = sqlite3Utf8Trans1[(c)-0xc0];                         \
    while( (zIn)!=(zTerm) && (*(zIn) & 0xc0)==0x80 ){          \
      (c) = ((c)<<6) + (0x3f & *((zIn)++));                    \
    }                                                          \
    if( (c)<0x80                                               \
        || ((c)&0xFFFFF800)==0xD800                            \
        || ((c)&0xFFFFFFFE)==0xFFFE ){  (c) = 0xFFFD; }        \
  }                                                            \
} while(0)
drh0a32fa62011-06-13 12:19:21 +0000166u32 sqlite3Utf8Read(
drh769e97e2009-04-01 16:33:37 +0000167 const unsigned char *zIn, /* First byte of UTF-8 character */
drh66150952007-07-23 19:12:41 +0000168 const unsigned char **pzNext /* Write first byte past UTF-8 char here */
169){
shanehdba2cc42011-03-24 17:43:18 +0000170 unsigned int c;
drh769e97e2009-04-01 16:33:37 +0000171
172 /* Same as READ_UTF8() above but without the zTerm parameter.
173 ** For this routine, we assume the UTF8 string is always zero-terminated.
174 */
175 c = *(zIn++);
176 if( c>=0xc0 ){
177 c = sqlite3Utf8Trans1[c-0xc0];
178 while( (*zIn & 0xc0)==0x80 ){
179 c = (c<<6) + (0x3f & *(zIn++));
180 }
181 if( c<0x80
182 || (c&0xFFFFF800)==0xD800
183 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; }
184 }
185 *pzNext = zIn;
drh66150952007-07-23 19:12:41 +0000186 return c;
187}
188
189
190
danielk1977ad76a81e2008-07-29 11:25:14 +0000191
/*
** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
*/
/* #define TRANSLATE_TRACE 1 */
197
drh6c626082004-11-14 21:56:29 +0000198#ifndef SQLITE_OMIT_UTF16
danielk1977bfd6cce2004-06-18 04:24:54 +0000199/*
200** This routine transforms the internal text encoding used by pMem to
201** desiredEnc. It is an error if the string is already of the desired
202** encoding, or if *pMem does not contain a string value.
203*/
drhb21c8cd2007-08-21 19:33:56 +0000204int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
danielk1977bfd6cce2004-06-18 04:24:54 +0000205 int len; /* Maximum length of output string in bytes */
206 unsigned char *zOut; /* Output buffer */
207 unsigned char *zIn; /* Input iterator */
208 unsigned char *zTerm; /* End of input */
209 unsigned char *z; /* Output iterator */
drha39f4c52006-10-04 15:23:21 +0000210 unsigned int c;
danielk1977bfd6cce2004-06-18 04:24:54 +0000211
drhb21c8cd2007-08-21 19:33:56 +0000212 assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
danielk1977bfd6cce2004-06-18 04:24:54 +0000213 assert( pMem->flags&MEM_Str );
214 assert( pMem->enc!=desiredEnc );
215 assert( pMem->enc!=0 );
216 assert( pMem->n>=0 );
217
danielk1977b5402fb2005-01-12 07:15:04 +0000218#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
danielk1977bfd6cce2004-06-18 04:24:54 +0000219 {
220 char zBuf[100];
drh74161702006-02-24 02:53:49 +0000221 sqlite3VdbeMemPrettyPrint(pMem, zBuf);
danielk1977bfd6cce2004-06-18 04:24:54 +0000222 fprintf(stderr, "INPUT: %s\n", zBuf);
danielk1977ad7dd422004-06-06 12:41:49 +0000223 }
224#endif
225
danielk1977bfd6cce2004-06-18 04:24:54 +0000226 /* If the translation is between UTF-16 little and big endian, then
227 ** all that is required is to swap the byte order. This case is handled
228 ** differently from the others.
danielk1977998b56c2004-05-06 23:37:52 +0000229 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000230 if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
231 u8 temp;
drh71c697e2004-08-08 23:39:19 +0000232 int rc;
drhb21c8cd2007-08-21 19:33:56 +0000233 rc = sqlite3VdbeMemMakeWriteable(pMem);
drh71c697e2004-08-08 23:39:19 +0000234 if( rc!=SQLITE_OK ){
235 assert( rc==SQLITE_NOMEM );
236 return SQLITE_NOMEM;
237 }
drh2646da72005-12-09 20:02:05 +0000238 zIn = (u8*)pMem->z;
drhbbf695d2008-11-07 03:29:33 +0000239 zTerm = &zIn[pMem->n&~1];
danielk1977bfd6cce2004-06-18 04:24:54 +0000240 while( zIn<zTerm ){
241 temp = *zIn;
242 *zIn = *(zIn+1);
243 zIn++;
244 *zIn++ = temp;
245 }
246 pMem->enc = desiredEnc;
247 goto translate_out;
248 }
249
danielk1977d7e69642004-06-23 00:23:49 +0000250 /* Set len to the maximum number of bytes required in the output buffer. */
251 if( desiredEnc==SQLITE_UTF8 ){
252 /* When converting from UTF-16, the maximum growth results from
drha49b8612006-04-16 12:05:03 +0000253 ** translating a 2-byte character to a 4-byte UTF-8 character.
254 ** A single byte is required for the output string
danielk1977d7e69642004-06-23 00:23:49 +0000255 ** nul-terminator.
256 */
drhbbf695d2008-11-07 03:29:33 +0000257 pMem->n &= ~1;
drha49b8612006-04-16 12:05:03 +0000258 len = pMem->n * 2 + 1;
danielk1977d7e69642004-06-23 00:23:49 +0000259 }else{
260 /* When converting from UTF-8 to UTF-16 the maximum growth is caused
261 ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
262 ** character. Two bytes are required in the output buffer for the
263 ** nul-terminator.
264 */
265 len = pMem->n * 2 + 2;
266 }
267
danielk1977bfd6cce2004-06-18 04:24:54 +0000268 /* Set zIn to point at the start of the input buffer and zTerm to point 1
269 ** byte past the end.
270 **
danielk1977a7a8e142008-02-13 18:25:27 +0000271 ** Variable zOut is set to point at the output buffer, space obtained
272 ** from sqlite3_malloc().
danielk1977bfd6cce2004-06-18 04:24:54 +0000273 */
drh2646da72005-12-09 20:02:05 +0000274 zIn = (u8*)pMem->z;
danielk1977bfd6cce2004-06-18 04:24:54 +0000275 zTerm = &zIn[pMem->n];
danielk1977a7a8e142008-02-13 18:25:27 +0000276 zOut = sqlite3DbMallocRaw(pMem->db, len);
277 if( !zOut ){
278 return SQLITE_NOMEM;
danielk1977bfd6cce2004-06-18 04:24:54 +0000279 }
280 z = zOut;
281
282 if( pMem->enc==SQLITE_UTF8 ){
283 if( desiredEnc==SQLITE_UTF16LE ){
284 /* UTF-8 -> UTF-16 Little-endian */
285 while( zIn<zTerm ){
danielk1977ad76a81e2008-07-29 11:25:14 +0000286 /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */
287 READ_UTF8(zIn, zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000288 WRITE_UTF16LE(z, c);
289 }
drhb8dd3152004-09-24 23:20:51 +0000290 }else{
291 assert( desiredEnc==SQLITE_UTF16BE );
danielk1977bfd6cce2004-06-18 04:24:54 +0000292 /* UTF-8 -> UTF-16 Big-endian */
293 while( zIn<zTerm ){
danielk1977ad76a81e2008-07-29 11:25:14 +0000294 /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */
295 READ_UTF8(zIn, zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000296 WRITE_UTF16BE(z, c);
297 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000298 }
drhea678832008-12-10 19:26:22 +0000299 pMem->n = (int)(z - zOut);
drhb8dd3152004-09-24 23:20:51 +0000300 *z++ = 0;
danielk1977bfd6cce2004-06-18 04:24:54 +0000301 }else{
302 assert( desiredEnc==SQLITE_UTF8 );
303 if( pMem->enc==SQLITE_UTF16LE ){
304 /* UTF-16 Little-endian -> UTF-8 */
305 while( zIn<zTerm ){
drh1faca752009-10-24 03:04:10 +0000306 READ_UTF16LE(zIn, zIn<zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000307 WRITE_UTF8(z, c);
308 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000309 }else{
mihailim7ffb2b52008-06-27 18:59:44 +0000310 /* UTF-16 Big-endian -> UTF-8 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000311 while( zIn<zTerm ){
drh1faca752009-10-24 03:04:10 +0000312 READ_UTF16BE(zIn, zIn<zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000313 WRITE_UTF8(z, c);
314 }
danielk1977998b56c2004-05-06 23:37:52 +0000315 }
drhaa78bec2008-12-09 03:55:14 +0000316 pMem->n = (int)(z - zOut);
danielk1977998b56c2004-05-06 23:37:52 +0000317 }
drhb8dd3152004-09-24 23:20:51 +0000318 *z = 0;
danielk1977d7e69642004-06-23 00:23:49 +0000319 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
danielk1977998b56c2004-05-06 23:37:52 +0000320
danielk1977bfd6cce2004-06-18 04:24:54 +0000321 sqlite3VdbeMemRelease(pMem);
danielk1977a7a8e142008-02-13 18:25:27 +0000322 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem);
danielk1977bfd6cce2004-06-18 04:24:54 +0000323 pMem->enc = desiredEnc;
danielk1977a7a8e142008-02-13 18:25:27 +0000324 pMem->flags |= (MEM_Term|MEM_Dyn);
drh2646da72005-12-09 20:02:05 +0000325 pMem->z = (char*)zOut;
danielk19775f096132008-03-28 15:44:09 +0000326 pMem->zMalloc = pMem->z;
danielk1977bfd6cce2004-06-18 04:24:54 +0000327
328translate_out:
danielk1977b5402fb2005-01-12 07:15:04 +0000329#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
danielk1977bfd6cce2004-06-18 04:24:54 +0000330 {
331 char zBuf[100];
drh74161702006-02-24 02:53:49 +0000332 sqlite3VdbeMemPrettyPrint(pMem, zBuf);
danielk1977bfd6cce2004-06-18 04:24:54 +0000333 fprintf(stderr, "OUTPUT: %s\n", zBuf);
334 }
335#endif
336 return SQLITE_OK;
danielk1977998b56c2004-05-06 23:37:52 +0000337}
338
danielk197793d46752004-05-23 13:30:58 +0000339/*
danielk1977bfd6cce2004-06-18 04:24:54 +0000340** This routine checks for a byte-order mark at the beginning of the
341** UTF-16 string stored in *pMem. If one is present, it is removed and
342** the encoding of the Mem adjusted. This routine does not do any
343** byte-swapping, it just sets Mem.enc appropriately.
344**
345** The allocation (static, dynamic etc.) and encoding of the Mem may be
346** changed by this function.
danielk197793d46752004-05-23 13:30:58 +0000347*/
drhb21c8cd2007-08-21 19:33:56 +0000348int sqlite3VdbeMemHandleBom(Mem *pMem){
danielk1977bfd6cce2004-06-18 04:24:54 +0000349 int rc = SQLITE_OK;
350 u8 bom = 0;
351
drh769e97e2009-04-01 16:33:37 +0000352 assert( pMem->n>=0 );
353 if( pMem->n>1 ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000354 u8 b1 = *(u8 *)pMem->z;
355 u8 b2 = *(((u8 *)pMem->z) + 1);
danielk197793d46752004-05-23 13:30:58 +0000356 if( b1==0xFE && b2==0xFF ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000357 bom = SQLITE_UTF16BE;
danielk197793d46752004-05-23 13:30:58 +0000358 }
359 if( b1==0xFF && b2==0xFE ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000360 bom = SQLITE_UTF16LE;
danielk197793d46752004-05-23 13:30:58 +0000361 }
362 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000363
364 if( bom ){
danielk1977a7a8e142008-02-13 18:25:27 +0000365 rc = sqlite3VdbeMemMakeWriteable(pMem);
366 if( rc==SQLITE_OK ){
367 pMem->n -= 2;
368 memmove(pMem->z, &pMem->z[2], pMem->n);
369 pMem->z[pMem->n] = '\0';
370 pMem->z[pMem->n+1] = '\0';
371 pMem->flags |= MEM_Term;
372 pMem->enc = bom;
danielk1977bfd6cce2004-06-18 04:24:54 +0000373 }
danielk1977998b56c2004-05-06 23:37:52 +0000374 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000375 return rc;
danielk1977998b56c2004-05-06 23:37:52 +0000376}
drh6c626082004-11-14 21:56:29 +0000377#endif /* SQLITE_OMIT_UTF16 */
danielk1977998b56c2004-05-06 23:37:52 +0000378
/*
** zIn is a UTF-8 encoded unicode string.  If nByte is less than zero,
** return the number of unicode characters in zIn up to (but not including)
** the first 0x00 byte.  If nByte is not less than zero, return the
** number of unicode characters in the first nByte bytes of zIn (or up to
** the first 0x00, whichever comes first).
**
** A character is a first byte followed, when that first byte is 0xc0 or
** greater, by every immediately following byte of the form 10xxxxxx.
**
** When nByte is non-negative no byte at or beyond zIn[nByte] is ever
** examined, so the input need not be zero-terminated in that case.  A
** multi-byte character that is cut short by the nByte limit still counts
** as one character.
*/
int sqlite3Utf8CharLen(const char *zIn, int nByte){
  int r = 0;
  const unsigned char *z = (const unsigned char*)zIn;

  if( nByte<0 ){
    /* Unbounded: the 0x00 terminator is the only stopping condition. */
    while( *z!=0 ){
      if( (*(z++))>=0xc0 ){
        while( (*z & 0xc0)==0x80 ){ z++; }
      }
      r++;
    }
  }else{
    const unsigned char *zTerm = &z[nByte];
    /* Check the bound before dereferencing so that zTerm[0] is not read. */
    while( z<zTerm && *z!=0 ){
      if( (*(z++))>=0xc0 ){
        while( z<zTerm && (*z & 0xc0)==0x80 ){ z++; }
      }
      r++;
    }
  }
  return r;
}
402
/* This test function is not currently used by the automated test-suite.
** Hence it is only available in debug builds.
*/
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
/*
** Translate UTF-8 to UTF-8, in place.
**
** Each character of zIn is decoded with sqlite3Utf8Read() and written
** back with WRITE_UTF8(), which has the effect of making sure that the
** string is well-formed UTF-8.  Any character that decodes to 0xfffd is
** dropped from the output.
**
** The translation stops at the 0x00 terminator, or early if the write
** position moves past the read position.  The result is zero-terminated
** and its length in bytes is returned.
*/
int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char *const zBase = zIn;   /* Start of the string */
  unsigned char *zWrite = zIn;        /* Where the next character is written */
  u32 c;

  while( zIn[0] && zWrite<=zIn ){
    c = sqlite3Utf8Read(zIn, (const u8**)&zIn);
    if( c==0xfffd ) continue;
    WRITE_UTF8(zWrite, c);
  }
  *zWrite = 0;
  return (int)(zWrite - zBase);
}
#endif
431
drh6c626082004-11-14 21:56:29 +0000432#ifndef SQLITE_OMIT_UTF16
danielk19776622cce2004-05-20 11:00:52 +0000433/*
drhaf9a7c22005-12-15 03:04:10 +0000434** Convert a UTF-16 string in the native encoding into a UTF-8 string.
drh17435752007-08-16 04:30:38 +0000435** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must
436** be freed by the calling function.
drhaf9a7c22005-12-15 03:04:10 +0000437**
438** NULL is returned if there is an allocation error.
439*/
danb7dca7d2010-03-05 16:32:12 +0000440char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte, u8 enc){
drhaf9a7c22005-12-15 03:04:10 +0000441 Mem m;
442 memset(&m, 0, sizeof(m));
drhb21c8cd2007-08-21 19:33:56 +0000443 m.db = db;
danb7dca7d2010-03-05 16:32:12 +0000444 sqlite3VdbeMemSetStr(&m, z, nByte, enc, SQLITE_STATIC);
drhb21c8cd2007-08-21 19:33:56 +0000445 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);
danielk1977ae72d982007-10-03 08:46:44 +0000446 if( db->mallocFailed ){
447 sqlite3VdbeMemRelease(&m);
448 m.z = 0;
449 }
drh17435752007-08-16 04:30:38 +0000450 assert( (m.flags & MEM_Term)!=0 || db->mallocFailed );
451 assert( (m.flags & MEM_Str)!=0 || db->mallocFailed );
danb7dca7d2010-03-05 16:32:12 +0000452 assert( (m.flags & MEM_Dyn)!=0 || db->mallocFailed );
453 assert( m.z || db->mallocFailed );
454 return m.z;
drhaf9a7c22005-12-15 03:04:10 +0000455}
456
/*
** Convert the n-byte UTF-8 string z to the UTF-16 encoding specified by
** parameter enc.  A pointer to the new string is returned, and *pnOut
** is set to the length of the returned string in bytes.  The caller
** should arrange to call sqlite3DbFree() on the returned pointer when it
** is no longer required.
**
** If a malloc failure occurs, NULL is returned and the db.mallocFailed
** flag is set.  *pnOut is left unchanged in that case.
*/
#ifdef SQLITE_ENABLE_STAT3
char *sqlite3Utf8to16(sqlite3 *db, u8 enc, char *z, int n, int *pnOut){
  Mem sText;     /* Scratch cell used to drive the conversion */
  int rc;

  memset(&sText, 0, sizeof(sText));
  sText.db = db;
  sqlite3VdbeMemSetStr(&sText, z, n, SQLITE_UTF8, SQLITE_STATIC);

  rc = sqlite3VdbeMemTranslate(&sText, enc);
  if( rc ){
    assert( db->mallocFailed );
    return 0;
  }

  /* After translation the text lives in the cell's own allocation. */
  assert( sText.z==sText.zMalloc );
  *pnOut = sText.n;
  return sText.z;
}
#endif
dan02fa4692009-08-17 17:06:58 +0000482
483/*
drh1faca752009-10-24 03:04:10 +0000484** zIn is a UTF-16 encoded unicode string at least nChar characters long.
drhaed382f2009-04-01 18:40:32 +0000485** Return the number of bytes in the first nChar unicode characters
486** in pZ. nChar must be non-negative.
danielk19776622cce2004-05-20 11:00:52 +0000487*/
drhee858132007-05-08 20:37:38 +0000488int sqlite3Utf16ByteLen(const void *zIn, int nChar){
drhaed382f2009-04-01 18:40:32 +0000489 int c;
490 unsigned char const *z = zIn;
danielk1977bfd6cce2004-06-18 04:24:54 +0000491 int n = 0;
drh6d116ca2009-10-24 01:55:14 +0000492
danielk1977bfd6cce2004-06-18 04:24:54 +0000493 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
drhaed382f2009-04-01 18:40:32 +0000494 while( n<nChar ){
drh1faca752009-10-24 03:04:10 +0000495 READ_UTF16BE(z, 1, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000496 n++;
danielk19776622cce2004-05-20 11:00:52 +0000497 }
danielk19776622cce2004-05-20 11:00:52 +0000498 }else{
drhaed382f2009-04-01 18:40:32 +0000499 while( n<nChar ){
drh1faca752009-10-24 03:04:10 +0000500 READ_UTF16LE(z, 1, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000501 n++;
danielk19776622cce2004-05-20 11:00:52 +0000502 }
danielk19776622cce2004-05-20 11:00:52 +0000503 }
drhaed382f2009-04-01 18:40:32 +0000504 return (int)(z-(unsigned char const *)zIn);
danielk1977998b56c2004-05-06 23:37:52 +0000505}
506
#if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
** For every value from 0 up to (but not including) 0x110000 it encodes
** the value and decodes it again, asserting that the primitives for
** serializing and deserializing characters in each encoding are inverses
** of each other and that the decoder consumes exactly the bytes written.
*/
void sqlite3UtfSelfTest(void){
  unsigned int i;             /* Unicode value under test */
  unsigned int expect;        /* Value the UTF-8 decoder should return */
  unsigned int c;             /* Value actually decoded */
  unsigned char zBuf[20];     /* Holds one encoded character */
  unsigned char *z;           /* Cursor into zBuf */
  int n;                      /* Number of bytes the encoder produced */

  /* UTF-8.  Surrogates and 0xFFFE/0xFFFF are expected to read back as
  ** 0xFFFD. */
  for(i=0; i<0x00110000; i++){
    z = zBuf;
    WRITE_UTF8(z, i);
    n = (int)(z-zBuf);
    assert( n>0 && n<=4 );
    z[0] = 0;
    z = zBuf;
    c = sqlite3Utf8Read(z, (const u8**)&z);
    if( (i>=0xD800 && i<=0xDFFF) || (i&0xFFFFFFFE)==0xFFFE ){
      expect = 0xFFFD;
    }else{
      expect = i;
    }
    assert( c==expect );
    assert( (z-zBuf)==n );
  }

  /* UTF-16 little-endian.  Surrogate values are skipped. */
  for(i=0; i<0x00110000; i++){
    if( i<0xD800 || i>=0xE000 ){
      z = zBuf;
      WRITE_UTF16LE(z, i);
      n = (int)(z-zBuf);
      assert( n>0 && n<=4 );
      z[0] = 0;
      z = zBuf;
      READ_UTF16LE(z, 1, c);
      assert( c==i );
      assert( (z-zBuf)==n );
    }
  }

  /* UTF-16 big-endian.  Surrogate values are skipped. */
  for(i=0; i<0x00110000; i++){
    if( i<0xD800 || i>=0xE000 ){
      z = zBuf;
      WRITE_UTF16BE(z, i);
      n = (int)(z-zBuf);
      assert( n>0 && n<=4 );
      z[0] = 0;
      z = zBuf;
      READ_UTF16BE(z, 1, c);
      assert( c==i );
      assert( (z-zBuf)==n );
    }
  }
}
#endif /* SQLITE_TEST */
560#endif /* SQLITE_OMIT_UTF16 */