blob: 4b04c9e282963b0704c2b17554dbe5b39bc45c07 [file] [log] [blame]
/*
** 2004 April 13
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains routines used to translate between UTF-8,
** UTF-16, UTF-16BE, and UTF-16LE.
**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
**
**
** Notes on UTF-16:  (with wwww+1==uuuuu)
**
**      Word-0               Word-1          Value
**  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
**
**
** BOM or Byte Order Mark:
**     0xff 0xfe   little-endian utf-16 follows
**     0xfe 0xff   big-endian utf-16 follows
**
*/
danielk1977998b56c2004-05-06 23:37:52 +000036#include "sqliteInt.h"
drhb659e9b2005-01-28 01:29:08 +000037#include <assert.h>
danielk1977bfd6cce2004-06-18 04:24:54 +000038#include "vdbeInt.h"
danielk1977998b56c2004-05-06 23:37:52 +000039
#if !defined(SQLITE_AMALGAMATION) && SQLITE_BYTEORDER==0
/*
** The following constant value is used by the SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros.
**
** It is only defined here when the build is not an amalgamation and
** SQLITE_BYTEORDER evaluates to 0.
*/
const int sqlite3one = 1;
#endif /* !SQLITE_AMALGAMATION && SQLITE_BYTEORDER==0 */
drh38def052007-03-31 15:27:59 +000047
/*
** This lookup table is used to help decode the first byte of
** a multi-byte UTF8 character.
**
** The table has 64 entries and is indexed by (first_byte - 0xc0), so it
** covers lead bytes 0xc0 through 0xff.  Each entry is the payload bits
** carried by that lead byte:
**
**    0xc0..0xdf   low 5 bits
**    0xe0..0xef   low 4 bits
**    0xf0..0xf7   low 3 bits
**    0xf8..0xfb   low 2 bits
**    0xfc..0xfd   low 1 bit
**    0xfe..0xff   zero
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  /* 0xc0..0xdf */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  /* 0xe0..0xef */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0xf0..0xf7 */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* 0xf8..0xff */
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
62
drh66150952007-07-23 19:12:41 +000063
/*
** Append the UTF-8 encoding of the value c to the buffer at zOut and
** advance zOut past the bytes written.  Between 1 and 4 bytes are
** written depending on the magnitude of c:
**
**    c <  0x80       1 byte
**    c <  0x800      2 bytes
**    c <  0x10000    3 bytes
**    otherwise       4 bytes
**
** zOut must be a modifiable pointer to bytes.  The c argument is
** evaluated several times, so it must not have side effects.
*/
#define WRITE_UTF8(zOut, c) {                          \
  if( (c)<0x00080 ){                                   \
    *(zOut)++ = (u8)((c)&0xFF);                        \
  }                                                    \
  else if( (c)<0x00800 ){                              \
    *(zOut)++ = 0xC0 + (u8)(((c)>>6)&0x1F);            \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);               \
  }                                                    \
  else if( (c)<0x10000 ){                              \
    *(zOut)++ = 0xE0 + (u8)(((c)>>12)&0x0F);           \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);          \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);               \
  }else{                                               \
    *(zOut)++ = 0xF0 + (u8)(((c)>>18) & 0x07);         \
    *(zOut)++ = 0x80 + (u8)(((c)>>12) & 0x3F);         \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);          \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);               \
  }                                                    \
}
83
/*
** Append the little-endian UTF-16 encoding of the value c to the buffer
** at zOut and advance zOut past the bytes written.  Values up to 0xFFFF
** are written as one 16-bit word (2 bytes); larger values are written as
** a surrogate pair (4 bytes), low byte of each word first.
**
** zOut must be a modifiable pointer to bytes.  The c argument is
** evaluated several times, so it must not have side effects.
*/
#define WRITE_UTF16LE(zOut, c) {                                          \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                    \
  }else{                                                                  \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));  \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                           \
  }                                                                       \
}
95
/*
** Append the big-endian UTF-16 encoding of the value c to the buffer
** at zOut and advance zOut past the bytes written.  Values up to 0xFFFF
** are written as one 16-bit word (2 bytes); larger values are written as
** a surrogate pair (4 bytes), high byte of each word first.
**
** zOut must be a modifiable pointer to bytes.  The c argument is
** evaluated several times, so it must not have side effects.
*/
#define WRITE_UTF16BE(zOut, c) {                                          \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                    \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
  }else{                                                                  \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));  \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                           \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
  }                                                                       \
}
107
/*
** Read one character from the little-endian UTF-16 text at zIn, store
** its value in c, and advance zIn past the bytes consumed.
**
** Two bytes are always consumed.  If the 16-bit word read lies in the
** range 0xD800..0xDFFF and TERM is true, a second 16-bit word is consumed
** as well and the two are combined as a surrogate pair.  TERM is
** evaluated after the first word has been consumed, so an expression such
** as "zIn<zTerm" tests whether any input remains for the second word.
**
** zIn must be a modifiable pointer to unsigned bytes and c must be an
** integer lvalue.
*/
#define READ_UTF16LE(zIn, TERM, c){                                         \
  (c) = (*(zIn)++);                                                         \
  (c) += ((*(zIn)++)<<8);                                                   \
  if( (c)>=0xD800 && (c)<0xE000 && (TERM) ){                                \
    int c2 = (*(zIn)++);                                                    \
    c2 += ((*(zIn)++)<<8);                                                  \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);   \
  }                                                                         \
}
117
/*
** Read one character from the big-endian UTF-16 text at zIn, store
** its value in c, and advance zIn past the bytes consumed.
**
** Two bytes are always consumed.  If the 16-bit word read lies in the
** range 0xD800..0xDFFF and TERM is true, a second 16-bit word is consumed
** as well and the two are combined as a surrogate pair.  TERM is
** evaluated after the first word has been consumed, so an expression such
** as "zIn<zTerm" tests whether any input remains for the second word.
**
** zIn must be a modifiable pointer to unsigned bytes and c must be an
** integer lvalue.
*/
#define READ_UTF16BE(zIn, TERM, c){                                         \
  (c) = ((*(zIn)++)<<8);                                                    \
  (c) += (*(zIn)++);                                                        \
  if( (c)>=0xD800 && (c)<0xE000 && (TERM) ){                                \
    int c2 = ((*(zIn)++)<<8);                                               \
    c2 += (*(zIn)++);                                                       \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);   \
  }                                                                         \
}
127
/*
** Translate a single UTF-8 character.  The unicode value is stored in c.
**
** Bytes are read starting at zIn, and zIn is left pointing at the next
** unread byte.  Reading of continuation bytes stops at zTerm, which is
** treated as the end of the input.
**
** zIn must be a modifiable pointer to unsigned bytes and c must be an
** unsigned integer lvalue; both are evaluated more than once.
**
** Notes On Invalid UTF-8:
**
**  *  This routine never allows a 7-bit character (0x00 through 0x7f) to
**     be encoded as a multi-byte character.  Any multi-byte character that
**     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
**
**  *  This routine never allows a UTF16 surrogate value to be encoded.
**     If a multi-byte character attempts to encode a value between
**     0xd800 and 0xdfff then it is rendered as 0xfffd.
**
**  *  Multi-byte characters that encode 0xfffe or 0xffff are also
**     rendered as 0xfffd.
**
**  *  Bytes in the range of 0x80 through 0xbf which occur as the first
**     byte of a character are interpreted as single-byte characters
**     and rendered as themselves even though they are technically
**     invalid characters.
**
**  *  This routine accepts over-length UTF8 encodings
**     for unicode values 0x80 and greater.  It does not change over-length
**     encodings to 0xfffd as some systems recommend.
*/
#define READ_UTF8(zIn, zTerm, c) {                         \
  (c) = *((zIn)++);                                        \
  if( (c)>=0xc0 ){                                         \
    (c) = sqlite3Utf8Trans1[(c)-0xc0];                     \
    while( (zIn)!=(zTerm) && (*(zIn) & 0xc0)==0x80 ){      \
      (c) = ((c)<<6) + (0x3f & *((zIn)++));                \
    }                                                      \
    if( (c)<0x80                                           \
        || ((c)&0xFFFFF800)==0xD800                        \
        || ((c)&0xFFFFFFFE)==0xFFFE ){  (c) = 0xFFFD; }    \
  }                                                        \
}
drh0a32fa62011-06-13 12:19:21 +0000166u32 sqlite3Utf8Read(
drh42610962012-09-17 18:56:32 +0000167 const unsigned char **pz /* Pointer to string from which to read char */
drh66150952007-07-23 19:12:41 +0000168){
shanehdba2cc42011-03-24 17:43:18 +0000169 unsigned int c;
drh769e97e2009-04-01 16:33:37 +0000170
171 /* Same as READ_UTF8() above but without the zTerm parameter.
172 ** For this routine, we assume the UTF8 string is always zero-terminated.
173 */
drh42610962012-09-17 18:56:32 +0000174 c = *((*pz)++);
drh769e97e2009-04-01 16:33:37 +0000175 if( c>=0xc0 ){
176 c = sqlite3Utf8Trans1[c-0xc0];
drh42610962012-09-17 18:56:32 +0000177 while( (*(*pz) & 0xc0)==0x80 ){
178 c = (c<<6) + (0x3f & *((*pz)++));
drh769e97e2009-04-01 16:33:37 +0000179 }
180 if( c<0x80
181 || (c&0xFFFFF800)==0xD800
182 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; }
183 }
drh66150952007-07-23 19:12:41 +0000184 return c;
185}
186
187
188
danielk1977ad76a81e2008-07-29 11:25:14 +0000189
/*
** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
** The tracing code is only compiled in when SQLITE_DEBUG is also defined.
*/
/* #define TRANSLATE_TRACE 1 */
195
drh6c626082004-11-14 21:56:29 +0000196#ifndef SQLITE_OMIT_UTF16
danielk1977bfd6cce2004-06-18 04:24:54 +0000197/*
198** This routine transforms the internal text encoding used by pMem to
199** desiredEnc. It is an error if the string is already of the desired
200** encoding, or if *pMem does not contain a string value.
201*/
drh4274dae2014-08-24 02:53:23 +0000202SQLITE_NOINLINE int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
danielk1977bfd6cce2004-06-18 04:24:54 +0000203 int len; /* Maximum length of output string in bytes */
204 unsigned char *zOut; /* Output buffer */
205 unsigned char *zIn; /* Input iterator */
206 unsigned char *zTerm; /* End of input */
207 unsigned char *z; /* Output iterator */
drha39f4c52006-10-04 15:23:21 +0000208 unsigned int c;
danielk1977bfd6cce2004-06-18 04:24:54 +0000209
drhb21c8cd2007-08-21 19:33:56 +0000210 assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
danielk1977bfd6cce2004-06-18 04:24:54 +0000211 assert( pMem->flags&MEM_Str );
212 assert( pMem->enc!=desiredEnc );
213 assert( pMem->enc!=0 );
214 assert( pMem->n>=0 );
215
danielk1977b5402fb2005-01-12 07:15:04 +0000216#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
danielk1977bfd6cce2004-06-18 04:24:54 +0000217 {
218 char zBuf[100];
drh74161702006-02-24 02:53:49 +0000219 sqlite3VdbeMemPrettyPrint(pMem, zBuf);
danielk1977bfd6cce2004-06-18 04:24:54 +0000220 fprintf(stderr, "INPUT: %s\n", zBuf);
danielk1977ad7dd422004-06-06 12:41:49 +0000221 }
222#endif
223
danielk1977bfd6cce2004-06-18 04:24:54 +0000224 /* If the translation is between UTF-16 little and big endian, then
225 ** all that is required is to swap the byte order. This case is handled
226 ** differently from the others.
danielk1977998b56c2004-05-06 23:37:52 +0000227 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000228 if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
229 u8 temp;
drh71c697e2004-08-08 23:39:19 +0000230 int rc;
drhb21c8cd2007-08-21 19:33:56 +0000231 rc = sqlite3VdbeMemMakeWriteable(pMem);
drh71c697e2004-08-08 23:39:19 +0000232 if( rc!=SQLITE_OK ){
233 assert( rc==SQLITE_NOMEM );
mistachkinfad30392016-02-13 23:43:46 +0000234 return SQLITE_NOMEM_BKPT;
drh71c697e2004-08-08 23:39:19 +0000235 }
drh2646da72005-12-09 20:02:05 +0000236 zIn = (u8*)pMem->z;
drhbbf695d2008-11-07 03:29:33 +0000237 zTerm = &zIn[pMem->n&~1];
danielk1977bfd6cce2004-06-18 04:24:54 +0000238 while( zIn<zTerm ){
239 temp = *zIn;
240 *zIn = *(zIn+1);
241 zIn++;
242 *zIn++ = temp;
243 }
244 pMem->enc = desiredEnc;
245 goto translate_out;
246 }
247
danielk1977d7e69642004-06-23 00:23:49 +0000248 /* Set len to the maximum number of bytes required in the output buffer. */
249 if( desiredEnc==SQLITE_UTF8 ){
250 /* When converting from UTF-16, the maximum growth results from
drha49b8612006-04-16 12:05:03 +0000251 ** translating a 2-byte character to a 4-byte UTF-8 character.
252 ** A single byte is required for the output string
danielk1977d7e69642004-06-23 00:23:49 +0000253 ** nul-terminator.
254 */
drhbbf695d2008-11-07 03:29:33 +0000255 pMem->n &= ~1;
drha49b8612006-04-16 12:05:03 +0000256 len = pMem->n * 2 + 1;
danielk1977d7e69642004-06-23 00:23:49 +0000257 }else{
258 /* When converting from UTF-8 to UTF-16 the maximum growth is caused
259 ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
260 ** character. Two bytes are required in the output buffer for the
261 ** nul-terminator.
262 */
263 len = pMem->n * 2 + 2;
264 }
265
danielk1977bfd6cce2004-06-18 04:24:54 +0000266 /* Set zIn to point at the start of the input buffer and zTerm to point 1
267 ** byte past the end.
268 **
danielk1977a7a8e142008-02-13 18:25:27 +0000269 ** Variable zOut is set to point at the output buffer, space obtained
270 ** from sqlite3_malloc().
danielk1977bfd6cce2004-06-18 04:24:54 +0000271 */
drh2646da72005-12-09 20:02:05 +0000272 zIn = (u8*)pMem->z;
danielk1977bfd6cce2004-06-18 04:24:54 +0000273 zTerm = &zIn[pMem->n];
danielk1977a7a8e142008-02-13 18:25:27 +0000274 zOut = sqlite3DbMallocRaw(pMem->db, len);
275 if( !zOut ){
mistachkinfad30392016-02-13 23:43:46 +0000276 return SQLITE_NOMEM_BKPT;
danielk1977bfd6cce2004-06-18 04:24:54 +0000277 }
278 z = zOut;
279
280 if( pMem->enc==SQLITE_UTF8 ){
281 if( desiredEnc==SQLITE_UTF16LE ){
282 /* UTF-8 -> UTF-16 Little-endian */
283 while( zIn<zTerm ){
danielk1977ad76a81e2008-07-29 11:25:14 +0000284 READ_UTF8(zIn, zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000285 WRITE_UTF16LE(z, c);
286 }
drhb8dd3152004-09-24 23:20:51 +0000287 }else{
288 assert( desiredEnc==SQLITE_UTF16BE );
danielk1977bfd6cce2004-06-18 04:24:54 +0000289 /* UTF-8 -> UTF-16 Big-endian */
290 while( zIn<zTerm ){
danielk1977ad76a81e2008-07-29 11:25:14 +0000291 READ_UTF8(zIn, zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000292 WRITE_UTF16BE(z, c);
293 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000294 }
drhea678832008-12-10 19:26:22 +0000295 pMem->n = (int)(z - zOut);
drhb8dd3152004-09-24 23:20:51 +0000296 *z++ = 0;
danielk1977bfd6cce2004-06-18 04:24:54 +0000297 }else{
298 assert( desiredEnc==SQLITE_UTF8 );
299 if( pMem->enc==SQLITE_UTF16LE ){
300 /* UTF-16 Little-endian -> UTF-8 */
301 while( zIn<zTerm ){
drh1faca752009-10-24 03:04:10 +0000302 READ_UTF16LE(zIn, zIn<zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000303 WRITE_UTF8(z, c);
304 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000305 }else{
mihailim7ffb2b52008-06-27 18:59:44 +0000306 /* UTF-16 Big-endian -> UTF-8 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000307 while( zIn<zTerm ){
drh1faca752009-10-24 03:04:10 +0000308 READ_UTF16BE(zIn, zIn<zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000309 WRITE_UTF8(z, c);
310 }
danielk1977998b56c2004-05-06 23:37:52 +0000311 }
drhaa78bec2008-12-09 03:55:14 +0000312 pMem->n = (int)(z - zOut);
danielk1977998b56c2004-05-06 23:37:52 +0000313 }
drhb8dd3152004-09-24 23:20:51 +0000314 *z = 0;
danielk1977d7e69642004-06-23 00:23:49 +0000315 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
danielk1977998b56c2004-05-06 23:37:52 +0000316
drh6b478bc2014-09-16 21:54:11 +0000317 c = pMem->flags;
danielk1977bfd6cce2004-06-18 04:24:54 +0000318 sqlite3VdbeMemRelease(pMem);
dan5b6c8e42016-01-30 15:46:03 +0000319 pMem->flags = MEM_Str|MEM_Term|(c&(MEM_AffMask|MEM_Subtype));
danielk1977bfd6cce2004-06-18 04:24:54 +0000320 pMem->enc = desiredEnc;
drh2646da72005-12-09 20:02:05 +0000321 pMem->z = (char*)zOut;
danielk19775f096132008-03-28 15:44:09 +0000322 pMem->zMalloc = pMem->z;
drh17bcb102014-09-18 21:25:33 +0000323 pMem->szMalloc = sqlite3DbMallocSize(pMem->db, pMem->z);
danielk1977bfd6cce2004-06-18 04:24:54 +0000324
325translate_out:
danielk1977b5402fb2005-01-12 07:15:04 +0000326#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
danielk1977bfd6cce2004-06-18 04:24:54 +0000327 {
328 char zBuf[100];
drh74161702006-02-24 02:53:49 +0000329 sqlite3VdbeMemPrettyPrint(pMem, zBuf);
danielk1977bfd6cce2004-06-18 04:24:54 +0000330 fprintf(stderr, "OUTPUT: %s\n", zBuf);
331 }
332#endif
333 return SQLITE_OK;
danielk1977998b56c2004-05-06 23:37:52 +0000334}
drhf0f44b72017-07-12 12:19:33 +0000335#endif /* SQLITE_OMIT_UTF16 */
danielk1977998b56c2004-05-06 23:37:52 +0000336
drhf0f44b72017-07-12 12:19:33 +0000337#ifndef SQLITE_OMIT_UTF16
danielk197793d46752004-05-23 13:30:58 +0000338/*
danielk1977bfd6cce2004-06-18 04:24:54 +0000339** This routine checks for a byte-order mark at the beginning of the
340** UTF-16 string stored in *pMem. If one is present, it is removed and
341** the encoding of the Mem adjusted. This routine does not do any
342** byte-swapping, it just sets Mem.enc appropriately.
343**
344** The allocation (static, dynamic etc.) and encoding of the Mem may be
345** changed by this function.
danielk197793d46752004-05-23 13:30:58 +0000346*/
drhb21c8cd2007-08-21 19:33:56 +0000347int sqlite3VdbeMemHandleBom(Mem *pMem){
danielk1977bfd6cce2004-06-18 04:24:54 +0000348 int rc = SQLITE_OK;
349 u8 bom = 0;
350
drh769e97e2009-04-01 16:33:37 +0000351 assert( pMem->n>=0 );
352 if( pMem->n>1 ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000353 u8 b1 = *(u8 *)pMem->z;
354 u8 b2 = *(((u8 *)pMem->z) + 1);
danielk197793d46752004-05-23 13:30:58 +0000355 if( b1==0xFE && b2==0xFF ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000356 bom = SQLITE_UTF16BE;
danielk197793d46752004-05-23 13:30:58 +0000357 }
358 if( b1==0xFF && b2==0xFE ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000359 bom = SQLITE_UTF16LE;
danielk197793d46752004-05-23 13:30:58 +0000360 }
361 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000362
363 if( bom ){
danielk1977a7a8e142008-02-13 18:25:27 +0000364 rc = sqlite3VdbeMemMakeWriteable(pMem);
365 if( rc==SQLITE_OK ){
366 pMem->n -= 2;
367 memmove(pMem->z, &pMem->z[2], pMem->n);
368 pMem->z[pMem->n] = '\0';
369 pMem->z[pMem->n+1] = '\0';
370 pMem->flags |= MEM_Term;
371 pMem->enc = bom;
danielk1977bfd6cce2004-06-18 04:24:54 +0000372 }
danielk1977998b56c2004-05-06 23:37:52 +0000373 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000374 return rc;
danielk1977998b56c2004-05-06 23:37:52 +0000375}
drh6c626082004-11-14 21:56:29 +0000376#endif /* SQLITE_OMIT_UTF16 */
danielk1977998b56c2004-05-06 23:37:52 +0000377
378/*
danielk19776622cce2004-05-20 11:00:52 +0000379** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
380** return the number of unicode characters in pZ up to (but not including)
381** the first 0x00 byte. If nByte is not less than zero, return the
382** number of unicode characters in the first nByte of pZ (or up to
383** the first 0x00, whichever comes first).
danielk1977998b56c2004-05-06 23:37:52 +0000384*/
drh4a919112007-05-15 11:55:09 +0000385int sqlite3Utf8CharLen(const char *zIn, int nByte){
danielk1977bfd6cce2004-06-18 04:24:54 +0000386 int r = 0;
drh4a919112007-05-15 11:55:09 +0000387 const u8 *z = (const u8*)zIn;
388 const u8 *zTerm;
danielk19771ba1b552004-06-23 13:46:32 +0000389 if( nByte>=0 ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000390 zTerm = &z[nByte];
391 }else{
drh4a919112007-05-15 11:55:09 +0000392 zTerm = (const u8*)(-1);
danielk1977998b56c2004-05-06 23:37:52 +0000393 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000394 assert( z<=zTerm );
395 while( *z!=0 && z<zTerm ){
drh4a919112007-05-15 11:55:09 +0000396 SQLITE_SKIP_UTF8(z);
danielk1977bfd6cce2004-06-18 04:24:54 +0000397 r++;
398 }
399 return r;
danielk19776622cce2004-05-20 11:00:52 +0000400}
401
/* This test function is not currently used by the automated test-suite.
** Hence it is only available in debug builds.
*/
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
/*
** Translate UTF-8 to UTF-8.
**
** This has the effect of making sure that the string is well-formed
** UTF-8.  Miscoded characters are removed.
**
** The translation is done in-place and aborted if the output
** overruns the input.
**
** zIn must be zero-terminated.  The return value is the length in bytes
** of the rewritten string, not counting the terminator.
*/
int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char *zOut = zIn;
  unsigned char *zStart = zIn;
  u32 c;

  while( zIn[0] && zOut<=zIn ){
    c = sqlite3Utf8Read((const u8**)&zIn);
    /* 0xfffd is what sqlite3Utf8Read() returns for miscoded input */
    if( c!=0xfffd ){
      WRITE_UTF8(zOut, c);
    }
  }
  *zOut = 0;
  return (int)(zOut - zStart);
}
#endif
430
drh6c626082004-11-14 21:56:29 +0000431#ifndef SQLITE_OMIT_UTF16
danielk19776622cce2004-05-20 11:00:52 +0000432/*
drhaf9a7c22005-12-15 03:04:10 +0000433** Convert a UTF-16 string in the native encoding into a UTF-8 string.
drh17435752007-08-16 04:30:38 +0000434** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must
435** be freed by the calling function.
drhaf9a7c22005-12-15 03:04:10 +0000436**
437** NULL is returned if there is an allocation error.
438*/
danb7dca7d2010-03-05 16:32:12 +0000439char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte, u8 enc){
drhaf9a7c22005-12-15 03:04:10 +0000440 Mem m;
441 memset(&m, 0, sizeof(m));
drhb21c8cd2007-08-21 19:33:56 +0000442 m.db = db;
danb7dca7d2010-03-05 16:32:12 +0000443 sqlite3VdbeMemSetStr(&m, z, nByte, enc, SQLITE_STATIC);
drhb21c8cd2007-08-21 19:33:56 +0000444 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);
danielk1977ae72d982007-10-03 08:46:44 +0000445 if( db->mallocFailed ){
446 sqlite3VdbeMemRelease(&m);
447 m.z = 0;
448 }
drh17435752007-08-16 04:30:38 +0000449 assert( (m.flags & MEM_Term)!=0 || db->mallocFailed );
450 assert( (m.flags & MEM_Str)!=0 || db->mallocFailed );
danb7dca7d2010-03-05 16:32:12 +0000451 assert( m.z || db->mallocFailed );
452 return m.z;
drhaf9a7c22005-12-15 03:04:10 +0000453}
454
455/*
drh1faca752009-10-24 03:04:10 +0000456** zIn is a UTF-16 encoded unicode string at least nChar characters long.
drhaed382f2009-04-01 18:40:32 +0000457** Return the number of bytes in the first nChar unicode characters
458** in pZ. nChar must be non-negative.
danielk19776622cce2004-05-20 11:00:52 +0000459*/
drhee858132007-05-08 20:37:38 +0000460int sqlite3Utf16ByteLen(const void *zIn, int nChar){
drhaed382f2009-04-01 18:40:32 +0000461 int c;
462 unsigned char const *z = zIn;
danielk1977bfd6cce2004-06-18 04:24:54 +0000463 int n = 0;
drh6d116ca2009-10-24 01:55:14 +0000464
danielk1977bfd6cce2004-06-18 04:24:54 +0000465 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
drhaed382f2009-04-01 18:40:32 +0000466 while( n<nChar ){
drh1faca752009-10-24 03:04:10 +0000467 READ_UTF16BE(z, 1, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000468 n++;
danielk19776622cce2004-05-20 11:00:52 +0000469 }
danielk19776622cce2004-05-20 11:00:52 +0000470 }else{
drhaed382f2009-04-01 18:40:32 +0000471 while( n<nChar ){
drh1faca752009-10-24 03:04:10 +0000472 READ_UTF16LE(z, 1, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000473 n++;
danielk19776622cce2004-05-20 11:00:52 +0000474 }
danielk19776622cce2004-05-20 11:00:52 +0000475 }
drhaed382f2009-04-01 18:40:32 +0000476 return (int)(z-(unsigned char const *)zIn);
danielk1977998b56c2004-05-06 23:37:52 +0000477}
478
#if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
** It checks that the primitives for serializing and deserializing
** characters in each encoding are inverses of each other.
**
** Every value from 0 up to (but not including) 0x110000 is written and
** read back.  Failures are reported through assert().
*/
void sqlite3UtfSelfTest(void){
  unsigned int i, t;
  unsigned char zBuf[20];
  unsigned char *z;
  int n;
  unsigned int c;

  /* UTF-8 round trip.  Surrogates, 0xFFFE and 0xFFFF are expected to
  ** read back as 0xFFFD. */
  for(i=0; i<0x00110000; i++){
    z = zBuf;
    WRITE_UTF8(z, i);
    n = (int)(z-zBuf);
    assert( n>0 && n<=4 );
    z[0] = 0;
    z = zBuf;
    c = sqlite3Utf8Read((const u8**)&z);
    t = i;
    if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
    if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
    assert( c==t );
    assert( (z-zBuf)==n );
  }

  /* UTF-16LE round trip.  Surrogate values are skipped. */
  for(i=0; i<0x00110000; i++){
    if( i>=0xD800 && i<0xE000 ) continue;
    z = zBuf;
    WRITE_UTF16LE(z, i);
    n = (int)(z-zBuf);
    assert( n>0 && n<=4 );
    z[0] = 0;
    z = zBuf;
    READ_UTF16LE(z, 1, c);
    assert( c==i );
    assert( (z-zBuf)==n );
  }

  /* UTF-16BE round trip.  Surrogate values are skipped. */
  for(i=0; i<0x00110000; i++){
    if( i>=0xD800 && i<0xE000 ) continue;
    z = zBuf;
    WRITE_UTF16BE(z, i);
    n = (int)(z-zBuf);
    assert( n>0 && n<=4 );
    z[0] = 0;
    z = zBuf;
    READ_UTF16BE(z, 1, c);
    assert( c==i );
    assert( (z-zBuf)==n );
  }
}
#endif /* SQLITE_TEST */
532#endif /* SQLITE_OMIT_UTF16 */