/*
** 2004 April 13
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains routines used to translate between UTF-8,
** UTF-16, UTF-16BE, and UTF-16LE.
**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
**
**
** Notes on UTF-16:  (with wwww+1==uuuuu)
**
**      Word-0               Word-1          Value
**  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
**
**
** BOM or Byte Order Mark:
**     0xff 0xfe   little-endian utf-16 follows
**     0xfe 0xff   big-endian utf-16 follows
**
*/
#include "sqliteInt.h"
#include <assert.h>
#include "vdbeInt.h"

#if !defined(SQLITE_AMALGAMATION) && SQLITE_BYTEORDER==0
/*
** The following constant value is used by the SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros.
**
** It is only emitted when this file is compiled outside of the
** amalgamation and SQLITE_BYTEORDER evaluates to 0.
*/
const int sqlite3one = 1;
#endif /* !defined(SQLITE_AMALGAMATION) && SQLITE_BYTEORDER==0 */

/*
** This lookup table is used to help decode the first byte of
** a multi-byte UTF8 character.
**
** The table is indexed by (firstByte - 0xc0) and therefore has 64
** entries, one for each lead byte 0xc0 through 0xff.  Each entry holds
** the payload bits carried by that lead byte:
**
**    0xc0..0xdf   low 5 bits
**    0xe0..0xef   low 4 bits
**    0xf0..0xf7   low 3 bits
**    0xf8..0xfb   low 2 bits
**    0xfc..0xfd   low 1 bit
**    0xfe..0xff   always 0
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};


/*
** Append the UTF-8 encoding of the value c to the buffer at zOut and
** advance zOut past the bytes written:
**
**    c <  0x80       1 byte
**    c <  0x800      2 bytes
**    c <  0x10000    3 bytes
**    otherwise       4 bytes (only the low 21 bits of c are encoded)
**
** zOut must be a modifiable pointer to u8-compatible storage with room
** for up to 4 bytes.  Both arguments are evaluated more than once, so
** they must not have side effects.  No validity check is made on c.
**
** The body is wrapped in do{ }while(0) so that the macro behaves as a
** single statement, including when used as the body of an if/else.
*/
#define WRITE_UTF8(zOut, c) do{                        \
  if( (c)<0x00080 ){                                   \
    *(zOut)++ = (u8)((c)&0xFF);                        \
  }                                                    \
  else if( (c)<0x00800 ){                              \
    *(zOut)++ = 0xC0 + (u8)(((c)>>6)&0x1F);            \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);               \
  }                                                    \
  else if( (c)<0x10000 ){                              \
    *(zOut)++ = 0xE0 + (u8)(((c)>>12)&0x0F);           \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);          \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);               \
  }else{                                               \
    *(zOut)++ = 0xF0 + (u8)(((c)>>18) & 0x07);         \
    *(zOut)++ = 0x80 + (u8)(((c)>>12) & 0x3F);         \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);          \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);               \
  }                                                    \
}while(0)

/*
** Append the little-endian UTF-16 encoding of the value c to the buffer
** at zOut and advance zOut past the bytes written.
**
** Values up to 0xFFFF are written as one 16-bit unit (2 bytes, low byte
** first).  Larger values are written as a surrogate pair (4 bytes):
** the 0xD8xx word first, then the 0xDCxx word, each low byte first.
** c is not range-checked; the surrogate arithmetic is only meaningful
** for values up to 0x10FFFF.
**
** Both arguments are evaluated more than once and must not have side
** effects.  The do{ }while(0) wrapper makes the macro a single statement.
*/
#define WRITE_UTF16LE(zOut, c) do{                                      \
  if( (c)<=0xFFFF ){                                                    \
    *(zOut)++ = (u8)((c)&0x00FF);                                       \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                  \
  }else{                                                                \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0)); \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));              \
    *(zOut)++ = (u8)((c)&0x00FF);                                       \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                         \
  }                                                                     \
}while(0)

/*
** Append the big-endian UTF-16 encoding of the value c to the buffer
** at zOut and advance zOut past the bytes written.
**
** Values up to 0xFFFF are written as one 16-bit unit (2 bytes, high byte
** first).  Larger values are written as a surrogate pair (4 bytes):
** the 0xD8xx word first, then the 0xDCxx word, each high byte first.
** c is not range-checked; the surrogate arithmetic is only meaningful
** for values up to 0x10FFFF.
**
** Both arguments are evaluated more than once and must not have side
** effects.  The do{ }while(0) wrapper makes the macro a single statement.
*/
#define WRITE_UTF16BE(zOut, c) do{                                      \
  if( (c)<=0xFFFF ){                                                    \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                  \
    *(zOut)++ = (u8)((c)&0x00FF);                                       \
  }else{                                                                \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));              \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0)); \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                         \
    *(zOut)++ = (u8)((c)&0x00FF);                                       \
  }                                                                     \
}while(0)

/*
** Translate a single UTF-8 character.  The unicode value is stored
** in c.
**
** During translation, assume that the byte that zTerm points to
** is a 0x00: decoding never reads the byte at zTerm or beyond.
**
** zIn is advanced so that it points to the next unread byte.
**
** zIn must be a modifiable pointer to unsigned bytes and c a modifiable
** unsigned integer.  The arguments are evaluated more than once and
** must not have side effects.  The do{ }while(0) wrapper makes the
** macro a single statement.
**
** Notes On Invalid UTF-8:
**
**  *  This routine never allows a 7-bit character (0x00 through 0x7f) to
**     be encoded as a multi-byte character.  Any multi-byte character that
**     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
**
**  *  This routine never allows a UTF16 surrogate value to be encoded.
**     If a multi-byte character attempts to encode a value between
**     0xd800 and 0xdfff then it is rendered as 0xfffd.
**
**  *  A multi-byte character that decodes to 0xfffe or 0xffff is
**     rendered as 0xfffd.
**
**  *  Bytes in the range of 0x80 through 0xbf which occur as the first
**     byte of a character are interpreted as single-byte characters
**     and rendered as themselves even though they are technically
**     invalid characters.
**
**  *  This routine accepts over-length UTF8 encodings
**     for unicode values 0x80 and greater.  It does not change over-length
**     encodings to 0xfffd as some systems recommend.
*/
#define READ_UTF8(zIn, zTerm, c) do{                       \
  (c) = *((zIn)++);                                        \
  if( (c)>=0xc0 ){                                         \
    (c) = sqlite3Utf8Trans1[(c)-0xc0];                     \
    while( (zIn)!=(zTerm) && (*(zIn) & 0xc0)==0x80 ){      \
      (c) = ((c)<<6) + (0x3f & *((zIn)++));                \
    }                                                      \
    if( (c)<0x80                                           \
        || ((c)&0xFFFFF800)==0xD800                        \
        || ((c)&0xFFFFFFFE)==0xFFFE ){  (c) = 0xFFFD; }    \
  }                                                        \
}while(0)
drh0a32fa62011-06-13 12:19:21 +0000146u32 sqlite3Utf8Read(
drh42610962012-09-17 18:56:32 +0000147 const unsigned char **pz /* Pointer to string from which to read char */
drh66150952007-07-23 19:12:41 +0000148){
shanehdba2cc42011-03-24 17:43:18 +0000149 unsigned int c;
drh769e97e2009-04-01 16:33:37 +0000150
151 /* Same as READ_UTF8() above but without the zTerm parameter.
152 ** For this routine, we assume the UTF8 string is always zero-terminated.
153 */
drh42610962012-09-17 18:56:32 +0000154 c = *((*pz)++);
drh769e97e2009-04-01 16:33:37 +0000155 if( c>=0xc0 ){
156 c = sqlite3Utf8Trans1[c-0xc0];
drh42610962012-09-17 18:56:32 +0000157 while( (*(*pz) & 0xc0)==0x80 ){
158 c = (c<<6) + (0x3f & *((*pz)++));
drh769e97e2009-04-01 16:33:37 +0000159 }
160 if( c<0x80
161 || (c&0xFFFFF800)==0xD800
162 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; }
163 }
drh66150952007-07-23 19:12:41 +0000164 return c;
165}
166
167
168
danielk1977ad76a81e2008-07-29 11:25:14 +0000169
/*
** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
*/
/* #define TRANSLATE_TRACE 1 */

drh6c626082004-11-14 21:56:29 +0000176#ifndef SQLITE_OMIT_UTF16
danielk1977bfd6cce2004-06-18 04:24:54 +0000177/*
178** This routine transforms the internal text encoding used by pMem to
179** desiredEnc. It is an error if the string is already of the desired
180** encoding, or if *pMem does not contain a string value.
181*/
drh4274dae2014-08-24 02:53:23 +0000182SQLITE_NOINLINE int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
drhd4de9f72019-04-14 00:34:20 +0000183 sqlite3_int64 len; /* Maximum length of output string in bytes */
184 unsigned char *zOut; /* Output buffer */
185 unsigned char *zIn; /* Input iterator */
186 unsigned char *zTerm; /* End of input */
187 unsigned char *z; /* Output iterator */
drha39f4c52006-10-04 15:23:21 +0000188 unsigned int c;
danielk1977bfd6cce2004-06-18 04:24:54 +0000189
drhb21c8cd2007-08-21 19:33:56 +0000190 assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
danielk1977bfd6cce2004-06-18 04:24:54 +0000191 assert( pMem->flags&MEM_Str );
192 assert( pMem->enc!=desiredEnc );
193 assert( pMem->enc!=0 );
194 assert( pMem->n>=0 );
195
danielk1977b5402fb2005-01-12 07:15:04 +0000196#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
danielk1977bfd6cce2004-06-18 04:24:54 +0000197 {
drh5ca06322020-01-06 19:23:41 +0000198 StrAccum acc;
199 char zBuf[1000];
200 sqlite3StrAccumInit(&acc, 0, zBuf, sizeof(zBuf), 0);
201 sqlite3VdbeMemPrettyPrint(pMem, &acc);
202 fprintf(stderr, "INPUT: %s\n", sqlite3StrAccumFinish(&acc));
danielk1977ad7dd422004-06-06 12:41:49 +0000203 }
204#endif
205
danielk1977bfd6cce2004-06-18 04:24:54 +0000206 /* If the translation is between UTF-16 little and big endian, then
207 ** all that is required is to swap the byte order. This case is handled
208 ** differently from the others.
danielk1977998b56c2004-05-06 23:37:52 +0000209 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000210 if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
211 u8 temp;
drh71c697e2004-08-08 23:39:19 +0000212 int rc;
drhb21c8cd2007-08-21 19:33:56 +0000213 rc = sqlite3VdbeMemMakeWriteable(pMem);
drh71c697e2004-08-08 23:39:19 +0000214 if( rc!=SQLITE_OK ){
215 assert( rc==SQLITE_NOMEM );
mistachkinfad30392016-02-13 23:43:46 +0000216 return SQLITE_NOMEM_BKPT;
drh71c697e2004-08-08 23:39:19 +0000217 }
drh2646da72005-12-09 20:02:05 +0000218 zIn = (u8*)pMem->z;
drhbbf695d2008-11-07 03:29:33 +0000219 zTerm = &zIn[pMem->n&~1];
danielk1977bfd6cce2004-06-18 04:24:54 +0000220 while( zIn<zTerm ){
221 temp = *zIn;
222 *zIn = *(zIn+1);
223 zIn++;
224 *zIn++ = temp;
225 }
226 pMem->enc = desiredEnc;
227 goto translate_out;
228 }
229
danielk1977d7e69642004-06-23 00:23:49 +0000230 /* Set len to the maximum number of bytes required in the output buffer. */
231 if( desiredEnc==SQLITE_UTF8 ){
232 /* When converting from UTF-16, the maximum growth results from
drha49b8612006-04-16 12:05:03 +0000233 ** translating a 2-byte character to a 4-byte UTF-8 character.
234 ** A single byte is required for the output string
danielk1977d7e69642004-06-23 00:23:49 +0000235 ** nul-terminator.
236 */
drhbbf695d2008-11-07 03:29:33 +0000237 pMem->n &= ~1;
drhd4de9f72019-04-14 00:34:20 +0000238 len = 2 * (sqlite3_int64)pMem->n + 1;
danielk1977d7e69642004-06-23 00:23:49 +0000239 }else{
240 /* When converting from UTF-8 to UTF-16 the maximum growth is caused
241 ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
242 ** character. Two bytes are required in the output buffer for the
243 ** nul-terminator.
244 */
drhd4de9f72019-04-14 00:34:20 +0000245 len = 2 * (sqlite3_int64)pMem->n + 2;
danielk1977d7e69642004-06-23 00:23:49 +0000246 }
247
danielk1977bfd6cce2004-06-18 04:24:54 +0000248 /* Set zIn to point at the start of the input buffer and zTerm to point 1
249 ** byte past the end.
250 **
danielk1977a7a8e142008-02-13 18:25:27 +0000251 ** Variable zOut is set to point at the output buffer, space obtained
252 ** from sqlite3_malloc().
danielk1977bfd6cce2004-06-18 04:24:54 +0000253 */
drh2646da72005-12-09 20:02:05 +0000254 zIn = (u8*)pMem->z;
danielk1977bfd6cce2004-06-18 04:24:54 +0000255 zTerm = &zIn[pMem->n];
danielk1977a7a8e142008-02-13 18:25:27 +0000256 zOut = sqlite3DbMallocRaw(pMem->db, len);
257 if( !zOut ){
mistachkinfad30392016-02-13 23:43:46 +0000258 return SQLITE_NOMEM_BKPT;
danielk1977bfd6cce2004-06-18 04:24:54 +0000259 }
260 z = zOut;
261
262 if( pMem->enc==SQLITE_UTF8 ){
263 if( desiredEnc==SQLITE_UTF16LE ){
264 /* UTF-8 -> UTF-16 Little-endian */
265 while( zIn<zTerm ){
danielk1977ad76a81e2008-07-29 11:25:14 +0000266 READ_UTF8(zIn, zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000267 WRITE_UTF16LE(z, c);
268 }
drhb8dd3152004-09-24 23:20:51 +0000269 }else{
270 assert( desiredEnc==SQLITE_UTF16BE );
danielk1977bfd6cce2004-06-18 04:24:54 +0000271 /* UTF-8 -> UTF-16 Big-endian */
272 while( zIn<zTerm ){
danielk1977ad76a81e2008-07-29 11:25:14 +0000273 READ_UTF8(zIn, zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54 +0000274 WRITE_UTF16BE(z, c);
275 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000276 }
drhea678832008-12-10 19:26:22 +0000277 pMem->n = (int)(z - zOut);
drhb8dd3152004-09-24 23:20:51 +0000278 *z++ = 0;
danielk1977bfd6cce2004-06-18 04:24:54 +0000279 }else{
280 assert( desiredEnc==SQLITE_UTF8 );
281 if( pMem->enc==SQLITE_UTF16LE ){
282 /* UTF-16 Little-endian -> UTF-8 */
283 while( zIn<zTerm ){
drh0184a252020-02-17 23:08:16 +0000284 c = *(zIn++);
285 c += (*(zIn++))<<8;
286 if( c>=0xd800 && c<0xe000 ){
drh4f1315a2020-05-20 15:02:04 +0000287#ifdef SQLITE_REPLACE_INVALID_UTF
drh0184a252020-02-17 23:08:16 +0000288 if( c>=0xdc00 || zIn>=zTerm ){
289 c = 0xfffd;
290 }else{
291 int c2 = *(zIn++);
292 c2 += (*(zIn++))<<8;
293 if( c2<0xdc00 || c2>=0xe000 ){
294 zIn -= 2;
295 c = 0xfffd;
296 }else{
297 c = ((c&0x3ff)<<10) + (c2&0x3ff) + 0x10000;
298 }
299 }
drh4f1315a2020-05-20 15:02:04 +0000300#else
301 if( zIn<zTerm ){
302 int c2 = (*zIn++);
303 c2 += ((*zIn++)<<8);
304 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);
305 }
306#endif
drh0184a252020-02-17 23:08:16 +0000307 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000308 WRITE_UTF8(z, c);
309 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000310 }else{
mihailim7ffb2b52008-06-27 18:59:44 +0000311 /* UTF-16 Big-endian -> UTF-8 */
danielk1977bfd6cce2004-06-18 04:24:54 +0000312 while( zIn<zTerm ){
drh0184a252020-02-17 23:08:16 +0000313 c = (*(zIn++))<<8;
314 c += *(zIn++);
315 if( c>=0xd800 && c<0xe000 ){
drh4f1315a2020-05-20 15:02:04 +0000316#ifdef SQLITE_REPLACE_INVALID_UTF
drh0184a252020-02-17 23:08:16 +0000317 if( c>=0xdc00 || zIn>=zTerm ){
318 c = 0xfffd;
319 }else{
320 int c2 = (*(zIn++))<<8;
321 c2 += *(zIn++);
322 if( c2<0xdc00 || c2>=0xe000 ){
323 zIn -= 2;
324 c = 0xfffd;
325 }else{
326 c = ((c&0x3ff)<<10) + (c2&0x3ff) + 0x10000;
327 }
328 }
drh4f1315a2020-05-20 15:02:04 +0000329#else
330 if( zIn<zTerm ){
331 int c2 = ((*zIn++)<<8);
332 c2 += (*zIn++);
333 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);
334 }
335#endif
drh0184a252020-02-17 23:08:16 +0000336 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000337 WRITE_UTF8(z, c);
338 }
danielk1977998b56c2004-05-06 23:37:52 +0000339 }
drhaa78bec2008-12-09 03:55:14 +0000340 pMem->n = (int)(z - zOut);
danielk1977998b56c2004-05-06 23:37:52 +0000341 }
drhb8dd3152004-09-24 23:20:51 +0000342 *z = 0;
danielk1977d7e69642004-06-23 00:23:49 +0000343 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
danielk1977998b56c2004-05-06 23:37:52 +0000344
drh21b473d2020-06-04 02:50:47 +0000345 c = MEM_Str|MEM_Term|(pMem->flags&(MEM_AffMask|MEM_Subtype));
danielk1977bfd6cce2004-06-18 04:24:54 +0000346 sqlite3VdbeMemRelease(pMem);
drh21b473d2020-06-04 02:50:47 +0000347 pMem->flags = c;
danielk1977bfd6cce2004-06-18 04:24:54 +0000348 pMem->enc = desiredEnc;
drh2646da72005-12-09 20:02:05 +0000349 pMem->z = (char*)zOut;
danielk19775f096132008-03-28 15:44:09 +0000350 pMem->zMalloc = pMem->z;
drh17bcb102014-09-18 21:25:33 +0000351 pMem->szMalloc = sqlite3DbMallocSize(pMem->db, pMem->z);
danielk1977bfd6cce2004-06-18 04:24:54 +0000352
353translate_out:
danielk1977b5402fb2005-01-12 07:15:04 +0000354#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
danielk1977bfd6cce2004-06-18 04:24:54 +0000355 {
drh5ca06322020-01-06 19:23:41 +0000356 StrAccum acc;
357 char zBuf[1000];
358 sqlite3StrAccumInit(&acc, 0, zBuf, sizeof(zBuf), 0);
359 sqlite3VdbeMemPrettyPrint(pMem, &acc);
360 fprintf(stderr, "OUTPUT: %s\n", sqlite3StrAccumFinish(&acc));
danielk1977bfd6cce2004-06-18 04:24:54 +0000361 }
362#endif
363 return SQLITE_OK;
danielk1977998b56c2004-05-06 23:37:52 +0000364}
drhf0f44b72017-07-12 12:19:33 +0000365#endif /* SQLITE_OMIT_UTF16 */
danielk1977998b56c2004-05-06 23:37:52 +0000366
drhf0f44b72017-07-12 12:19:33 +0000367#ifndef SQLITE_OMIT_UTF16
danielk197793d46752004-05-23 13:30:58 +0000368/*
danielk1977bfd6cce2004-06-18 04:24:54 +0000369** This routine checks for a byte-order mark at the beginning of the
370** UTF-16 string stored in *pMem. If one is present, it is removed and
371** the encoding of the Mem adjusted. This routine does not do any
372** byte-swapping, it just sets Mem.enc appropriately.
373**
374** The allocation (static, dynamic etc.) and encoding of the Mem may be
375** changed by this function.
danielk197793d46752004-05-23 13:30:58 +0000376*/
drhb21c8cd2007-08-21 19:33:56 +0000377int sqlite3VdbeMemHandleBom(Mem *pMem){
danielk1977bfd6cce2004-06-18 04:24:54 +0000378 int rc = SQLITE_OK;
379 u8 bom = 0;
380
drh769e97e2009-04-01 16:33:37 +0000381 assert( pMem->n>=0 );
382 if( pMem->n>1 ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000383 u8 b1 = *(u8 *)pMem->z;
384 u8 b2 = *(((u8 *)pMem->z) + 1);
danielk197793d46752004-05-23 13:30:58 +0000385 if( b1==0xFE && b2==0xFF ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000386 bom = SQLITE_UTF16BE;
danielk197793d46752004-05-23 13:30:58 +0000387 }
388 if( b1==0xFF && b2==0xFE ){
danielk1977bfd6cce2004-06-18 04:24:54 +0000389 bom = SQLITE_UTF16LE;
danielk197793d46752004-05-23 13:30:58 +0000390 }
391 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000392
393 if( bom ){
danielk1977a7a8e142008-02-13 18:25:27 +0000394 rc = sqlite3VdbeMemMakeWriteable(pMem);
395 if( rc==SQLITE_OK ){
396 pMem->n -= 2;
397 memmove(pMem->z, &pMem->z[2], pMem->n);
398 pMem->z[pMem->n] = '\0';
399 pMem->z[pMem->n+1] = '\0';
400 pMem->flags |= MEM_Term;
401 pMem->enc = bom;
danielk1977bfd6cce2004-06-18 04:24:54 +0000402 }
danielk1977998b56c2004-05-06 23:37:52 +0000403 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000404 return rc;
danielk1977998b56c2004-05-06 23:37:52 +0000405}
drh6c626082004-11-14 21:56:29 +0000406#endif /* SQLITE_OMIT_UTF16 */
danielk1977998b56c2004-05-06 23:37:52 +0000407
/*
** zIn is a UTF-8 encoded unicode string. If nByte is less than zero,
** return the number of unicode characters in zIn up to (but not including)
** the first 0x00 byte. If nByte is not less than zero, return the
** number of unicode characters in the first nByte of zIn (or up to
** the first 0x00, whichever comes first).
**
** A character is a byte below 0xc0 on its own, or a byte of 0xc0 or
** above together with all continuation bytes (0x80 through 0xbf) that
** follow it.
**
** When nByte is not less than zero, no byte at or beyond zIn[nByte] is
** ever read: the limit is tested before each byte is examined, and a
** multi-byte character cut short by the limit still counts as one
** character.
*/
int sqlite3Utf8CharLen(const char *zIn, int nByte){
  int r = 0;
  const unsigned char *z = (const unsigned char*)zIn;
  const int isBounded = (nByte>=0);
  const unsigned char *zTerm = isBounded ? &z[nByte] : z;  /* Unused if !isBounded */

  while( (!isBounded || z<zTerm) && *z!=0 ){
    if( *(z++)>=0xc0 ){
      /* Skip the continuation bytes of this character */
      while( (!isBounded || z<zTerm) && (*z & 0xc0)==0x80 ){ z++; }
    }
    r++;
  }
  return r;
}

/* This test function is not currently used by the automated test-suite.
** Hence it is only available in debug builds.
*/
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
/*
** Translate UTF-8 to UTF-8.
**
** This has the effect of making sure that the string is well-formed
** UTF-8.  Miscoded characters are removed: every character that
** sqlite3Utf8Read() reports as 0xfffd is dropped from the output.
**
** The translation is done in-place and aborted if the output
** overruns the input.
**
** zIn must be zero-terminated.  The return value is the length in
** bytes of the rewritten string, not counting the terminator.
*/
int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char *zOut = zIn;
  unsigned char *zStart = zIn;
  u32 c;

  while( zIn[0] && zOut<=zIn ){
    c = sqlite3Utf8Read((const u8**)&zIn);
    if( c!=0xfffd ){
      WRITE_UTF8(zOut, c);
    }
  }
  *zOut = 0;
  return (int)(zOut - zStart);
}
#endif

drh6c626082004-11-14 21:56:29 +0000461#ifndef SQLITE_OMIT_UTF16
danielk19776622cce2004-05-20 11:00:52 +0000462/*
drhaf9a7c22005-12-15 03:04:10 +0000463** Convert a UTF-16 string in the native encoding into a UTF-8 string.
drh17435752007-08-16 04:30:38 +0000464** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must
465** be freed by the calling function.
drhaf9a7c22005-12-15 03:04:10 +0000466**
467** NULL is returned if there is an allocation error.
468*/
danb7dca7d2010-03-05 16:32:12 +0000469char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte, u8 enc){
drhaf9a7c22005-12-15 03:04:10 +0000470 Mem m;
471 memset(&m, 0, sizeof(m));
drhb21c8cd2007-08-21 19:33:56 +0000472 m.db = db;
danb7dca7d2010-03-05 16:32:12 +0000473 sqlite3VdbeMemSetStr(&m, z, nByte, enc, SQLITE_STATIC);
drhb21c8cd2007-08-21 19:33:56 +0000474 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);
danielk1977ae72d982007-10-03 08:46:44 +0000475 if( db->mallocFailed ){
476 sqlite3VdbeMemRelease(&m);
477 m.z = 0;
478 }
drh17435752007-08-16 04:30:38 +0000479 assert( (m.flags & MEM_Term)!=0 || db->mallocFailed );
480 assert( (m.flags & MEM_Str)!=0 || db->mallocFailed );
danb7dca7d2010-03-05 16:32:12 +0000481 assert( m.z || db->mallocFailed );
482 return m.z;
drhaf9a7c22005-12-15 03:04:10 +0000483}
484
485/*
drh1faca752009-10-24 03:04:10 +0000486** zIn is a UTF-16 encoded unicode string at least nChar characters long.
drhaed382f2009-04-01 18:40:32 +0000487** Return the number of bytes in the first nChar unicode characters
488** in pZ. nChar must be non-negative.
danielk19776622cce2004-05-20 11:00:52 +0000489*/
drhee858132007-05-08 20:37:38 +0000490int sqlite3Utf16ByteLen(const void *zIn, int nChar){
drhaed382f2009-04-01 18:40:32 +0000491 int c;
492 unsigned char const *z = zIn;
danielk1977bfd6cce2004-06-18 04:24:54 +0000493 int n = 0;
drh6d116ca2009-10-24 01:55:14 +0000494
drh0184a252020-02-17 23:08:16 +0000495 if( SQLITE_UTF16NATIVE==SQLITE_UTF16LE ) z++;
496 while( n<nChar ){
497 c = z[0];
498 z += 2;
499 if( c>=0xd8 && c<0xdc && z[0]>=0xdc && z[0]<0xe0 ) z += 2;
500 n++;
danielk19776622cce2004-05-20 11:00:52 +0000501 }
drh0184a252020-02-17 23:08:16 +0000502 return (int)(z-(unsigned char const *)zIn)
503 - (SQLITE_UTF16NATIVE==SQLITE_UTF16LE);
danielk1977998b56c2004-05-06 23:37:52 +0000504}
505
drh53c14022007-05-10 17:23:11 +0000506#if defined(SQLITE_TEST)
507/*
danielk1977bfd6cce2004-06-18 04:24:54 +0000508** This routine is called from the TCL test function "translate_selftest".
509** It checks that the primitives for serializing and deserializing
510** characters in each encoding are inverses of each other.
511*/
danielk197744a376f2008-08-12 15:04:58 +0000512void sqlite3UtfSelfTest(void){
drhb3fa0e02006-10-19 01:58:43 +0000513 unsigned int i, t;
danielk1977bfd6cce2004-06-18 04:24:54 +0000514 unsigned char zBuf[20];
515 unsigned char *z;
516 int n;
drha39f4c52006-10-04 15:23:21 +0000517 unsigned int c;
danielk1977bfd6cce2004-06-18 04:24:54 +0000518
danielk19771ba1b552004-06-23 13:46:32 +0000519 for(i=0; i<0x00110000; i++){
danielk1977bfd6cce2004-06-18 04:24:54 +0000520 z = zBuf;
521 WRITE_UTF8(z, i);
shane18e526c2008-12-10 22:30:24 +0000522 n = (int)(z-zBuf);
523 assert( n>0 && n<=4 );
drh4a919112007-05-15 11:55:09 +0000524 z[0] = 0;
danielk1977bfd6cce2004-06-18 04:24:54 +0000525 z = zBuf;
drh42610962012-09-17 18:56:32 +0000526 c = sqlite3Utf8Read((const u8**)&z);
drhb3fa0e02006-10-19 01:58:43 +0000527 t = i;
528 if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
529 if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
530 assert( c==t );
danielk1977bfd6cce2004-06-18 04:24:54 +0000531 assert( (z-zBuf)==n );
532 }
danielk1977bfd6cce2004-06-18 04:24:54 +0000533}
drh6c626082004-11-14 21:56:29 +0000534#endif /* SQLITE_TEST */
535#endif /* SQLITE_OMIT_UTF16 */