blob: 5ee017c0dc0034e7e8fc5a897fc08de63c20fc5c [file] [log] [blame]
/*
** 2004 April 13
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains routines used to translate between UTF-8,
** UTF-16, UTF-16BE, and UTF-16LE.
**
** $Id: utf.c,v 1.19 2004/06/12 00:42:35 danielk1977 Exp $
**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
**
**
** Notes on UTF-16:  (with wwww+1==uuuuu)
**
**      Word-0               Word-1          Value
**  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
**
**
** BOM or Byte Order Mark:
**     0xff 0xfe   little-endian utf-16 follows
**     0xfe 0xff   big-endian utf-16 follows
**
**
** Handling of malformed strings:
**
** SQLite accepts and processes malformed strings without an error wherever
** possible. However this is not possible when converting between UTF-8 and
** UTF-16.
**
** When converting malformed UTF-8 strings to UTF-16, one instance of the
** replacement character U+FFFD is written for each byte that cannot be
** interpreted as part of a valid unicode character.
**
** When converting malformed UTF-16 strings to UTF-8, one instance of the
** replacement character U+FFFD is written for each pair of bytes that
** cannot be interpreted as part of a valid unicode character.
*/
danielk1977998b56c2004-05-06 23:37:52 +000052#include <assert.h>
danielk1977998b56c2004-05-06 23:37:52 +000053#include "sqliteInt.h"
54
/*
** A cursor over a raw byte buffer.  The UTF-8 and UTF-16 readers and
** writers in this file advance "c" as they consume or produce bytes.
*/
typedef struct UtfString {
  unsigned char *pZ;    /* Raw string data */
  int n;                /* Allocated length of pZ in bytes */
  int c;                /* Number of pZ bytes already read or written */
} UtfString;
61
/*
** These two macros interpret the first two bytes of the array pZ as a
** 16-bit unsigned integer: BE16() for a big-endian interpretation,
** LE16() for little-endian.
**
** Each byte is converted to unsigned char before it is widened, so the
** result is the same for "char *" and "unsigned char *" arguments even
** on platforms where plain char is signed (a byte such as 0xD8 must not
** be sign-extended).  The argument pZ is evaluated twice.
*/
#define BE16(pZ)  ((((unsigned char)(pZ)[0])<<8) + ((unsigned char)(pZ)[1]))
#define LE16(pZ)  ((((unsigned char)(pZ)[1])<<8) + ((unsigned char)(pZ)[0]))
69
/*
** READ_16 interprets the first two bytes of the unsigned char array pZ
** as a 16-bit unsigned int. If big_endian is non-zero the interpretation
** is big-endian, otherwise little-endian.
**
** The big_endian argument is parenthesized so that any expression
** (including one that itself contains ?:) can be passed safely.
*/
#define READ_16(pZ,big_endian) ((big_endian)?BE16(pZ):LE16(pZ))
76
/*
** The following macro, LOWERCASE(x), takes an integer representing a
** unicode code point. The value returned is the same code point folded to
** lower case, if applicable. SQLite currently understands the upper/lower
** case relationship between the 26 characters used in the English
** language only.
**
** This means that characters with umlauts etc. will not be folded
** correctly (unless they are encoded as composite characters, which would
** doubtless cause much trouble).
**
** The macro is a plain expression (no trailing semicolon) and its
** argument is fully parenthesized.  Values outside 0..90, including
** negative values, are returned unchanged and never index the table.
** The argument is evaluated more than once.
*/
#define LOWERCASE(x) \
    (((unsigned int)(x))<91u ? (int)(UpperToLower[(x)]) : (x))
static const unsigned char UpperToLower[91] = {
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
     18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
     36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
     54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
    104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
    122,
};
97
98/*
99** The first parameter, zStr, points at a unicode string. This routine
100** reads a single character from the string and returns the codepoint value
101** of the character read.
102**
danielk1977dc8453f2004-06-12 00:42:34 +0000103** The value of *pEnc is the string encoding. If *pEnc is SQLITE_UTF16LE or
104** SQLITE_UTF16BE, and the first character read is a byte-order-mark, then
danielk1977d02eb1f2004-06-06 09:44:03 +0000105** the value of *pEnc is modified if necessary. In this case the next
106** character is read and it's code-point value returned.
107**
108** The value of *pOffset is the byte-offset in zStr from which to begin
109** reading. It is incremented by the number of bytes read by this function.
110**
111** If the fourth parameter, fold, is non-zero, then codepoint values are
112** folded to lower-case before being returned. See comments for macro
113** LOWERCASE(x) for details.
114*/
115int sqlite3ReadUniChar(const char *zStr, int *pOffset, u8 *pEnc, int fold){
116 int ret = 0;
117
118 switch( *pEnc ){
danielk1977dc8453f2004-06-12 00:42:34 +0000119 case SQLITE_UTF8: {
danielk1977ad7dd422004-06-06 12:41:49 +0000120
121#if 0
122 static const int initVal[] = {
123 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
124 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
125 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
126 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
127 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
128 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
129 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
130 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
131 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
132 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
133 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
134 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
135 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 0, 1, 2,
136 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
137 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,
138 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
139 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 254,
140 255,
141 };
142 ret = initVal[(unsigned char)zStr[(*pOffset)++]];
143 while( (0xc0&zStr[*pOffset])==0x80 ){
144 ret = (ret<<6) | (0x3f&(zStr[(*pOffset)++]));
145 }
146#endif
147
danielk1977d02eb1f2004-06-06 09:44:03 +0000148 struct Utf8TblRow {
149 u8 b1_mask;
150 u8 b1_masked_val;
151 u8 b1_value_mask;
152 int trailing_bytes;
153 };
154 static const struct Utf8TblRow utf8tbl[] = {
155 { 0x80, 0x00, 0x7F, 0 },
156 { 0xE0, 0xC0, 0x1F, 1 },
157 { 0xF0, 0xE0, 0x0F, 2 },
158 { 0xF8, 0xF0, 0x0E, 3 },
159 { 0, 0, 0, 0}
160 };
161
162 u8 b1; /* First byte of the potentially multi-byte utf-8 character */
163 int ii;
164 struct Utf8TblRow const *pRow;
165
166 pRow = &(utf8tbl[0]);
167
168 b1 = zStr[(*pOffset)++];
169 while( pRow->b1_mask && (b1&pRow->b1_mask)!=pRow->b1_masked_val ){
170 pRow++;
171 }
172 if( !pRow->b1_mask ){
173 return (int)0xFFFD;
174 }
175
176 ret = (u32)(b1&pRow->b1_value_mask);
177 for( ii=0; ii<pRow->trailing_bytes; ii++ ){
178 u8 b = zStr[(*pOffset)++];
179 if( (b&0xC0)!=0x80 ){
180 return (int)0xFFFD;
181 }
182 ret = (ret<<6) + (u32)(b&0x3F);
183 }
danielk1977d02eb1f2004-06-06 09:44:03 +0000184 break;
185 }
186
danielk1977dc8453f2004-06-12 00:42:34 +0000187 case SQLITE_UTF16LE:
188 case SQLITE_UTF16BE: {
danielk1977d02eb1f2004-06-06 09:44:03 +0000189 u32 code_point; /* the first code-point in the character */
190 u32 code_point2; /* the second code-point in the character, if any */
191
danielk1977dc8453f2004-06-12 00:42:34 +0000192 code_point = READ_16(&zStr[*pOffset], (*pEnc==SQLITE_UTF16BE));
danielk1977d02eb1f2004-06-06 09:44:03 +0000193 *pOffset += 2;
194
195 /* If this is a non-surrogate code-point, just cast it to an int and
196 ** this is the code-point value.
197 */
198 if( code_point<0xD800 || code_point>0xE000 ){
199 ret = code_point;
200 break;
201 }
202
203 /* If this is a trailing surrogate code-point, then the string is
204 ** malformed; return the replacement character.
205 */
206 if( code_point>0xDBFF ){
207 return (int)0xFFFD;
208 }
209
210 /* The code-point just read is a leading surrogate code-point. If their
211 ** is not enough data left or the next code-point is not a trailing
212 ** surrogate, return the replacement character.
213 */
danielk1977dc8453f2004-06-12 00:42:34 +0000214 code_point2 = READ_16(&zStr[*pOffset], (*pEnc==SQLITE_UTF16BE));
danielk1977d02eb1f2004-06-06 09:44:03 +0000215 *pOffset += 2;
216 if( code_point2<0xDC00 || code_point>0xDFFF ){
217 return (int)0xFFFD;
218 }
219
220 ret = (
221 (((code_point&0x03C0)+0x0040)<<16) + /* uuuuu */
222 ((code_point&0x003F)<<10) + /* xxxxxx */
223 (code_point2&0x03FF) /* yy yyyyyyyy */
224 );
225 }
226 default:
227 assert(0);
228 }
229
230 if( fold ){
231 return LOWERCASE(ret);
232 }
233 return ret;
234}
235
236/*
danielk1977998b56c2004-05-06 23:37:52 +0000237** Read the BOM from the start of *pStr, if one is present. Return zero
238** for little-endian, non-zero for big-endian. If no BOM is present, return
danielk1977b1bc9532004-05-22 03:05:33 +0000239** the value of the parameter "big_endian".
danielk1977998b56c2004-05-06 23:37:52 +0000240**
241** Return values:
242** 1 -> big-endian string
243** 0 -> little-endian string
244*/
danielk1977b1bc9532004-05-22 03:05:33 +0000245static int readUtf16Bom(UtfString *pStr, int big_endian){
danielk1977998b56c2004-05-06 23:37:52 +0000246 /* The BOM must be the first thing read from the string */
247 assert( pStr->c==0 );
248
249 /* If the string data consists of 1 byte or less, the BOM will make no
250 ** difference anyway. In this case just fall through to the default case
251 ** and return the native byte-order for this machine.
252 **
253 ** Otherwise, check the first 2 bytes of the string to see if a BOM is
254 ** present.
255 */
256 if( pStr->n>1 ){
danielk1977193c72f2004-06-02 00:29:24 +0000257 u8 bom = sqlite3UtfReadBom(pStr->pZ, 2);
258 if( bom ){
259 pStr->c += 2;
danielk1977dc8453f2004-06-12 00:42:34 +0000260 return (bom==SQLITE_UTF16LE)?0:1;
danielk1977998b56c2004-05-06 23:37:52 +0000261 }
262 }
263
danielk1977b1bc9532004-05-22 03:05:33 +0000264 return big_endian;
danielk1977998b56c2004-05-06 23:37:52 +0000265}
266
danielk197793d46752004-05-23 13:30:58 +0000267/*
268** zData is a UTF-16 encoded string, nData bytes in length. This routine
269** checks if there is a byte-order mark at the start of zData. If no
danielk1977dc8453f2004-06-12 00:42:34 +0000270** byte order mark is found 0 is returned. Otherwise SQLITE_UTF16BE or
271** SQLITE_UTF16LE is returned, depending on whether The BOM indicates that
danielk197793d46752004-05-23 13:30:58 +0000272** the text is big-endian or little-endian.
273*/
274u8 sqlite3UtfReadBom(const void *zData, int nData){
275 if( nData<0 || nData>1 ){
276 u8 b1 = *(u8 *)zData;
277 u8 b2 = *(((u8 *)zData) + 1);
278 if( b1==0xFE && b2==0xFF ){
danielk1977dc8453f2004-06-12 00:42:34 +0000279 return SQLITE_UTF16BE;
danielk197793d46752004-05-23 13:30:58 +0000280 }
281 if( b1==0xFF && b2==0xFE ){
danielk1977dc8453f2004-06-12 00:42:34 +0000282 return SQLITE_UTF16LE;
danielk197793d46752004-05-23 13:30:58 +0000283 }
284 }
285 return 0;
286}
287
danielk1977998b56c2004-05-06 23:37:52 +0000288
289/*
290** Read a single unicode character from the UTF-8 encoded string *pStr. The
291** value returned is a unicode scalar value. In the case of malformed
292** strings, the unicode replacement character U+FFFD may be returned.
293*/
294static u32 readUtf8(UtfString *pStr){
danielk1977dc8453f2004-06-12 00:42:34 +0000295 u8 enc = SQLITE_UTF8;
danielk1977d02eb1f2004-06-06 09:44:03 +0000296 return sqlite3ReadUniChar(pStr->pZ, &pStr->c, &enc, 0);
danielk1977998b56c2004-05-06 23:37:52 +0000297}
298
299/*
300** Write the unicode character 'code' to the string pStr using UTF-8
301** encoding. SQLITE_NOMEM may be returned if sqlite3Malloc() fails.
302*/
303static int writeUtf8(UtfString *pStr, u32 code){
304 struct Utf8WriteTblRow {
305 u32 max_code;
306 int trailing_bytes;
307 u8 b1_and_mask;
308 u8 b1_or_mask;
309 };
310 static const struct Utf8WriteTblRow utf8tbl[] = {
311 {0x0000007F, 0, 0x7F, 0x00},
312 {0x000007FF, 1, 0xDF, 0xC0},
313 {0x0000FFFF, 2, 0xEF, 0xE0},
314 {0x0010FFFF, 3, 0xF7, 0xF0},
315 {0x00000000, 0, 0x00, 0x00}
316 };
danielk19776622cce2004-05-20 11:00:52 +0000317 const struct Utf8WriteTblRow *pRow = &utf8tbl[0];
danielk1977998b56c2004-05-06 23:37:52 +0000318
danielk1977295ba552004-05-19 10:34:51 +0000319 while( code>pRow->max_code ){
danielk1977998b56c2004-05-06 23:37:52 +0000320 assert( pRow->max_code );
321 pRow++;
322 }
323
324 /* Ensure there is enough room left in the output buffer to write
325 ** this UTF-8 character.
326 */
327 assert( (pStr->n-pStr->c)>=(pRow->trailing_bytes+1) );
328
329 /* Write the UTF-8 encoded character to pStr. All cases below are
330 ** intentionally fall-through.
331 */
332 switch( pRow->trailing_bytes ){
333 case 3:
334 pStr->pZ[pStr->c+3] = (((u8)code)&0x3F)|0x80;
335 code = code>>6;
336 case 2:
337 pStr->pZ[pStr->c+2] = (((u8)code)&0x3F)|0x80;
338 code = code>>6;
339 case 1:
340 pStr->pZ[pStr->c+1] = (((u8)code)&0x3F)|0x80;
341 code = code>>6;
342 case 0:
343 pStr->pZ[pStr->c] = (((u8)code)&(pRow->b1_and_mask))|(pRow->b1_or_mask);
344 }
345 pStr->c += (pRow->trailing_bytes + 1);
346
347 return 0;
348}
349
350/*
351** Read a single unicode character from the UTF-16 encoded string *pStr. The
352** value returned is a unicode scalar value. In the case of malformed
353** strings, the unicode replacement character U+FFFD may be returned.
354**
355** If big_endian is true, the string is assumed to be UTF-16BE encoded.
356** Otherwise, it is UTF-16LE encoded.
357*/
358static u32 readUtf16(UtfString *pStr, int big_endian){
359 u32 code_point; /* the first code-point in the character */
360
361 /* If there is only one byte of data left in the string, return the
362 ** replacement character.
363 */
364 if( (pStr->n-pStr->c)==1 ){
365 pStr->c++;
366 return (int)0xFFFD;
367 }
368
369 code_point = READ_16(&(pStr->pZ[pStr->c]), big_endian);
370 pStr->c += 2;
371
372 /* If this is a non-surrogate code-point, just cast it to an int and
373 ** return the code-point value.
374 */
375 if( code_point<0xD800 || code_point>0xE000 ){
376 return code_point;
377 }
378
379 /* If this is a trailing surrogate code-point, then the string is
380 ** malformed; return the replacement character.
381 */
382 if( code_point>0xDBFF ){
383 return 0xFFFD;
384 }
385
386 /* The code-point just read is a leading surrogate code-point. If their
387 ** is not enough data left or the next code-point is not a trailing
388 ** surrogate, return the replacement character.
389 */
390 if( (pStr->n-pStr->c)>1 ){
391 u32 code_point2 = READ_16(&pStr->pZ[pStr->c], big_endian);
392 if( code_point2<0xDC00 || code_point>0xDFFF ){
393 return 0xFFFD;
394 }
395 pStr->c += 2;
396
397 return (
398 (((code_point&0x03C0)+0x0040)<<16) + /* uuuuu */
399 ((code_point&0x003F)<<10) + /* xxxxxx */
400 (code_point2&0x03FF) /* yy yyyyyyyy */
401 );
402
403 }else{
404 return (int)0xFFFD;
405 }
406
407 /* not reached */
408}
409
410static int writeUtf16(UtfString *pStr, int code, int big_endian){
411 int bytes;
412 unsigned char *hi_byte;
413 unsigned char *lo_byte;
414
415 bytes = (code>0x0000FFFF?4:2);
416
417 /* Ensure there is enough room left in the output buffer to write
418 ** this UTF-8 character.
419 */
420 assert( (pStr->n-pStr->c)>=bytes );
421
422 /* Initialise hi_byte and lo_byte to point at the locations into which
423 ** the MSB and LSB of the (first) 16-bit unicode code-point written for
424 ** this character.
425 */
426 hi_byte = (big_endian?&pStr->pZ[pStr->c]:&pStr->pZ[pStr->c+1]);
427 lo_byte = (big_endian?&pStr->pZ[pStr->c+1]:&pStr->pZ[pStr->c]);
428
429 if( bytes==2 ){
430 *hi_byte = (u8)((code&0x0000FF00)>>8);
431 *lo_byte = (u8)(code&0x000000FF);
432 }else{
433 u32 wrd;
434 wrd = ((((code&0x001F0000)-0x00010000)+(code&0x0000FC00))>>10)|0x0000D800;
435 *hi_byte = (u8)((wrd&0x0000FF00)>>8);
436 *lo_byte = (u8)(wrd&0x000000FF);
437
438 wrd = (code&0x000003FF)|0x0000DC00;
439 *(hi_byte+2) = (u8)((wrd&0x0000FF00)>>8);
440 *(lo_byte+2) = (u8)(wrd&0x000000FF);
441 }
442
443 pStr->c += bytes;
444
445 return 0;
446}
447
448/*
danielk19776622cce2004-05-20 11:00:52 +0000449** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
450** return the number of unicode characters in pZ up to (but not including)
451** the first 0x00 byte. If nByte is not less than zero, return the
452** number of unicode characters in the first nByte of pZ (or up to
453** the first 0x00, whichever comes first).
danielk1977998b56c2004-05-06 23:37:52 +0000454*/
danielk19776622cce2004-05-20 11:00:52 +0000455int sqlite3utf8CharLen(const char *pZ, int nByte){
456 UtfString str;
457 int ret = 0;
458 u32 code = 1;
459
460 str.pZ = (char *)pZ;
461 str.n = nByte;
462 str.c = 0;
463
464 while( (nByte<0 || str.c<str.n) && code!=0 ){
465 code = readUtf8(&str);
466 ret++;
danielk1977998b56c2004-05-06 23:37:52 +0000467 }
danielk19776622cce2004-05-20 11:00:52 +0000468 if( code==0 ) ret--;
469
470 return ret;
471}
472
473/*
474** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
475** return the number of bytes up to (but not including), the first pair
476** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
477** then return the number of bytes in the first nChar unicode characters
478** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
479*/
480int sqlite3utf16ByteLen(const void *pZ, int nChar){
481 if( nChar<0 ){
danielk1977e7d00f52004-05-29 02:44:02 +0000482 const unsigned char *pC1 = (unsigned char *)pZ;
483 const unsigned char *pC2 = (unsigned char *)pZ+1;
danielk19776622cce2004-05-20 11:00:52 +0000484 while( *pC1 || *pC2 ){
485 pC1 += 2;
486 pC2 += 2;
487 }
488 return pC1-(unsigned char *)pZ;
489 }else{
490 UtfString str;
491 u32 code = 1;
492 int big_endian;
493 int nRead = 0;
494 int ret;
495
496 str.pZ = (char *)pZ;
497 str.c = 0;
498 str.n = -1;
499
danielk1977b1bc9532004-05-22 03:05:33 +0000500 /* Check for a BOM. We just ignore it if there is one, it's only read
501 ** so that it is not counted as a character.
502 */
503 big_endian = readUtf16Bom(&str, 0);
danielk19776622cce2004-05-20 11:00:52 +0000504 ret = 0-str.c;
505
506 while( code!=0 && nRead<nChar ){
507 code = readUtf16(&str, big_endian);
508 nRead++;
509 }
510 if( code==0 ){
511 ret -= 2;
512 }
513 return str.c + ret;
514 }
danielk1977998b56c2004-05-06 23:37:52 +0000515}
516
drha5d14fe2004-05-04 15:00:46 +0000517/*
518** Convert a string in UTF-16 native byte (or with a Byte-order-mark or
519** "BOM") into a UTF-8 string. The UTF-8 string is written into space
danielk1977998b56c2004-05-06 23:37:52 +0000520** obtained from sqlite3Malloc() and must be released by the calling function.
drha5d14fe2004-05-04 15:00:46 +0000521**
522** The parameter N is the number of bytes in the UTF-16 string. If N is
523** negative, the entire string up to the first \u0000 character is translated.
524**
525** The returned UTF-8 string is always \000 terminated.
526*/
danielk1977b1bc9532004-05-22 03:05:33 +0000527unsigned char *sqlite3utf16to8(const void *pData, int N, int big_endian){
danielk1977998b56c2004-05-06 23:37:52 +0000528 UtfString in;
529 UtfString out;
danielk1977998b56c2004-05-06 23:37:52 +0000530
531 out.pZ = 0;
532
533 in.pZ = (unsigned char *)pData;
534 in.n = N;
535 in.c = 0;
536
537 if( in.n<0 ){
danielk19776622cce2004-05-20 11:00:52 +0000538 in.n = sqlite3utf16ByteLen(in.pZ, -1);
danielk1977998b56c2004-05-06 23:37:52 +0000539 }
540
541 /* A UTF-8 encoding of a unicode string can require at most 1.5 times as
542 ** much space to store as the same string encoded using UTF-16. Allocate
543 ** this now.
544 */
545 out.n = (in.n*1.5) + 1;
danielk1977295ba552004-05-19 10:34:51 +0000546 out.pZ = sqliteMalloc(out.n);
danielk1977998b56c2004-05-06 23:37:52 +0000547 if( !out.pZ ){
548 return 0;
549 }
550 out.c = 0;
551
danielk1977b1bc9532004-05-22 03:05:33 +0000552 big_endian = readUtf16Bom(&in, big_endian);
danielk1977998b56c2004-05-06 23:37:52 +0000553 while( in.c<in.n ){
554 writeUtf8(&out, readUtf16(&in, big_endian));
555 }
556
557 /* Add the NULL-terminator character */
558 assert( out.c<out.n );
559 out.pZ[out.c] = 0x00;
560
561 return out.pZ;
562}
563
564static void *utf8toUtf16(const unsigned char *pIn, int N, int big_endian){
565 UtfString in;
566 UtfString out;
567
568 in.pZ = (unsigned char *)pIn;
569 in.n = N;
570 in.c = 0;
571
572 if( in.n<0 ){
573 in.n = strlen(in.pZ);
574 }
575
576 /* A UTF-16 encoding of a unicode string can require at most twice as
577 ** much space to store as the same string encoded using UTF-8. Allocate
578 ** this now.
579 */
580 out.n = (in.n*2) + 2;
danielk1977295ba552004-05-19 10:34:51 +0000581 out.pZ = sqliteMalloc(out.n);
danielk1977998b56c2004-05-06 23:37:52 +0000582 if( !out.pZ ){
583 return 0;
584 }
585 out.c = 0;
586
587 while( in.c<in.n ){
588 writeUtf16(&out, readUtf8(&in), big_endian);
589 }
590
591 /* Add the NULL-terminator character */
592 assert( (out.c+1)<out.n );
593 out.pZ[out.c] = 0x00;
594 out.pZ[out.c+1] = 0x00;
595
596 return out.pZ;
597}
598
/*
** Translate the UTF-8 string pIn (N bytes, or up to the terminator if N
** is negative) to UTF-16BE.  All the work is done by utf8toUtf16().
*/
void *sqlite3utf8to16be(const unsigned char *pIn, int N){
  const int bigEndian = 1;
  return utf8toUtf16(pIn, N, bigEndian);
}
605
/*
** Translate the UTF-8 string pIn (N bytes, or up to the terminator if N
** is negative) to UTF-16LE.  All the work is done by utf8toUtf16().
*/
void *sqlite3utf8to16le(const unsigned char *pIn, int N){
  const int bigEndian = 0;
  return utf8toUtf16(pIn, N, bigEndian);
}
609
610/*
611** This routine does the work for sqlite3utf16to16le() and
612** sqlite3utf16to16be(). If big_endian is 1 the input string is
613** transformed in place to UTF-16BE encoding. If big_endian is 0 then
614** the input is transformed to UTF-16LE.
615**
616** Unless the first two bytes of the input string is a BOM, the input is
617** assumed to be UTF-16 encoded using the machines native byte ordering.
618*/
619static void utf16to16(void *pData, int N, int big_endian){
620 UtfString inout;
621 inout.pZ = (unsigned char *)pData;
622 inout.c = 0;
623 inout.n = N;
624
625 if( inout.n<0 ){
danielk19776622cce2004-05-20 11:00:52 +0000626 inout.n = sqlite3utf16ByteLen(inout.pZ, -1);
danielk1977998b56c2004-05-06 23:37:52 +0000627 }
628
drh9c054832004-05-31 18:51:57 +0000629 if( readUtf16Bom(&inout, SQLITE_BIGENDIAN)!=big_endian ){
danielk1977295ba552004-05-19 10:34:51 +0000630 /* swab(&inout.pZ[inout.c], inout.pZ, inout.n-inout.c); */
631 int i;
632 for(i=0; i<(inout.n-inout.c); i += 2){
633 char c1 = inout.pZ[i+inout.c];
634 char c2 = inout.pZ[i+inout.c+1];
635 inout.pZ[i] = c2;
636 inout.pZ[i+1] = c1;
637 }
danielk1977998b56c2004-05-06 23:37:52 +0000638 }else if( inout.c ){
639 memmove(inout.pZ, &inout.pZ[inout.c], inout.n-inout.c);
640 }
danielk1977295ba552004-05-19 10:34:51 +0000641
642 inout.pZ[inout.n-inout.c] = 0x00;
643 inout.pZ[inout.n-inout.c+1] = 0x00;
drha5d14fe2004-05-04 15:00:46 +0000644}
645
/*
** Convert a string in UTF-16 native byte order, or with a BOM, into a
** UTF-16LE string. The conversion occurs in-place: the output overwrites
** the input. N bytes are converted. If N is negative everything is
** converted up to the first \u0000 character.
**
** This is a thin wrapper around utf16to16() with big_endian==0.  That
** routine removes a leading BOM, reverses the bytes of each 16-bit word
** when the source byte order is not little-endian, and stores two 0x00
** bytes after the converted text.
*/
void sqlite3utf16to16le(void *pData, int N){
  const int bigEndian = 0;
  utf16to16(pData, N, bigEndian);
}
drha5d14fe2004-05-04 15:00:46 +0000662
/*
** Convert a string in UTF-16 native byte order, or with a BOM, into a
** UTF-16BE string. The conversion occurs in-place: the output overwrites
** the input. N bytes are converted. If N is negative everything is
** converted up to the first \u0000 character.
**
** This is a thin wrapper around utf16to16() with big_endian==1.  That
** routine removes a leading BOM, reverses the bytes of each 16-bit word
** when the source byte order is not big-endian, and stores two 0x00
** bytes after the converted text.
*/
void sqlite3utf16to16be(void *pData, int N){
  const int bigEndian = 1;
  utf16to16(pData, N, bigEndian);
}
danielk1977998b56c2004-05-06 23:37:52 +0000679
danielk1977b1bc9532004-05-22 03:05:33 +0000680/*
681** This function is used to translate between UTF-8 and UTF-16. The
682** result is returned in dynamically allocated memory.
683*/
684int sqlite3utfTranslate(
drheb2e1762004-05-27 01:53:56 +0000685 const void *zData, int nData, /* Input string */
686 u8 enc1, /* Encoding of zData */
687 void **zOut, int *nOut, /* Output string */
688 u8 enc2 /* Desired encoding of output */
danielk1977b1bc9532004-05-22 03:05:33 +0000689){
danielk1977dc8453f2004-06-12 00:42:34 +0000690 assert( enc1==SQLITE_UTF8 || enc1==SQLITE_UTF16LE || enc1==SQLITE_UTF16BE );
691 assert( enc2==SQLITE_UTF8 || enc2==SQLITE_UTF16LE || enc2==SQLITE_UTF16BE );
danielk1977b1bc9532004-05-22 03:05:33 +0000692 assert(
danielk1977dc8453f2004-06-12 00:42:34 +0000693 (enc1==SQLITE_UTF8 && (enc2==SQLITE_UTF16LE || enc2==SQLITE_UTF16BE)) ||
694 (enc2==SQLITE_UTF8 && (enc1==SQLITE_UTF16LE || enc1==SQLITE_UTF16BE))
danielk1977b1bc9532004-05-22 03:05:33 +0000695 );
danielk19774adee202004-05-08 08:23:19 +0000696
danielk1977dc8453f2004-06-12 00:42:34 +0000697 if( enc1==SQLITE_UTF8 ){
698 if( enc2==SQLITE_UTF16LE ){
danielk1977b1bc9532004-05-22 03:05:33 +0000699 *zOut = sqlite3utf8to16le(zData, nData);
700 }else{
701 *zOut = sqlite3utf8to16be(zData, nData);
702 }
703 if( !(*zOut) ) return SQLITE_NOMEM;
danielk1977c572ef72004-05-27 09:28:41 +0000704 *nOut = sqlite3utf16ByteLen(*zOut, -1);
danielk1977b1bc9532004-05-22 03:05:33 +0000705 }else{
danielk1977dc8453f2004-06-12 00:42:34 +0000706 *zOut = sqlite3utf16to8(zData, nData, enc1==SQLITE_UTF16BE);
danielk1977b1bc9532004-05-22 03:05:33 +0000707 if( !(*zOut) ) return SQLITE_NOMEM;
danielk1977c572ef72004-05-27 09:28:41 +0000708 *nOut = strlen(*zOut);
danielk1977b1bc9532004-05-22 03:05:33 +0000709 }
710 return SQLITE_OK;
711}