blob: d9c7e96d7ab6ed2c7c9461a3870d11af891ef404 [file] [log] [blame]
drha5d14fe2004-05-04 15:00:46 +00001/*
2** 2004 April 13
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This file contains routines used to translate between UTF-8,
13** UTF-16, UTF-16BE, and UTF-16LE.
14**
danielk1977998b56c2004-05-06 23:37:52 +000015** $Id: utf.c,v 1.2 2004/05/06 23:37:53 danielk1977 Exp $
drha5d14fe2004-05-04 15:00:46 +000016**
17** Notes on UTF-8:
18**
19** Byte-0    Byte-1    Byte-2    Byte-3    Value
20** 0xxxxxxx                                00000000 00000000 0xxxxxxx
21** 110yyyyy  10xxxxxx                      00000000 00000yyy yyxxxxxx
22** 1110zzzz  10yyyyyy  10xxxxxx            00000000 zzzzyyyy yyxxxxxx
23** 11110uuu  10uuzzzz  10yyyyyy  10xxxxxx  000uuuuu zzzzyyyy yyxxxxxx
24**
25**
26** Notes on UTF-16: (with wwww+1==uuuuu)
27**
28** Word-0            Word-1            Value
29** 110110wwwwxxxxxx  110111yyyyyyyyyy  000uuuuu xxxxxxyy yyyyyyyy
30** xxxxxxxxyyyyyyyy                    00000000 xxxxxxxx yyyyyyyy
31**
danielk1977998b56c2004-05-06 23:37:52 +000032**
drha5d14fe2004-05-04 15:00:46 +000033** BOM or Byte Order Mark:
34** 0xff 0xfe little-endian utf-16 follows
35** 0xfe 0xff big-endian utf-16 follows
danielk1977998b56c2004-05-06 23:37:52 +000036**
37**
38** Handling of malformed strings:
39**
40** SQLite accepts and processes malformed strings without an error wherever
41** possible. However this is not possible when converting between UTF-8 and
42** UTF-16.
43**
44** When converting malformed UTF-8 strings to UTF-16, one instance of the
45** replacement character U+FFFD is written for each byte that cannot be
46** interpreted as part of a valid unicode character.
47**
48** When converting malformed UTF-16 strings to UTF-8, one instance of the
49** replacement character U+FFFD is written for each pair of bytes that
50** cannot be interpreted as part of a valid unicode character.
drha5d14fe2004-05-04 15:00:46 +000051*/
52
danielk1977998b56c2004-05-06 23:37:52 +000053#include <assert.h>
54#include <unistd.h>
55#include "sqliteInt.h"
56
/*
** A cursor over a raw byte buffer holding encoded text. The same structure
** is used both for reading an input string and for filling an output one.
*/
typedef struct UtfString {
  unsigned char *pZ;  /* Start of the raw string data */
  int n;              /* Size of the pZ buffer, in bytes */
  int c;              /* Count of pZ bytes consumed (read) or produced (written) */
} UtfString;
63
/* TODO: Implement this macro in os.h. It should be 1 on big-endian
** machines, and 0 on little-endian.
*/
#define SQLITE3_NATIVE_BIGENDIAN 0

/*
** The two byte-order marks, expressed as the value obtained when the first
** two bytes of a string are combined with the first byte in the high-order
** position (which is what BE16() does):
**
**     0xfe 0xff  ->  0xFEFF  big-endian utf-16 follows
**     0xff 0xfe  ->  0xFFFE  little-endian utf-16 follows
**
** Because that combination is built one byte at a time, the result is the
** same on every host. These constants therefore must NOT depend on
** SQLITE3_NATIVE_BIGENDIAN; making them conditional would swap the meaning
** of the two marks on big-endian machines.
*/
#define BOM_BIGENDIAN    0x0000FEFF
#define BOM_LITTLEENDIAN 0x0000FFFE
76
/*
** These two macros are used to interpret the first two bytes of the
** unsigned char array pZ as a 16-bit unsigned quantity. BE16() for a
** big-endian interpretation, LE16() for little-endian.
**
** Both macros evaluate pZ twice, so the argument must not have side
** effects.
*/
#define BE16(pZ) (((u16)((pZ)[0])<<8) + (u16)((pZ)[1]))
#define LE16(pZ) (((u16)((pZ)[1])<<8) + (u16)((pZ)[0]))

/*
** READ_16 interprets the first two bytes of the unsigned char array pZ
** as a 16-bit unsigned quantity. If big_endian is non-zero the
** interpretation is big-endian, otherwise little-endian.
**
** The big_endian argument is parenthesized so that an expression with
** lower precedence than ?: (for example an assignment) is evaluated as a
** whole before being tested.
*/
#define READ_16(pZ,big_endian) ((big_endian)?BE16(pZ):LE16(pZ))
91
92/*
93** Read the BOM from the start of *pStr, if one is present. Return zero
94** for little-endian, non-zero for big-endian. If no BOM is present, return
95** the machines native byte order.
96**
97** Return values:
98** 1 -> big-endian string
99** 0 -> little-endian string
100*/
101static int readUtf16Bom(UtfString *pStr){
102 /* The BOM must be the first thing read from the string */
103 assert( pStr->c==0 );
104
105 /* If the string data consists of 1 byte or less, the BOM will make no
106 ** difference anyway. In this case just fall through to the default case
107 ** and return the native byte-order for this machine.
108 **
109 ** Otherwise, check the first 2 bytes of the string to see if a BOM is
110 ** present.
111 */
112 if( pStr->n>1 ){
113 u32 bom = BE16(pStr->pZ);
114 if( bom==BOM_BIGENDIAN ){
115 pStr->c = 2;
116 return 1;
117 }
118 if( bom==BOM_LITTLEENDIAN ){
119 pStr->c = 2;
120 return 0;
121 }
122 }
123
124 return SQLITE3_NATIVE_BIGENDIAN;
125}
126
127
128/*
129** Read a single unicode character from the UTF-8 encoded string *pStr. The
130** value returned is a unicode scalar value. In the case of malformed
131** strings, the unicode replacement character U+FFFD may be returned.
132*/
133static u32 readUtf8(UtfString *pStr){
134 struct Utf8TblRow {
135 u8 b1_mask;
136 u8 b1_masked_val;
137 u8 b1_value_mask;
138 int trailing_bytes;
139 };
140 static const struct Utf8TblRow utf8tbl[] = {
141 { 0x80, 0x00, 0x7F, 0 },
142 { 0xE0, 0xC0, 0x1F, 1 },
143 { 0xF0, 0xE0, 0x0F, 2 },
144 { 0xF8, 0xF0, 0x0E, 3 },
145 { 0, 0, 0, 0}
146 };
147
148 u8 b1; /* First byte of the potentially multi-byte utf-8 character */
149 u32 ret = 0; /* Return value */
150 int ii;
151 struct Utf8TblRow const *pRow;
152
153 pRow = &(utf8tbl[0]);
154
155 b1 = pStr->pZ[pStr->c];
156 pStr->c++;
157 while( pRow->b1_mask && (b1&pRow->b1_mask)!=pRow->b1_masked_val ){
158 pRow++;
159 }
160 if( !pRow->b1_mask ){
161 return 0xFFFD;
162 }
163
164 ret = (u32)(b1&pRow->b1_value_mask);
165 for( ii=0; ii<pRow->trailing_bytes; ii++ ){
166 u8 b = pStr->pZ[pStr->c+ii];
167 if( (b&0xC0)!=0x80 ){
168 return 0xFFFD;
169 }
170 ret = (ret<<6) + (u32)(b&0x3F);
171 }
172
173 pStr->c += pRow->trailing_bytes;
174 return ret;
175}
176
177/*
178** Write the unicode character 'code' to the string pStr using UTF-8
179** encoding. SQLITE_NOMEM may be returned if sqlite3Malloc() fails.
180*/
181static int writeUtf8(UtfString *pStr, u32 code){
182 struct Utf8WriteTblRow {
183 u32 max_code;
184 int trailing_bytes;
185 u8 b1_and_mask;
186 u8 b1_or_mask;
187 };
188 static const struct Utf8WriteTblRow utf8tbl[] = {
189 {0x0000007F, 0, 0x7F, 0x00},
190 {0x000007FF, 1, 0xDF, 0xC0},
191 {0x0000FFFF, 2, 0xEF, 0xE0},
192 {0x0010FFFF, 3, 0xF7, 0xF0},
193 {0x00000000, 0, 0x00, 0x00}
194 };
195 static const struct Utf8WriteTblRow *pRow = &utf8tbl[0];
196
197 while( code<=pRow->max_code ){
198 assert( pRow->max_code );
199 pRow++;
200 }
201
202 /* Ensure there is enough room left in the output buffer to write
203 ** this UTF-8 character.
204 */
205 assert( (pStr->n-pStr->c)>=(pRow->trailing_bytes+1) );
206
207 /* Write the UTF-8 encoded character to pStr. All cases below are
208 ** intentionally fall-through.
209 */
210 switch( pRow->trailing_bytes ){
211 case 3:
212 pStr->pZ[pStr->c+3] = (((u8)code)&0x3F)|0x80;
213 code = code>>6;
214 case 2:
215 pStr->pZ[pStr->c+2] = (((u8)code)&0x3F)|0x80;
216 code = code>>6;
217 case 1:
218 pStr->pZ[pStr->c+1] = (((u8)code)&0x3F)|0x80;
219 code = code>>6;
220 case 0:
221 pStr->pZ[pStr->c] = (((u8)code)&(pRow->b1_and_mask))|(pRow->b1_or_mask);
222 }
223 pStr->c += (pRow->trailing_bytes + 1);
224
225 return 0;
226}
227
228/*
229** Read a single unicode character from the UTF-16 encoded string *pStr. The
230** value returned is a unicode scalar value. In the case of malformed
231** strings, the unicode replacement character U+FFFD may be returned.
232**
233** If big_endian is true, the string is assumed to be UTF-16BE encoded.
234** Otherwise, it is UTF-16LE encoded.
235*/
236static u32 readUtf16(UtfString *pStr, int big_endian){
237 u32 code_point; /* the first code-point in the character */
238
239 /* If there is only one byte of data left in the string, return the
240 ** replacement character.
241 */
242 if( (pStr->n-pStr->c)==1 ){
243 pStr->c++;
244 return (int)0xFFFD;
245 }
246
247 code_point = READ_16(&(pStr->pZ[pStr->c]), big_endian);
248 pStr->c += 2;
249
250 /* If this is a non-surrogate code-point, just cast it to an int and
251 ** return the code-point value.
252 */
253 if( code_point<0xD800 || code_point>0xE000 ){
254 return code_point;
255 }
256
257 /* If this is a trailing surrogate code-point, then the string is
258 ** malformed; return the replacement character.
259 */
260 if( code_point>0xDBFF ){
261 return 0xFFFD;
262 }
263
264 /* The code-point just read is a leading surrogate code-point. If their
265 ** is not enough data left or the next code-point is not a trailing
266 ** surrogate, return the replacement character.
267 */
268 if( (pStr->n-pStr->c)>1 ){
269 u32 code_point2 = READ_16(&pStr->pZ[pStr->c], big_endian);
270 if( code_point2<0xDC00 || code_point>0xDFFF ){
271 return 0xFFFD;
272 }
273 pStr->c += 2;
274
275 return (
276 (((code_point&0x03C0)+0x0040)<<16) + /* uuuuu */
277 ((code_point&0x003F)<<10) + /* xxxxxx */
278 (code_point2&0x03FF) /* yy yyyyyyyy */
279 );
280
281 }else{
282 return (int)0xFFFD;
283 }
284
285 /* not reached */
286}
287
288static int writeUtf16(UtfString *pStr, int code, int big_endian){
289 int bytes;
290 unsigned char *hi_byte;
291 unsigned char *lo_byte;
292
293 bytes = (code>0x0000FFFF?4:2);
294
295 /* Ensure there is enough room left in the output buffer to write
296 ** this UTF-8 character.
297 */
298 assert( (pStr->n-pStr->c)>=bytes );
299
300 /* Initialise hi_byte and lo_byte to point at the locations into which
301 ** the MSB and LSB of the (first) 16-bit unicode code-point written for
302 ** this character.
303 */
304 hi_byte = (big_endian?&pStr->pZ[pStr->c]:&pStr->pZ[pStr->c+1]);
305 lo_byte = (big_endian?&pStr->pZ[pStr->c+1]:&pStr->pZ[pStr->c]);
306
307 if( bytes==2 ){
308 *hi_byte = (u8)((code&0x0000FF00)>>8);
309 *lo_byte = (u8)(code&0x000000FF);
310 }else{
311 u32 wrd;
312 wrd = ((((code&0x001F0000)-0x00010000)+(code&0x0000FC00))>>10)|0x0000D800;
313 *hi_byte = (u8)((wrd&0x0000FF00)>>8);
314 *lo_byte = (u8)(wrd&0x000000FF);
315
316 wrd = (code&0x000003FF)|0x0000DC00;
317 *(hi_byte+2) = (u8)((wrd&0x0000FF00)>>8);
318 *(lo_byte+2) = (u8)(wrd&0x000000FF);
319 }
320
321 pStr->c += bytes;
322
323 return 0;
324}
325
/*
** Return the number of bytes in the UTF-16 string pZ up to (but not
** including) the first \u0000 character, that is, the first aligned pair
** of bytes that are both zero.
*/
static int utf16Bytelen(const unsigned char *pZ){
  int nByte = 0;
  while( pZ[nByte]!=0 || pZ[nByte+1]!=0 ){
    nByte += 2;
  }
  return nByte;
}
339
drha5d14fe2004-05-04 15:00:46 +0000340/*
341** Convert a string in UTF-16 native byte (or with a Byte-order-mark or
342** "BOM") into a UTF-8 string. The UTF-8 string is written into space
danielk1977998b56c2004-05-06 23:37:52 +0000343** obtained from sqlite3Malloc() and must be released by the calling function.
drha5d14fe2004-05-04 15:00:46 +0000344**
345** The parameter N is the number of bytes in the UTF-16 string. If N is
346** negative, the entire string up to the first \u0000 character is translated.
347**
348** The returned UTF-8 string is always \000 terminated.
349*/
350unsigned char *sqlite3utf16to8(const void *pData, int N){
danielk1977998b56c2004-05-06 23:37:52 +0000351 UtfString in;
352 UtfString out;
353 int big_endian;
354
355 out.pZ = 0;
356
357 in.pZ = (unsigned char *)pData;
358 in.n = N;
359 in.c = 0;
360
361 if( in.n<0 ){
362 in.n = utf16Bytelen(in.pZ);
363 }
364
365 /* A UTF-8 encoding of a unicode string can require at most 1.5 times as
366 ** much space to store as the same string encoded using UTF-16. Allocate
367 ** this now.
368 */
369 out.n = (in.n*1.5) + 1;
370 out.pZ = sqliteMalloc(in.n);
371 if( !out.pZ ){
372 return 0;
373 }
374 out.c = 0;
375
376 big_endian = readUtf16Bom(&in);
377 while( in.c<in.n ){
378 writeUtf8(&out, readUtf16(&in, big_endian));
379 }
380
381 /* Add the NULL-terminator character */
382 assert( out.c<out.n );
383 out.pZ[out.c] = 0x00;
384
385 return out.pZ;
386}
387
388static void *utf8toUtf16(const unsigned char *pIn, int N, int big_endian){
389 UtfString in;
390 UtfString out;
391
392 in.pZ = (unsigned char *)pIn;
393 in.n = N;
394 in.c = 0;
395
396 if( in.n<0 ){
397 in.n = strlen(in.pZ);
398 }
399
400 /* A UTF-16 encoding of a unicode string can require at most twice as
401 ** much space to store as the same string encoded using UTF-8. Allocate
402 ** this now.
403 */
404 out.n = (in.n*2) + 2;
405 out.pZ = sqliteMalloc(in.n);
406 if( !out.pZ ){
407 return 0;
408 }
409 out.c = 0;
410
411 while( in.c<in.n ){
412 writeUtf16(&out, readUtf8(&in), big_endian);
413 }
414
415 /* Add the NULL-terminator character */
416 assert( (out.c+1)<out.n );
417 out.pZ[out.c] = 0x00;
418 out.pZ[out.c+1] = 0x00;
419
420 return out.pZ;
421}
422
/*
** Translate the UTF-8 string pIn (N bytes long) into UTF-16BE by handing
** it to utf8toUtf16() with the big-endian flag set. The value returned is
** whatever utf8toUtf16() returns.
*/
void *sqlite3utf8to16be(const unsigned char *pIn, int N){
  const int isBigEndian = 1;
  return utf8toUtf16(pIn, N, isBigEndian);
}
429
/*
** Translate the UTF-8 string pIn (N bytes long) into UTF-16LE by handing
** it to utf8toUtf16() with the big-endian flag clear. The value returned
** is whatever utf8toUtf16() returns.
*/
void *sqlite3utf8to16le(const unsigned char *pIn, int N){
  const int isBigEndian = 0;
  return utf8toUtf16(pIn, N, isBigEndian);
}
433
434/*
435** This routine does the work for sqlite3utf16to16le() and
436** sqlite3utf16to16be(). If big_endian is 1 the input string is
437** transformed in place to UTF-16BE encoding. If big_endian is 0 then
438** the input is transformed to UTF-16LE.
439**
440** Unless the first two bytes of the input string is a BOM, the input is
441** assumed to be UTF-16 encoded using the machines native byte ordering.
442*/
443static void utf16to16(void *pData, int N, int big_endian){
444 UtfString inout;
445 inout.pZ = (unsigned char *)pData;
446 inout.c = 0;
447 inout.n = N;
448
449 if( inout.n<0 ){
450 inout.n = utf16Bytelen(inout.pZ);
451 }
452
453 if( readUtf16Bom(&inout)!=big_endian ){
454 swab(&inout.pZ[inout.c], inout.pZ, inout.n-inout.c);
455 }else if( inout.c ){
456 memmove(inout.pZ, &inout.pZ[inout.c], inout.n-inout.c);
457 }
drha5d14fe2004-05-04 15:00:46 +0000458}
459
/*
** Convert a string in UTF-16 native byte order, or with a BOM, into a
** UTF-16LE string. The conversion occurs in-place: the output overwrites
** the input. N bytes are converted. If N is negative everything is
** converted up to the first \u0000 character.
**
** All of the work is done by utf16to16(), called with its big-endian flag
** clear. If there is a BOM at the start of the string, it is removed.
*/
void sqlite3utf16to16le(void *pData, int N){
  const int wantBigEndian = 0;
  utf16to16(pData, N, wantBigEndian);
}
drha5d14fe2004-05-04 15:00:46 +0000476
/*
** Convert a string in UTF-16 native byte order, or with a BOM, into a
** UTF-16BE string. The conversion occurs in-place: the output overwrites
** the input. N bytes are converted. If N is negative everything is
** converted up to the first \u0000 character.
**
** All of the work is done by utf16to16(), called with its big-endian flag
** set. If there is a BOM at the start of the string, it is removed.
*/
void sqlite3utf16to16be(void *pData, int N){
  const int wantBigEndian = 1;
  utf16to16(pData, N, wantBigEndian);
}
danielk1977998b56c2004-05-06 23:37:52 +0000493