Blame - src/utf.c - chromium.googlesource.com/chromium/deps/sqlite

blob: d9c7e96d7ab6ed2c7c9461a3870d11af891ef404 [file] [log] [blame]

drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	1	/*
				2	** 2004 April 13
				3	**
				4	** The author disclaims copyright to this source code. In place of
				5	** a legal notice, here is a blessing:
				6	**
				7	** May you do good and not evil.
				8	** May you find forgiveness for yourself and forgive others.
				9	** May you share freely, never taking more than you give.
				10	**
				11	*************************************************************************
				12	** This file contains routines used to translate between UTF-8,
				13	** UTF-16, UTF-16BE, and UTF-16LE.
				14	**
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame^]	15	** $Id: utf.c,v 1.2 2004/05/06 23:37:53 danielk1977 Exp $
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	16	**
				17	** Notes on UTF-8:
				18	**
				19	**     Byte-0    Byte-1    Byte-2    Byte-3     Value
				20	**  0xxxxxxx                                    00000000 00000000 0xxxxxxx
				21	**  110yyyyy  10xxxxxx                          00000000 00000yyy yyxxxxxx
				22	**  1110zzzz  10yyyyyy  10xxxxxx                00000000 zzzzyyyy yyxxxxxx
				23	**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx      000uuuuu zzzzyyyy yyxxxxxx
				24	**
				25	**
				26	** Notes on UTF-16: (with wwww+1==uuuuu)
				27	**
				28	**      Word-0            Word-1             Value
				29	**  110110wwwwxxxxxx  110111yyyyyyyyyy       000uuuuu xxxxxxyy yyyyyyyy
				30	**  xxxxxxxxyyyyyyyy                         00000000 xxxxxxxx yyyyyyyy
				31	**
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame^]	32	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	33	** BOM or Byte Order Mark:
				34	** 0xff 0xfe little-endian utf-16 follows
				35	** 0xfe 0xff big-endian utf-16 follows
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame^]	36	**
				37	**
				38	** Handling of malformed strings:
				39	**
				40	** SQLite accepts and processes malformed strings without an error wherever
				41	** possible. However this is not possible when converting between UTF-8 and
				42	** UTF-16.
				43	**
				44	** When converting malformed UTF-8 strings to UTF-16, one instance of the
				45	** replacement character U+FFFD is substituted for each byte that cannot
				46	** be interpreted as part of a valid unicode character.
				47	**
				48	** When converting malformed UTF-16 strings to UTF-8, one instance of the
				49	** replacement character U+FFFD is substituted for each pair of bytes that
				50	** cannot be interpreted as part of a valid unicode character.
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	51	*/
				52
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame^]	53	#include <assert.h>
				54	#include <unistd.h>
				55	#include "sqliteInt.h"
				56
				57	typedef struct UtfString UtfString;
				58	struct UtfString {
				59	unsigned char pZ; / Raw string data */
				60	int n; /* Allocated length of pZ in bytes */
				61	int c; /* Number of pZ bytes already read or written */
				62	};
				63
				64	/* TODO: Implement this macro in os.h. It should be 1 on big-endian
				65	** machines, and 0 on little-endian.
				66	*/
				67	#define SQLITE3_NATIVE_BIGENDIAN 0
				68
				69	#if SQLITE3_NATIVE_BIGENDIAN == 1
				70	#define BOM_BIGENDIAN 0x0000FFFE
				71	#define BOM_LITTLEENDIAN 0x0000FEFF
				72	#else
				73	#define BOM_BIGENDIAN 0x0000FEFF
				74	#define BOM_LITTLEENDIAN 0x0000FFFE
				75	#endif
				76
				77	/*
				78	** These two macros are used to interpret the first two bytes of the
				79	** unsigned char array pZ as a 16-bit unsigned int. BE16() for a big-endian
				80	** interpretation, LE16() for little-endian.
				81	*/
				82	#define BE16(pZ) (((u16)((pZ)[0])<<8) + (u16)((pZ)[1]))
				83	#define LE16(pZ) (((u16)((pZ)[1])<<8) + (u16)((pZ)[0]))
				84
				85	/*
				86	** READ_16 interprets the first two bytes of the unsigned char array pZ
				87	** as a 16-bit unsigned int. If big_endian is non-zero the intepretation
				88	** is big-endian, otherwise little-endian.
				89	*/
				90	#define READ_16(pZ,big_endian) (big_endian?BE16(pZ):LE16(pZ))
				91
				92	/*
				93	** Read the BOM from the start of *pStr, if one is present. Return zero
				94	** for little-endian, non-zero for big-endian. If no BOM is present, return
				95	** the machines native byte order.
				96	**
				97	** Return values:
				98	** 1 -> big-endian string
				99	** 0 -> little-endian string
				100	*/
				101	static int readUtf16Bom(UtfString *pStr){
				102	/* The BOM must be the first thing read from the string */
				103	assert( pStr->c==0 );
				104
				105	/* If the string data consists of 1 byte or less, the BOM will make no
				106	** difference anyway. In this case just fall through to the default case
				107	** and return the native byte-order for this machine.
				108	**
				109	** Otherwise, check the first 2 bytes of the string to see if a BOM is
				110	** present.
				111	*/
				112	if( pStr->n>1 ){
				113	u32 bom = BE16(pStr->pZ);
				114	if( bom==BOM_BIGENDIAN ){
				115	pStr->c = 2;
				116	return 1;
				117	}
				118	if( bom==BOM_LITTLEENDIAN ){
				119	pStr->c = 2;
				120	return 0;
				121	}
				122	}
				123
				124	return SQLITE3_NATIVE_BIGENDIAN;
				125	}
				126
				127
				128	/*
				129	** Read a single unicode character from the UTF-8 encoded string *pStr. The
				130	** value returned is a unicode scalar value. In the case of malformed
				131	** strings, the unicode replacement character U+FFFD may be returned.
				132	*/
				133	static u32 readUtf8(UtfString *pStr){
				134	struct Utf8TblRow {
				135	u8 b1_mask;
				136	u8 b1_masked_val;
				137	u8 b1_value_mask;
				138	int trailing_bytes;
				139	};
				140	static const struct Utf8TblRow utf8tbl[] = {
				141	{ 0x80, 0x00, 0x7F, 0 },
				142	{ 0xE0, 0xC0, 0x1F, 1 },
				143	{ 0xF0, 0xE0, 0x0F, 2 },
				144	{ 0xF8, 0xF0, 0x0E, 3 },
				145	{ 0, 0, 0, 0}
				146	};
				147
				148	u8 b1; /* First byte of the potentially multi-byte utf-8 character */
				149	u32 ret = 0; /* Return value */
				150	int ii;
				151	struct Utf8TblRow const *pRow;
				152
				153	pRow = &(utf8tbl[0]);
				154
				155	b1 = pStr->pZ[pStr->c];
				156	pStr->c++;
				157	while( pRow->b1_mask && (b1&pRow->b1_mask)!=pRow->b1_masked_val ){
				158	pRow++;
				159	}
				160	if( !pRow->b1_mask ){
				161	return 0xFFFD;
				162	}
				163
				164	ret = (u32)(b1&pRow->b1_value_mask);
				165	for( ii=0; ii<pRow->trailing_bytes; ii++ ){
				166	u8 b = pStr->pZ[pStr->c+ii];
				167	if( (b&0xC0)!=0x80 ){
				168	return 0xFFFD;
				169	}
				170	ret = (ret<<6) + (u32)(b&0x3F);
				171	}
				172
				173	pStr->c += pRow->trailing_bytes;
				174	return ret;
				175	}
				176
				177	/*
				178	** Write the unicode character 'code' to the string pStr using UTF-8
				179	** encoding. SQLITE_NOMEM may be returned if sqlite3Malloc() fails.
				180	*/
				181	static int writeUtf8(UtfString *pStr, u32 code){
				182	struct Utf8WriteTblRow {
				183	u32 max_code;
				184	int trailing_bytes;
				185	u8 b1_and_mask;
				186	u8 b1_or_mask;
				187	};
				188	static const struct Utf8WriteTblRow utf8tbl[] = {
				189	{0x0000007F, 0, 0x7F, 0x00},
				190	{0x000007FF, 1, 0xDF, 0xC0},
				191	{0x0000FFFF, 2, 0xEF, 0xE0},
				192	{0x0010FFFF, 3, 0xF7, 0xF0},
				193	{0x00000000, 0, 0x00, 0x00}
				194	};
				195	static const struct Utf8WriteTblRow *pRow = &utf8tbl[0];
				196
				197	while( code<=pRow->max_code ){
				198	assert( pRow->max_code );
				199	pRow++;
				200	}
				201
				202	/* Ensure there is enough room left in the output buffer to write
				203	** this UTF-8 character.
				204	*/
				205	assert( (pStr->n-pStr->c)>=(pRow->trailing_bytes+1) );
				206
				207	/* Write the UTF-8 encoded character to pStr. All cases below are
				208	** intentionally fall-through.
				209	*/
				210	switch( pRow->trailing_bytes ){
				211	case 3:
				212	pStr->pZ[pStr->c+3] = (((u8)code)&0x3F)\|0x80;
				213	code = code>>6;
				214	case 2:
				215	pStr->pZ[pStr->c+2] = (((u8)code)&0x3F)\|0x80;
				216	code = code>>6;
				217	case 1:
				218	pStr->pZ[pStr->c+1] = (((u8)code)&0x3F)\|0x80;
				219	code = code>>6;
				220	case 0:
				221	pStr->pZ[pStr->c] = (((u8)code)&(pRow->b1_and_mask))\|(pRow->b1_or_mask);
				222	}
				223	pStr->c += (pRow->trailing_bytes + 1);
				224
				225	return 0;
				226	}
				227
				228	/*
				229	** Read a single unicode character from the UTF-16 encoded string *pStr. The
				230	** value returned is a unicode scalar value. In the case of malformed
				231	** strings, the unicode replacement character U+FFFD may be returned.
				232	**
				233	** If big_endian is true, the string is assumed to be UTF-16BE encoded.
				234	** Otherwise, it is UTF-16LE encoded.
				235	*/
				236	static u32 readUtf16(UtfString *pStr, int big_endian){
				237	u32 code_point; /* the first code-point in the character */
				238
				239	/* If there is only one byte of data left in the string, return the
				240	** replacement character.
				241	*/
				242	if( (pStr->n-pStr->c)==1 ){
				243	pStr->c++;
				244	return (int)0xFFFD;
				245	}
				246
				247	code_point = READ_16(&(pStr->pZ[pStr->c]), big_endian);
				248	pStr->c += 2;
				249
				250	/* If this is a non-surrogate code-point, just cast it to an int and
				251	** return the code-point value.
				252	*/
				253	if( code_point<0xD800 \|\| code_point>0xE000 ){
				254	return code_point;
				255	}
				256
				257	/* If this is a trailing surrogate code-point, then the string is
				258	** malformed; return the replacement character.
				259	*/
				260	if( code_point>0xDBFF ){
				261	return 0xFFFD;
				262	}
				263
				264	/* The code-point just read is a leading surrogate code-point. If their
				265	** is not enough data left or the next code-point is not a trailing
				266	** surrogate, return the replacement character.
				267	*/
				268	if( (pStr->n-pStr->c)>1 ){
				269	u32 code_point2 = READ_16(&pStr->pZ[pStr->c], big_endian);
				270	if( code_point2<0xDC00 \|\| code_point>0xDFFF ){
				271	return 0xFFFD;
				272	}
				273	pStr->c += 2;
				274
				275	return (
				276	(((code_point&0x03C0)+0x0040)<<16) + /* uuuuu */
				277	((code_point&0x003F)<<10) + /* xxxxxx */
				278	(code_point2&0x03FF) /* yy yyyyyyyy */
				279	);
				280
				281	}else{
				282	return (int)0xFFFD;
				283	}
				284
				285	/* not reached */
				286	}
				287
				288	static int writeUtf16(UtfString *pStr, int code, int big_endian){
				289	int bytes;
				290	unsigned char *hi_byte;
				291	unsigned char *lo_byte;
				292
				293	bytes = (code>0x0000FFFF?4:2);
				294
				295	/* Ensure there is enough room left in the output buffer to write
				296	** this UTF-8 character.
				297	*/
				298	assert( (pStr->n-pStr->c)>=bytes );
				299
				300	/* Initialise hi_byte and lo_byte to point at the locations into which
				301	** the MSB and LSB of the (first) 16-bit unicode code-point written for
				302	** this character.
				303	*/
				304	hi_byte = (big_endian?&pStr->pZ[pStr->c]:&pStr->pZ[pStr->c+1]);
				305	lo_byte = (big_endian?&pStr->pZ[pStr->c+1]:&pStr->pZ[pStr->c]);
				306
				307	if( bytes==2 ){
				308	*hi_byte = (u8)((code&0x0000FF00)>>8);
				309	*lo_byte = (u8)(code&0x000000FF);
				310	}else{
				311	u32 wrd;
				312	wrd = ((((code&0x001F0000)-0x00010000)+(code&0x0000FC00))>>10)\|0x0000D800;
				313	*hi_byte = (u8)((wrd&0x0000FF00)>>8);
				314	*lo_byte = (u8)(wrd&0x000000FF);
				315
				316	wrd = (code&0x000003FF)\|0x0000DC00;
				317	*(hi_byte+2) = (u8)((wrd&0x0000FF00)>>8);
				318	*(lo_byte+2) = (u8)(wrd&0x000000FF);
				319	}
				320
				321	pStr->c += bytes;
				322
				323	return 0;
				324	}
				325
				326	/*
				327	** Return the number of bytes up to (but not including) the first \u0000
				328	** character in *pStr.
				329	*/
				330	static int utf16Bytelen(const unsigned char *pZ){
				331	const unsigned char *pC1 = pZ;
				332	const unsigned char *pC2 = pZ+1;
				333	while( pC1 \|\| pC2 ){
				334	pC1 += 2;
				335	pC2 += 2;
				336	}
				337	return pC1-pZ;
				338	}
				339
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	340	/*
				341	** Convert a string in UTF-16 native byte (or with a Byte-order-mark or
				342	** "BOM") into a UTF-8 string. The UTF-8 string is written into space
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame^]	343	** obtained from sqlite3Malloc() and must be released by the calling function.
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	344	**
				345	** The parameter N is the number of bytes in the UTF-16 string. If N is
				346	** negative, the entire string up to the first \u0000 character is translated.
				347	**
				348	** The returned UTF-8 string is always \000 terminated.
				349	*/
				350	unsigned char sqlite3utf16to8(const void pData, int N){
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame^]	351	UtfString in;
				352	UtfString out;
				353	int big_endian;
				354
				355	out.pZ = 0;
				356
				357	in.pZ = (unsigned char *)pData;
				358	in.n = N;
				359	in.c = 0;
				360
				361	if( in.n<0 ){
				362	in.n = utf16Bytelen(in.pZ);
				363	}
				364
				365	/* A UTF-8 encoding of a unicode string can require at most 1.5 times as
				366	** much space to store as the same string encoded using UTF-16. Allocate
				367	** this now.
				368	*/
				369	out.n = (in.n*1.5) + 1;
				370	out.pZ = sqliteMalloc(in.n);
				371	if( !out.pZ ){
				372	return 0;
				373	}
				374	out.c = 0;
				375
				376	big_endian = readUtf16Bom(&in);
				377	while( in.c<in.n ){
				378	writeUtf8(&out, readUtf16(&in, big_endian));
				379	}
				380
				381	/* Add the NULL-terminator character */
				382	assert( out.c<out.n );
				383	out.pZ[out.c] = 0x00;
				384
				385	return out.pZ;
				386	}
				387
				388	static void utf8toUtf16(const unsigned char pIn, int N, int big_endian){
				389	UtfString in;
				390	UtfString out;
				391
				392	in.pZ = (unsigned char *)pIn;
				393	in.n = N;
				394	in.c = 0;
				395
				396	if( in.n<0 ){
				397	in.n = strlen(in.pZ);
				398	}
				399
				400	/* A UTF-16 encoding of a unicode string can require at most twice as
				401	** much space to store as the same string encoded using UTF-8. Allocate
				402	** this now.
				403	*/
				404	out.n = (in.n*2) + 2;
				405	out.pZ = sqliteMalloc(in.n);
				406	if( !out.pZ ){
				407	return 0;
				408	}
				409	out.c = 0;
				410
				411	while( in.c<in.n ){
				412	writeUtf16(&out, readUtf8(&in), big_endian);
				413	}
				414
				415	/* Add the NULL-terminator character */
				416	assert( (out.c+1)<out.n );
				417	out.pZ[out.c] = 0x00;
				418	out.pZ[out.c+1] = 0x00;
				419
				420	return out.pZ;
				421	}
				422
				423	/*
				424	** Translate UTF-8 to UTF-16BE or UTF-16LE
				425	*/
				426	void sqlite3utf8to16be(const unsigned char pIn, int N){
				427	return utf8toUtf16(pIn, N, 1);
				428	}
				429
				430	void sqlite3utf8to16le(const unsigned char pIn, int N){
				431	return utf8toUtf16(pIn, N, 0);
				432	}
				433
				434	/*
				435	** This routine does the work for sqlite3utf16to16le() and
				436	** sqlite3utf16to16be(). If big_endian is 1 the input string is
				437	** transformed in place to UTF-16BE encoding. If big_endian is 0 then
				438	** the input is transformed to UTF-16LE.
				439	**
				440	** Unless the first two bytes of the input string is a BOM, the input is
				441	** assumed to be UTF-16 encoded using the machines native byte ordering.
				442	*/
				443	static void utf16to16(void *pData, int N, int big_endian){
				444	UtfString inout;
				445	inout.pZ = (unsigned char *)pData;
				446	inout.c = 0;
				447	inout.n = N;
				448
				449	if( inout.n<0 ){
				450	inout.n = utf16Bytelen(inout.pZ);
				451	}
				452
				453	if( readUtf16Bom(&inout)!=big_endian ){
				454	swab(&inout.pZ[inout.c], inout.pZ, inout.n-inout.c);
				455	}else if( inout.c ){
				456	memmove(inout.pZ, &inout.pZ[inout.c], inout.n-inout.c);
				457	}
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	458	}
				459
				460	/*
				461	** Convert a string in UTF-16 native byte or with a BOM into a UTF-16LE
				462	** string. The conversion occurs in-place. The output overwrites the
				463	** input. N bytes are converted. If N is negative everything is converted
				464	** up to the first \u0000 character.
				465	**
				466	** If the native byte order is little-endian and there is no BOM, then
				467	** this routine is a no-op. If there is a BOM at the start of the string,
				468	** it is removed.
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame^]	469	**
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	470	** Translation from UTF-16LE to UTF-16BE and back again is accomplished
				471	** using the library function swab().
				472	*/
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame^]	473	void sqlite3utf16to16le(void *pData, int N){
				474	utf16to16(pData, N, 0);
				475	}
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	476
				477	/*
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame^]	478	** Convert a string in UTF-16 native byte or with a BOM into a UTF-16BE
				479	** string. The conversion occurs in-place. The output overwrites the
				480	** input. N bytes are converted. If N is negative everything is converted
				481	** up to the first \u0000 character.
				482	**
				483	** If the native byte order is little-endian and there is no BOM, then
				484	** this routine is a no-op. If there is a BOM at the start of the string,
				485	** it is removed.
				486	**
				487	** Translation from UTF-16LE to UTF-16BE and back again is accomplished
				488	** using the library function swab().
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	489	*/
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame^]	490	void sqlite3utf16to16be(void *pData, int N){
				491	utf16to16(pData, N, 1);
drh	a5d14fe	2004-05-04 15:00:46 +0000	[diff] [blame]	492	}
danielk1977	998b56c	2004-05-06 23:37:52 +0000	[diff] [blame^]	493