Blame - src/utf.c - chromium.googlesource.com/chromium/deps/sqlite

2004-05-04 15:00:46 +0000

[diff] [blame]

/*

** 2004 April 13

**

** The author disclaims copyright to this source code. In place of

5

** a legal notice, here is a blessing:

6

**

7

** May you do good and not evil.

8

** May you find forgiveness for yourself and forgive others.

9

** May you share freely, never taking more than you give.

10

**

11

*************************************************************************

12

** This file contains routines used to translate between UTF-8,

13

** UTF-16, UTF-16BE, and UTF-16LE.

14

**

drh

2006-10-19 01:58:43 +0000

[diff] [blame^]

15

** $Id: utf.c,v 1.43 2006/10/19 01:58:44 drh Exp $

drh

2004-05-04 15:00:46 +0000

[diff] [blame]

**

** Notes on UTF-8:

**

**  Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                      00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx            00000000 zzzzyyyy yyxxxxxx
**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx  000uuuuu zzzzyyyy yyxxxxxx

24

**

25

**

26

** Notes on UTF-16: (with wwww+1==uuuuu)

27

**

drh

51846b5

2004-05-28 16:00:21 +0000

[diff] [blame]

28

**      Word-0               Word-1          Value
**  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx

drh

2004-05-04 15:00:46 +0000

[diff] [blame]

31

**

danielk1977

2004-05-06 23:37:52 +0000

[diff] [blame]

32

**

drh

2004-05-04 15:00:46 +0000

[diff] [blame]

33

** BOM or Byte Order Mark:

34

** 0xff 0xfe little-endian utf-16 follows

35

** 0xfe 0xff big-endian utf-16 follows

danielk1977

2004-05-06 23:37:52 +0000

[diff] [blame]

36

**

37

**

38

** Handling of malformed strings:

39

**

40

** SQLite accepts and processes malformed strings without an error wherever

41

** possible. However this is not possible when converting between UTF-8 and

42

** UTF-16.

43

**

44

** When converting malformed UTF-8 strings to UTF-16, one instance of the
** replacement character U+FFFD is emitted for each byte that cannot be
** interpreted as part of a valid unicode character.

47

**

48

** When converting malformed UTF-16 strings to UTF-8, one instance of the
** replacement character U+FFFD is emitted for each pair of bytes that
** cannot be interpreted as part of a valid unicode character.

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

51

**

52

** This file contains the following public routines:

53

**

54

** sqlite3VdbeMemTranslate() - Translate the encoding used by a Mem* string.

55

** sqlite3VdbeMemHandleBom() - Handle byte-order-marks in UTF16 Mem* strings.

56

** sqlite3utf16ByteLen() - Calculate byte-length of a void* UTF16 string.

57

** sqlite3utf8CharLen() - Calculate char-length of a char* UTF8 string.

58

** sqlite3utf8LikeCompare() - Do a LIKE match given two UTF8 char* strings.

59

**

drh

2004-05-04 15:00:46 +0000

[diff] [blame]

60

*/

danielk1977

2004-05-06 23:37:52 +0000

[diff] [blame]

61

#include "sqliteInt.h"

drh

b659e9b

2005-01-28 01:29:08 +0000

[diff] [blame]

62

#include <assert.h>

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

63

#include "vdbeInt.h"

danielk1977

2004-05-06 23:37:52 +0000

[diff] [blame]

64

65

/*

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

66

** This table maps from the first byte of a UTF-8 character to the number

drh

2006-10-19 01:58:43 +0000

[diff] [blame^]

67

** of trailing bytes expected. A value '4' indicates that the table key

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

68

** is not a legal first byte for a UTF-8 character.

danielk1977

d02eb1f

2004-06-06 09:44:03 +0000

[diff] [blame]

69

*/

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

70

static const u8 xtra_utf8_bytes[256] = {

71

/* 0xxxxxxx */

72

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

73

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

74

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

75

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

76

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

77

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

78

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

79

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

danielk1977

d02eb1f

2004-06-06 09:44:03 +0000

[diff] [blame]

80

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

81

/* 10wwwwww */

drh

2006-10-19 01:58:43 +0000

[diff] [blame^]

82

4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,

83

4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,

84

4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,

85

4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,

danielk1977

ad7dd42

2004-06-06 12:41:49 +0000

[diff] [blame]

86

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

87

/* 110yyyyy */

88

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

89

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

90

91

/* 1110zzzz */

92

2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

93

94

/* 11110yyy */

drh

2006-10-19 01:58:43 +0000

[diff] [blame^]

95

3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

};

/*
** This table maps from the number of trailing bytes in a UTF-8 character
** to an integer constant that is effectively calculated for each character
** read by a naive implementation of a UTF-8 character reader. The code
** in the READ_UTF8 macro explains things best.
**
** Entry N is the sum of the fixed marker bits of an (N+1)-byte sequence
** (the lead-byte prefix plus one 0x80 per continuation byte), each shifted
** to the position it occupies after the bytes have been accumulated with
** c = (c<<6) + byte. Subtracting it leaves only the payload bits.
*/
static const int xtra_utf8_bits[] =  {
  0,
  (0xC0 << 6) + 0x80,                                /* 12416 */
  (0xE0 << 12) + (0x80 << 6) + 0x80,                 /* 925824 */
  (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80   /* 63447168 */
};

/*
** If a UTF-8 character contains N extra bytes (N bytes follow
** the initial byte so that the total character length is N+1) then
** masking the decoded character with utf_mask[N] must produce a non-zero
** result. Otherwise, we have an (illegal) overlong encoding.
**
** Entry 0 is zero and is never consulted: READ_UTF8 only applies the
** mask when at least one extra byte was read.
**
** The elements are unsigned because constants such as 0xffffff80 do not
** fit in a signed 32-bit int.
*/
static const unsigned int utf_mask[] = {
  0x00000000,
  0xffffff80,
  0xfffff800,
  0xffff0000,
};

123

124

/*
** Decode one UTF-8 character starting at zIn, store its code point in c
** and advance zIn past the bytes consumed.
**
**   zIn  - lvalue pointer to unsigned bytes; it is modified.
**   c    - integer lvalue that receives the code point.
**
** The first byte selects, via xtra_utf8_bytes[], how many trailing bytes
** are read. An illegal first byte (table value 4) consumes only that one
** byte and yields U+FFFD. After accumulation the marker bits are removed
** with xtra_utf8_bits[], and the result is replaced by U+FFFD if it is an
** overlong encoding (utf_mask[] test), a surrogate (0xD800..0xDFFF), or
** one of 0xFFFE / 0xFFFF.
**
** Note: trailing bytes are consumed without checking that they have the
** 10xxxxxx form and without any end-of-buffer check, so the caller must
** guarantee that the bytes implied by the first byte are readable.
*/
#define READ_UTF8(zIn, c) {                              \
  int xtra;                                              \
  c = *(zIn)++;                                          \
  xtra = xtra_utf8_bytes[c];                             \
  switch( xtra ){                                        \
    case 4: c = (int)0xFFFD; break;                      \
    case 3: c = (c<<6) + *(zIn)++;  /* fall through */   \
    case 2: c = (c<<6) + *(zIn)++;  /* fall through */   \
    case 1: c = (c<<6) + *(zIn)++;                       \
    c -= xtra_utf8_bits[xtra];                           \
    if( (utf_mask[xtra]&c)==0                            \
        || (c&0xFFFFF800)==0xD800                        \
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }      \
  }                                                      \
}

drh

4e5ffc5

2004-08-31 00:52:37 +0000

[diff] [blame]

139

/*
** Function wrapper around the READ_UTF8 macro: decode the single UTF-8
** character that begins at z and return its code point. The caller's
** pointer is not advanced (only the local copy of z is).
*/
int sqlite3ReadUtf8(const unsigned char *z){
  int ch;                /* Decoded code point */

  READ_UTF8(z, ch);
  return ch;
}

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

144

145

/*
** Advance zIn (an lvalue pointer to char or unsigned char) past one UTF-8
** character.
**
** The lead byte determines the maximum number of trailing bytes (0..3).
** A byte that cannot start a character (0x80..0xBF or 0xF8..0xFF) is
** skipped on its own, matching the single byte READ_UTF8 consumes for it.
** Trailing bytes are only skipped while they have the 10xxxxxx form, so
** the pointer can never step over a 0x00 terminator, even when the
** sequence is truncated.
**
** The previous form added xtra_utf8_bytes[]+1 unconditionally; since the
** table uses 4 as its "illegal" marker, a stray byte skipped five bytes.
*/
#define SKIP_UTF8(zIn) {                                                  \
  unsigned char skipLead_ = *(const unsigned char *)(zIn);                \
  int skipMore_ = skipLead_<0xC0 ? 0 : skipLead_<0xE0 ? 1 :               \
                  skipLead_<0xF0 ? 2 : skipLead_<0xF8 ? 3 : 0;            \
  (zIn)++;                                                                \
  while( skipMore_>0 && (*(const unsigned char *)(zIn) & 0xC0)==0x80 ){   \
    (zIn)++;                                                              \
    skipMore_--;                                                          \
  }                                                                       \
}

148

149

/*
** Append the UTF-8 encoding of code point c at zOut and advance zOut past
** the 1 to 4 bytes written.
**
**   zOut - lvalue pointer to the output bytes; it is modified.
**   c    - code point; evaluated several times, so it must be free of
**          side effects.
**
** No validation is performed on c: values of 0x10000 and above are always
** written as a 4-byte sequence using only the low 21 bits.
*/
#define WRITE_UTF8(zOut, c) {                            \
  if( (c)<0x00080 ){                                     \
    *(zOut)++ = ((c)&0xFF);                              \
  }                                                      \
  else if( (c)<0x00800 ){                                \
    *(zOut)++ = 0xC0 + (((c)>>6)&0x1F);                  \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                     \
  }                                                      \
  else if( (c)<0x10000 ){                                \
    *(zOut)++ = 0xE0 + (((c)>>12)&0x0F);                 \
    *(zOut)++ = 0x80 + (((c)>>6) & 0x3F);                \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                     \
  }else{                                                 \
    *(zOut)++ = 0xF0 + (((c)>>18) & 0x07);               \
    *(zOut)++ = 0x80 + (((c)>>12) & 0x3F);               \
    *(zOut)++ = 0x80 + (((c)>>6) & 0x3F);                \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                     \
  }                                                      \
}

/*
** Append the little-endian UTF-16 encoding of code point c at zOut and
** advance zOut past the bytes written: 2 bytes when c<=0xFFFF, otherwise
** a 4-byte surrogate pair (high surrogate first, each unit low byte
** first).
**
** c is evaluated several times and is not validated; a value in
** 0xD800..0xDFFF is written unchanged as a single code unit.
*/
#define WRITE_UTF16LE(zOut, c) {                                          \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = ((c)&0x00FF);                                             \
    *(zOut)++ = (((c)>>8)&0x00FF);                                        \
  }else{                                                                  \
    *(zOut)++ = ((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));      \
    *(zOut)++ = (0x00D8 + ((((c)-0x10000)>>18)&0x03));                    \
    *(zOut)++ = ((c)&0x00FF);                                             \
    *(zOut)++ = (0x00DC + (((c)>>8)&0x03));                               \
  }                                                                       \
}

/*
** Append the big-endian UTF-16 encoding of code point c at zOut and
** advance zOut past the bytes written: 2 bytes when c<=0xFFFF, otherwise
** a 4-byte surrogate pair (high surrogate first, each unit high byte
** first).
**
** c is evaluated several times and is not validated; a value in
** 0xD800..0xDFFF is written unchanged as a single code unit.
*/
#define WRITE_UTF16BE(zOut, c) {                                          \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (((c)>>8)&0x00FF);                                        \
    *(zOut)++ = ((c)&0x00FF);                                             \
  }else{                                                                  \
    *(zOut)++ = (0x00D8 + ((((c)-0x10000)>>18)&0x03));                    \
    *(zOut)++ = ((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));      \
    *(zOut)++ = (0x00DC + (((c)>>8)&0x03));                               \
    *(zOut)++ = ((c)&0x00FF);                                             \
  }                                                                       \
}

/*
** Decode one character of little-endian UTF-16 starting at zIn, store its
** code point in c and advance zIn past the 2 or 4 bytes consumed.
**
**   zIn - lvalue pointer to unsigned bytes; it is modified.
**   c   - integer lvalue that receives the code point.
**
** A first code unit in the surrogate range 0xD800..0xDFFF causes a second
** code unit to be read and combined with it. The upper bound is exclusive:
** U+E000 is an ordinary character and must not swallow the unit that
** follows it (the previous test, c<=0xE000, did).
**
** Note: the second code unit is read without an end-of-buffer check and
** without verifying that it is a low surrogate.
*/
#define READ_UTF16LE(zIn, c){                                             \
  c = (*(zIn)++);                                                         \
  c += ((*(zIn)++)<<8);                                                   \
  if( c>=0xD800 && c<0xE000 ){                                            \
    int c2 = (*(zIn)++);                                                  \
    c2 += ((*(zIn)++)<<8);                                                \
    c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);       \
    if( (c & 0xFFFF0000)==0 ) c = 0xFFFD;                                 \
  }                                                                       \
}

/*
** Decode one character of big-endian UTF-16 starting at zIn, store its
** code point in c and advance zIn past the 2 or 4 bytes consumed.
**
**   zIn - lvalue pointer to unsigned bytes; it is modified.
**   c   - integer lvalue that receives the code point.
**
** A first code unit in the surrogate range 0xD800..0xDFFF causes a second
** code unit to be read and combined with it. The upper bound is exclusive:
** U+E000 is an ordinary character and must not swallow the unit that
** follows it (the previous test, c<=0xE000, did).
**
** Note: the second code unit is read without an end-of-buffer check and
** without verifying that it is a low surrogate.
*/
#define READ_UTF16BE(zIn, c){                                             \
  c = ((*(zIn)++)<<8);                                                    \
  c += (*(zIn)++);                                                        \
  if( c>=0xD800 && c<0xE000 ){                                            \
    int c2 = ((*(zIn)++)<<8);                                             \
    c2 += (*(zIn)++);                                                     \
    c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);       \
    if( (c & 0xFFFF0000)==0 ) c = 0xFFFD;                                 \
  }                                                                       \
}

danielk1977

2004-06-28 13:09:11 +0000

[diff] [blame]

215

/*
** Advance zIn past one character of big-endian UTF-16: 4 bytes when the
** code unit at zIn is a surrogate (high byte 0xD8..0xDF), otherwise 2.
**
** U+E000 (bytes E0 00) is an ordinary 2-byte character; the previous test
** wrongly treated it as the start of a 4-byte pair.
*/
#define SKIP_UTF16BE(zIn){                                                \
  if( ((zIn)[0] & 0xF8)==0xD8 ){                                          \
    (zIn) += 4;                                                           \
  }else{                                                                  \
    (zIn) += 2;                                                           \
  }                                                                       \
}

/*
** Advance zIn past one character of little-endian UTF-16: 4 bytes when
** the code unit at zIn is a surrogate (high byte, stored second, in
** 0xD8..0xDF), otherwise 2.
**
** U+E000 (bytes 00 E0) is an ordinary 2-byte character; the previous test
** wrongly treated it as the start of a 4-byte pair.
*/
#define SKIP_UTF16LE(zIn){                                                \
  if( ((zIn)[1] & 0xF8)==0xD8 ){                                          \
    (zIn) += 4;                                                           \
  }else{                                                                  \
    (zIn) += 2;                                                           \
  }                                                                       \
}

/*
** Move zIn backwards over one character of little-endian UTF-16. zIn must
** point just past the character to skip (for example at the end of the
** string).
**
** The code unit immediately before zIn is examined: if it is a low
** surrogate (high byte, stored second, in 0xDC..0xDF) the character is a
** surrogate pair and zIn moves back 4 bytes, otherwise 2.
**
** The previous form tested the byte AT zIn, which belongs to the
** following code unit, so pairs were never recognised and an unrelated
** byte could trigger a 4-byte step.
**
** Note: there is no start-of-buffer check. A low surrogate that is the
** very first code unit makes zIn step before the buffer, so the caller
** must bound the result.
*/
#define RSKIP_UTF16LE(zIn){                                               \
  (zIn) -= 2;                                                             \
  if( ((zIn)[1] & 0xFC)==0xDC ){                                          \
    (zIn) -= 2;                                                           \
  }                                                                       \
}

/*
** Move zIn backwards over one character of big-endian UTF-16. zIn must
** point just past the character to skip (for example at the end of the
** string).
**
** The code unit immediately before zIn is examined: if it is a low
** surrogate (high byte, stored first, in 0xDC..0xDF) the character is a
** surrogate pair and zIn moves back 4 bytes, otherwise 2.
**
** The previous form stepped back one byte and tested that byte, which in
** big-endian order is the LOW byte of the preceding code unit, so
** surrogate pairs were not recognised reliably.
**
** Note: there is no start-of-buffer check. A low surrogate that is the
** very first code unit makes zIn step before the buffer, so the caller
** must bound the result.
*/
#define RSKIP_UTF16BE(zIn){                                               \
  (zIn) -= 2;                                                             \
  if( ((zIn)[0] & 0xFC)==0xDC ){                                          \
    (zIn) -= 2;                                                           \
  }                                                                       \
}

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

247

/*

248

** If the TRANSLATE_TRACE macro is defined, the value of each Mem is

249

** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().

250

*/

251

/* #define TRANSLATE_TRACE 1 */

252

drh

2004-11-14 21:56:29 +0000

[diff] [blame]

253

#ifndef SQLITE_OMIT_UTF16

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

254

/*

255

** This routine transforms the internal text encoding used by pMem to

256

** desiredEnc. It is an error if the string is already of the desired

257

** encoding, or if *pMem does not contain a string value.

258

*/

259

int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){

260

unsigned char zShort[NBFS]; /* Temporary short output buffer */

261

int len; /* Maximum length of output string in bytes */

262

unsigned char *zOut; /* Output buffer */

263

unsigned char *zIn; /* Input iterator */

264

unsigned char *zTerm; /* End of input */

265

unsigned char *z; /* Output iterator */

drh

a39f4c5

2006-10-04 15:23:21 +0000

[diff] [blame]

266

unsigned int c;

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

267

268

assert( pMem->flags&MEM_Str );

269

assert( pMem->enc!=desiredEnc );

270

assert( pMem->enc!=0 );

271

assert( pMem->n>=0 );

272

danielk1977

b5402fb

2005-01-12 07:15:04 +0000

[diff] [blame]

273

#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

274

{

275

char zBuf[100];

drh

7416170

2006-02-24 02:53:49 +0000

[diff] [blame]

276

sqlite3VdbeMemPrettyPrint(pMem, zBuf);

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

277

fprintf(stderr, "INPUT: %s\n", zBuf);

danielk1977

ad7dd42

2004-06-06 12:41:49 +0000

[diff] [blame]

}

#endif

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

281

/* If the translation is between UTF-16 little and big endian, then

282

** all that is required is to swap the byte order. This case is handled

283

** differently from the others.

danielk1977

2004-05-06 23:37:52 +0000

[diff] [blame]

284

*/

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

285

if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){

286

u8 temp;

drh

71c697e

2004-08-08 23:39:19 +0000

[diff] [blame]

287

int rc;

288

rc = sqlite3VdbeMemMakeWriteable(pMem);

289

if( rc!=SQLITE_OK ){

290

assert( rc==SQLITE_NOMEM );

291

return SQLITE_NOMEM;

292

}

drh

2005-12-09 20:02:05 +0000

[diff] [blame]

293

zIn = (u8*)pMem->z;

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

294

zTerm = &zIn[pMem->n];

while( zIn<zTerm ){

temp = *zIn;

*zIn = *(zIn+1);

zIn++;

*zIn++ = temp;

}

pMem->enc = desiredEnc;

goto translate_out;

}

danielk1977

2004-06-23 00:23:49 +0000

[diff] [blame]

305

/* Set len to the maximum number of bytes required in the output buffer. */

306

if( desiredEnc==SQLITE_UTF8 ){

307

/* When converting from UTF-16, the maximum growth results from

drh

a49b861

2006-04-16 12:05:03 +0000

[diff] [blame]

308

** translating a 2-byte character to a 4-byte UTF-8 character.

309

** A single byte is required for the output string

danielk1977

d7e6964

2004-06-23 00:23:49 +0000

[diff] [blame]

310

** nul-terminator.

311

*/

drh

a49b861

2006-04-16 12:05:03 +0000

[diff] [blame]

312

len = pMem->n * 2 + 1;

danielk1977

d7e6964

2004-06-23 00:23:49 +0000

[diff] [blame]

313

}else{

314

/* When converting from UTF-8 to UTF-16 the maximum growth is caused

315

** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16

316

** character. Two bytes are required in the output buffer for the

317

** nul-terminator.

318

*/

319

len = pMem->n * 2 + 2;

320

}

321

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

322

/* Set zIn to point at the start of the input buffer and zTerm to point 1

323

** byte past the end.

324

**

325

** Variable zOut is set to point at the output buffer. This may be space

326

** obtained from malloc(), or Mem.zShort, if it large enough and not in

327

** use, or the zShort array on the stack (see above).

328

*/

drh

2005-12-09 20:02:05 +0000

[diff] [blame]

329

zIn = (u8*)pMem->z;

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

330

zTerm = &zIn[pMem->n];

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

331

if( len>NBFS ){

332

zOut = sqliteMallocRaw(len);

333

if( !zOut ) return SQLITE_NOMEM;

334

}else{

danielk1977

2004-06-23 13:46:32 +0000

[diff] [blame]

335

zOut = zShort;

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

}

z = zOut;

if( pMem->enc==SQLITE_UTF8 ){

340

if( desiredEnc==SQLITE_UTF16LE ){

341

/* UTF-8 -> UTF-16 Little-endian */

while( zIn<zTerm ){

READ_UTF8(zIn, c);

WRITE_UTF16LE(z, c);

}

drh

2004-09-24 23:20:51 +0000

[diff] [blame]

346

}else{

347

assert( desiredEnc==SQLITE_UTF16BE );

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

348

/* UTF-8 -> UTF-16 Big-endian */

while( zIn<zTerm ){

READ_UTF8(zIn, c);

WRITE_UTF16BE(z, c);

}

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

353

}

drh

2004-09-24 23:20:51 +0000

[diff] [blame]

354

pMem->n = z - zOut;

355

*z++ = 0;

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

356

}else{

357

assert( desiredEnc==SQLITE_UTF8 );

358

if( pMem->enc==SQLITE_UTF16LE ){

359

/* UTF-16 Little-endian -> UTF-8 */

360

while( zIn<zTerm ){

361

READ_UTF16LE(zIn, c);

362

WRITE_UTF8(z, c);

363

}

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

364

}else{

365

/* UTF-16 Little-endian -> UTF-8 */

366

while( zIn<zTerm ){

367

READ_UTF16BE(zIn, c);

368

WRITE_UTF8(z, c);

369

}

danielk1977

2004-05-06 23:37:52 +0000

[diff] [blame]

370

}

drh

2004-09-24 23:20:51 +0000

[diff] [blame]

371

pMem->n = z - zOut;

danielk1977

2004-05-06 23:37:52 +0000

[diff] [blame]

372

}

drh

2004-09-24 23:20:51 +0000

[diff] [blame]

373

*z = 0;

danielk1977

d7e6964

2004-06-23 00:23:49 +0000

[diff] [blame]

374

assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );

danielk1977

2004-05-06 23:37:52 +0000

[diff] [blame]

375

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

376

sqlite3VdbeMemRelease(pMem);

377

pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short);

378

pMem->enc = desiredEnc;

danielk1977

2004-06-23 13:46:32 +0000

[diff] [blame]

379

if( zOut==zShort ){

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

380

memcpy(pMem->zShort, zOut, len);

drh

2005-12-09 20:02:05 +0000

[diff] [blame]

381

zOut = (u8*)pMem->zShort;

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

382

pMem->flags |= (MEM_Term|MEM_Short);

383

}else{

384

pMem->flags |= (MEM_Term|MEM_Dyn);

385

}

drh

2005-12-09 20:02:05 +0000

[diff] [blame]

386

pMem->z = (char*)zOut;

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

387

388

translate_out:

danielk1977

b5402fb

2005-01-12 07:15:04 +0000

[diff] [blame]

389

#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

390

{

391

char zBuf[100];

drh

7416170

2006-02-24 02:53:49 +0000

[diff] [blame]

392

sqlite3VdbeMemPrettyPrint(pMem, zBuf);

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

393

fprintf(stderr, "OUTPUT: %s\n", zBuf);

394

}

395

#endif

396

return SQLITE_OK;

danielk1977

2004-05-06 23:37:52 +0000

[diff] [blame]

397

}

398

danielk1977

2004-05-23 13:30:58 +0000

[diff] [blame]

399

/*

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

400

** This routine checks for a byte-order mark at the beginning of the

401

** UTF-16 string stored in *pMem. If one is present, it is removed and

402

** the encoding of the Mem adjusted. This routine does not do any

403

** byte-swapping, it just sets Mem.enc appropriately.

404

**

405

** The allocation (static, dynamic etc.) and encoding of the Mem may be

406

** changed by this function.

danielk1977

2004-05-23 13:30:58 +0000

[diff] [blame]

407

*/

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

408

int sqlite3VdbeMemHandleBom(Mem *pMem){

int rc = SQLITE_OK;

u8 bom = 0;

if( pMem->n<0 || pMem->n>1 ){

413

u8 b1 = *(u8 *)pMem->z;

414

u8 b2 = *(((u8 *)pMem->z) + 1);

danielk1977

2004-05-23 13:30:58 +0000

[diff] [blame]

415

if( b1==0xFE && b2==0xFF ){

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

416

bom = SQLITE_UTF16BE;

danielk1977

2004-05-23 13:30:58 +0000

[diff] [blame]

417

}

418

if( b1==0xFF && b2==0xFE ){

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

419

bom = SQLITE_UTF16LE;

danielk1977

2004-05-23 13:30:58 +0000

[diff] [blame]

420

}

421

}

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

422

423

if( bom ){

danielk1977

2004-06-23 13:46:32 +0000

[diff] [blame]

424

/* This function is called as soon as a string is stored in a Mem*,

425

** from within sqlite3VdbeMemSetStr(). At that point it is not possible

426

** for the string to be stored in Mem.zShort, or for it to be stored

427

** in dynamic memory with no destructor.

428

*/

429

assert( !(pMem->flags&MEM_Short) );

430

assert( !(pMem->flags&MEM_Dyn) || pMem->xDel );

431

if( pMem->flags & MEM_Dyn ){

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

432

void (*xDel)(void*) = pMem->xDel;

char *z = pMem->z;

pMem->z = 0;

pMem->xDel = 0;

rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT);

danielk1977

2004-06-23 13:46:32 +0000

[diff] [blame]

437

xDel(z);

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

438

}else{

439

rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom,

440

SQLITE_TRANSIENT);

441

}

danielk1977

2004-05-06 23:37:52 +0000

[diff] [blame]

442

}

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

443

return rc;

danielk1977

2004-05-06 23:37:52 +0000

[diff] [blame]

444

}

drh

2004-11-14 21:56:29 +0000

[diff] [blame]

445

#endif /* SQLITE_OMIT_UTF16 */

danielk1977

2004-05-06 23:37:52 +0000

[diff] [blame]

446

447

/*
** z is a UTF-8 encoded unicode string. If nByte is less than zero,
** return the number of unicode characters in z up to (but not including)
** the first 0x00 byte. If nByte is not less than zero, return the
** number of unicode characters in the first nByte of z (or up to
** the first 0x00, whichever comes first).
*/
int sqlite3utf8CharLen(const char *z, int nByte){
  int r = 0;               /* Number of characters counted so far */
  const char *zTerm;       /* One past the last byte that may be examined */
  if( nByte>=0 ){
    zTerm = &z[nByte];
  }else{
    /* No byte limit: use the highest possible address as the bound. */
    zTerm = (const char *)(-1);
  }
  assert( z<=zTerm );
  /* Test the bound before dereferencing so that a counted buffer is
  ** never read at or beyond z[nByte]. */
  while( z<zTerm && *z!=0 ){
    SKIP_UTF8(z);
    r++;
  }
  return r;
}

469

drh

2004-11-14 21:56:29 +0000

[diff] [blame]

470

#ifndef SQLITE_OMIT_UTF16

danielk1977

2004-05-20 11:00:52 +0000

[diff] [blame]

471

/*

drh

af9a7c2

2005-12-15 03:04:10 +0000

[diff] [blame]

472

** Convert a UTF-16 string in the native encoding into a UTF-8 string.

473

** Memory to hold the UTF-8 string is obtained from malloc and must be

474

** freed by the calling function.

475

**

476

** NULL is returned if there is an allocation error.

477

*/

478

char *sqlite3utf16to8(const void *z, int nByte){

479

Mem m;

480

memset(&m, 0, sizeof(m));

481

sqlite3VdbeMemSetStr(&m, z, nByte, SQLITE_UTF16NATIVE, SQLITE_STATIC);

482

sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);

drh

66f4a06

2006-07-26 14:57:30 +0000

[diff] [blame]

483

assert( (m.flags & MEM_Term)!=0 || sqlite3MallocFailed() );

484

assert( (m.flags & MEM_Str)!=0 || sqlite3MallocFailed() );

danielk1977

e725929

2006-01-13 06:33:23 +0000

[diff] [blame]

485

return (m.flags & MEM_Dyn)!=0 ? m.z : sqliteStrDup(m.z);

drh

af9a7c2

2005-12-15 03:04:10 +0000

[diff] [blame]

486

}

487

488

/*

danielk1977

2004-05-20 11:00:52 +0000

[diff] [blame]

489

** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,

490

** return the number of bytes up to (but not including), the first pair

491

** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,

492

** then return the number of bytes in the first nChar unicode characters

493

** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).

494

*/

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

495

int sqlite3utf16ByteLen(const void *zIn, int nChar){

drh

a39f4c5

2006-10-04 15:23:21 +0000

[diff] [blame]

496

unsigned int c = 1;

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

497

char const *z = zIn;

498

int n = 0;

499

if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){

danielk1977

161fb79

2006-01-24 10:58:21 +0000

[diff] [blame]

500

/* Using an "if (SQLITE_UTF16NATIVE==SQLITE_UTF16BE)" construct here

501

** and in other parts of this file means that at one branch will

502

** not be covered by coverage testing on any single host. But coverage

503

** will be complete if the tests are run on both a little-endian and

504

** big-endian host. Because both the UTF16NATIVE and SQLITE_UTF16BE

505

** macros are constant at compile time the compiler can determine

506

** which branch will be followed. It is therefore assumed that no runtime

507

** penalty is paid for this "if" statement.

508

*/

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

509

while( c && ((nChar<0) || n<nChar) ){

510

READ_UTF16BE(z, c);

511

n++;

danielk1977

2004-05-20 11:00:52 +0000

[diff] [blame]

512

}

danielk1977

2004-05-20 11:00:52 +0000

[diff] [blame]

513

}else{

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

514

while( c && ((nChar<0) || n<nChar) ){

515

READ_UTF16LE(z, c);

516

n++;

danielk1977

2004-05-20 11:00:52 +0000

[diff] [blame]

517

}

danielk1977

2004-05-20 11:00:52 +0000

[diff] [blame]

518

}

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

519

return (z-(char const *)zIn)-((c==0)?2:0);

danielk1977

2004-05-06 23:37:52 +0000

[diff] [blame]

520

}

521

drh

2004-05-04 15:00:46 +0000

[diff] [blame]

522

/*

danielk1977

f461889

2004-06-28 13:09:11 +0000

[diff] [blame]

523

** UTF-16 implementation of the substr()

524

*/

525

void sqlite3utf16Substr(

526

sqlite3_context *context,

int argc,

sqlite3_value **argv

){

int y, z;

unsigned char const *zStr;

532

unsigned char const *zStrEnd;

533

unsigned char const *zStart;

534

unsigned char const *zEnd;

535

int i;

536

537

zStr = (unsigned char const *)sqlite3_value_text16(argv[0]);

538

zStrEnd = &zStr[sqlite3_value_bytes16(argv[0])];

539

y = sqlite3_value_int(argv[1]);

540

z = sqlite3_value_int(argv[2]);

if( y>0 ){

y = y-1;

zStart = zStr;

if( SQLITE_UTF16BE==SQLITE_UTF16NATIVE ){

546

for(i=0; i<y && zStart<zStrEnd; i++) SKIP_UTF16BE(zStart);

547

}else{

548

for(i=0; i<y && zStart<zStrEnd; i++) SKIP_UTF16LE(zStart);

}

}else{

zStart = zStrEnd;

if( SQLITE_UTF16BE==SQLITE_UTF16NATIVE ){

553

for(i=y; i<0 && zStart>zStr; i++) RSKIP_UTF16BE(zStart);

554

}else{

555

for(i=y; i<0 && zStart>zStr; i++) RSKIP_UTF16LE(zStart);

556

}

557

for(; i<0; i++) z -= 1;

}

zEnd = zStart;

if( SQLITE_UTF16BE==SQLITE_UTF16NATIVE ){

562

for(i=0; i<z && zEnd<zStrEnd; i++) SKIP_UTF16BE(zEnd);

563

}else{

564

for(i=0; i<z && zEnd<zStrEnd; i++) SKIP_UTF16LE(zEnd);

565

}

566

567

sqlite3_result_text16(context, zStart, zEnd-zStart, SQLITE_TRANSIENT);

568

}

569

drh

38f8271

2004-06-18 17:10:16 +0000

[diff] [blame]

570

#if defined(SQLITE_TEST)

danielk1977

2004-06-18 04:24:54 +0000

[diff] [blame]

571

/*
** This routine is called from the TCL test function "translate_selftest".
** It checks that the primitives for serializing and deserializing
** characters in each encoding are inverses of each other.
**
** Every code point from 0 through 0x10FFFF is written and read back, and
** assert() is used to check both the decoded value and that the reader
** consumed exactly the bytes the writer produced.
*/
void sqlite3utfSelfTest(void){
  unsigned int i, t;       /* Code point under test; expected read-back */
  unsigned char zBuf[20];  /* Scratch buffer holding one encoded character */
  unsigned char *z;        /* Cursor into zBuf */
  int n;                   /* Number of bytes written for the character */
  unsigned int c;          /* Code point read back */

  /* UTF-8 round trip. Surrogates and 0xFFFE/0xFFFF are expected to read
  ** back as the replacement character U+FFFD. */
  for(i=0; i<0x00110000; i++){
    z = zBuf;
    WRITE_UTF8(z, i);
    n = z-zBuf;
    z = zBuf;
    READ_UTF8(z, c);
    t = i;
    if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
    if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
    assert( c==t );
    assert( (z-zBuf)==n );
  }

  /* UTF-16LE round trip. The range 0xD800..0xE000 is skipped: surrogates
  ** cannot round-trip as single characters. */
  for(i=0; i<0x00110000; i++){
    if( i>=0xD800 && i<=0xE000 ) continue;
    z = zBuf;
    WRITE_UTF16LE(z, i);
    n = z-zBuf;
    z = zBuf;
    READ_UTF16LE(z, c);
    assert( c==i );
    assert( (z-zBuf)==n );
  }

  /* UTF-16BE round trip, with the same range skipped. */
  for(i=0; i<0x00110000; i++){
    if( i>=0xD800 && i<=0xE000 ) continue;
    z = zBuf;
    WRITE_UTF16BE(z, i);
    n = z-zBuf;
    z = zBuf;
    READ_UTF16BE(z, c);
    assert( c==i );
    assert( (z-zBuf)==n );
  }
}

drh