blob: 951988ed9ca3d1665146aee054b43fd493cfb283 [file] [log] [blame]
Jungshik Shin87232d82017-05-13 21:10:13 -07001// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07002// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00003/*
4**********************************************************************
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07005* Copyright (C) 2002-2016, International Business Machines
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00006* Corporation and others. All Rights Reserved.
7**********************************************************************
8* file name: ucnv_u8.c
Jungshik Shin87232d82017-05-13 21:10:13 -07009* encoding: UTF-8
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000010* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2002jul01
14* created by: Markus W. Scherer
15*
16* UTF-8 converter implementation. Used to be in ucnv_utf.c.
17*
18* Also, CESU-8 implementation, see UTR 26.
19* The CESU-8 converter uses all the same functions as the
20* UTF-8 converter, with a branch for converting supplementary code points.
21*/
22
23#include "unicode/utypes.h"
24
25#if !UCONFIG_NO_CONVERSION
26
27#include "unicode/ucnv.h"
28#include "unicode/utf.h"
29#include "unicode/utf8.h"
30#include "unicode/utf16.h"
31#include "ucnv_bld.h"
32#include "ucnv_cnv.h"
33#include "cmemory.h"
Jungshik Shinb3189662017-11-07 11:18:34 -080034#include "ustr_imp.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000035
36/* Prototypes --------------------------------------------------------------- */
37
38/* Keep these here to make finicky compilers happy */
39
40U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
41 UErrorCode *err);
42U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
43 UErrorCode *err);
44
45
46/* UTF-8 -------------------------------------------------------------------- */
47
/* Largest value that fits in a single 16-bit code unit. */
#define MAXIMUM_UCS2            0x0000FFFF

/*
 * Bias left over after a multi-byte sequence has been folded with
 * ch = (ch << 6) + nextByte, indexed by the sequence length in bytes.
 * Subtracting the entry strips the lead-byte marker bits and the 0x80
 * marker of every trail byte. Lengths 0 and 1 carry no bias.
 */
static const uint32_t offsetsFromUTF8[5] = {
    0x00000000u,    /* length 0 */
    0x00000000u,    /* length 1 */
    0x00003080u,    /* length 2: (0xC0 << 6) + 0x80 */
    0x000E2080u,    /* length 3: (0xE0 << 12) + (0x80 << 6) + 0x80 */
    0x03C82080u     /* length 4: (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
};
54
Jungshik Shin (jungshik at google)afd723b2015-01-21 13:24:04 -080055static UBool hasCESU8Data(const UConverter *cnv)
56{
Jungshik Shin70f82502016-01-29 00:32:36 -080057#if UCONFIG_ONLY_HTML_CONVERSION
Jungshik Shin (jungshik at google)afd723b2015-01-21 13:24:04 -080058 return FALSE;
59#else
60 return (UBool)(cnv->sharedData == &_CESU8Data);
61#endif
62}
Jungshik Shin87232d82017-05-13 21:10:13 -070063U_CDECL_BEGIN
64static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000065 UErrorCode * err)
66{
67 UConverter *cnv = args->converter;
68 const unsigned char *mySource = (unsigned char *) args->source;
69 UChar *myTarget = args->target;
70 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
71 const UChar *targetLimit = args->targetLimit;
72 unsigned char *toUBytes = cnv->toUBytes;
Jungshik Shin (jungshik at google)afd723b2015-01-21 13:24:04 -080073 UBool isCESU8 = hasCESU8Data(cnv);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000074 uint32_t ch, ch2 = 0;
75 int32_t i, inBytes;
Jungshik Shin (jungshik at google)afd723b2015-01-21 13:24:04 -080076
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000077 /* Restore size of current sequence */
78 if (cnv->toUnicodeStatus && myTarget < targetLimit)
79 {
80 inBytes = cnv->mode; /* restore # of bytes to consume */
81 i = cnv->toULength; /* restore # of bytes consumed */
82 cnv->toULength = 0;
83
84 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
85 cnv->toUnicodeStatus = 0;
86 goto morebytes;
87 }
88
89
90 while (mySource < sourceLimit && myTarget < targetLimit)
91 {
92 ch = *(mySource++);
Jungshik Shinb3189662017-11-07 11:18:34 -080093 if (U8_IS_SINGLE(ch)) /* Simple case */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000094 {
95 *(myTarget++) = (UChar) ch;
96 }
97 else
98 {
99 /* store the first char */
100 toUBytes[0] = (char)ch;
Jungshik Shinb3189662017-11-07 11:18:34 -0800101 inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000102 i = 1;
103
104morebytes:
105 while (i < inBytes)
106 {
107 if (mySource < sourceLimit)
108 {
109 toUBytes[i] = (char) (ch2 = *mySource);
Jungshik Shinb3189662017-11-07 11:18:34 -0800110 if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
111 !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000112 {
113 break; /* i < inBytes */
114 }
115 ch = (ch << 6) + ch2;
116 ++mySource;
117 i++;
118 }
119 else
120 {
121 /* stores a partially calculated target*/
122 cnv->toUnicodeStatus = ch;
123 cnv->mode = inBytes;
124 cnv->toULength = (int8_t) i;
125 goto donefornow;
126 }
127 }
128
Jungshik Shinb3189662017-11-07 11:18:34 -0800129 // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
130 if (i == inBytes && (!isCESU8 || i <= 3))
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000131 {
Jungshik Shinb3189662017-11-07 11:18:34 -0800132 /* Remove the accumulated high bits */
133 ch -= offsetsFromUTF8[inBytes];
134
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000135 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
136 if (ch <= MAXIMUM_UCS2)
137 {
138 /* fits in 16 bits */
139 *(myTarget++) = (UChar) ch;
140 }
141 else
142 {
143 /* write out the surrogates */
Jungshik Shinb3189662017-11-07 11:18:34 -0800144 *(myTarget++) = U16_LEAD(ch);
145 ch = U16_TRAIL(ch);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000146 if (myTarget < targetLimit)
147 {
148 *(myTarget++) = (UChar)ch;
149 }
150 else
151 {
152 /* Put in overflow buffer (not handled here) */
153 cnv->UCharErrorBuffer[0] = (UChar) ch;
154 cnv->UCharErrorBufferLength = 1;
155 *err = U_BUFFER_OVERFLOW_ERROR;
156 break;
157 }
158 }
159 }
160 else
161 {
162 cnv->toULength = (int8_t)i;
163 *err = U_ILLEGAL_CHAR_FOUND;
164 break;
165 }
166 }
167 }
168
169donefornow:
170 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
171 {
172 /* End of target buffer */
173 *err = U_BUFFER_OVERFLOW_ERROR;
174 }
175
176 args->target = myTarget;
177 args->source = (const char *) mySource;
178}
179
Jungshik Shin87232d82017-05-13 21:10:13 -0700180static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000181 UErrorCode * err)
182{
183 UConverter *cnv = args->converter;
184 const unsigned char *mySource = (unsigned char *) args->source;
185 UChar *myTarget = args->target;
186 int32_t *myOffsets = args->offsets;
187 int32_t offsetNum = 0;
188 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
189 const UChar *targetLimit = args->targetLimit;
190 unsigned char *toUBytes = cnv->toUBytes;
Jungshik Shin (jungshik at google)afd723b2015-01-21 13:24:04 -0800191 UBool isCESU8 = hasCESU8Data(cnv);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000192 uint32_t ch, ch2 = 0;
193 int32_t i, inBytes;
194
195 /* Restore size of current sequence */
196 if (cnv->toUnicodeStatus && myTarget < targetLimit)
197 {
198 inBytes = cnv->mode; /* restore # of bytes to consume */
199 i = cnv->toULength; /* restore # of bytes consumed */
200 cnv->toULength = 0;
201
202 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
203 cnv->toUnicodeStatus = 0;
204 goto morebytes;
205 }
206
207 while (mySource < sourceLimit && myTarget < targetLimit)
208 {
209 ch = *(mySource++);
Jungshik Shinb3189662017-11-07 11:18:34 -0800210 if (U8_IS_SINGLE(ch)) /* Simple case */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000211 {
212 *(myTarget++) = (UChar) ch;
213 *(myOffsets++) = offsetNum++;
214 }
215 else
216 {
217 toUBytes[0] = (char)ch;
Jungshik Shinb3189662017-11-07 11:18:34 -0800218 inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000219 i = 1;
220
221morebytes:
222 while (i < inBytes)
223 {
224 if (mySource < sourceLimit)
225 {
226 toUBytes[i] = (char) (ch2 = *mySource);
Jungshik Shinb3189662017-11-07 11:18:34 -0800227 if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
228 !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000229 {
230 break; /* i < inBytes */
231 }
232 ch = (ch << 6) + ch2;
233 ++mySource;
234 i++;
235 }
236 else
237 {
238 cnv->toUnicodeStatus = ch;
239 cnv->mode = inBytes;
240 cnv->toULength = (int8_t)i;
241 goto donefornow;
242 }
243 }
244
Jungshik Shinb3189662017-11-07 11:18:34 -0800245 // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
246 if (i == inBytes && (!isCESU8 || i <= 3))
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000247 {
Jungshik Shinb3189662017-11-07 11:18:34 -0800248 /* Remove the accumulated high bits */
249 ch -= offsetsFromUTF8[inBytes];
250
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000251 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
252 if (ch <= MAXIMUM_UCS2)
253 {
254 /* fits in 16 bits */
255 *(myTarget++) = (UChar) ch;
256 *(myOffsets++) = offsetNum;
257 }
258 else
259 {
260 /* write out the surrogates */
Jungshik Shinb3189662017-11-07 11:18:34 -0800261 *(myTarget++) = U16_LEAD(ch);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000262 *(myOffsets++) = offsetNum;
Jungshik Shinb3189662017-11-07 11:18:34 -0800263 ch = U16_TRAIL(ch);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000264 if (myTarget < targetLimit)
265 {
266 *(myTarget++) = (UChar)ch;
267 *(myOffsets++) = offsetNum;
268 }
269 else
270 {
271 cnv->UCharErrorBuffer[0] = (UChar) ch;
272 cnv->UCharErrorBufferLength = 1;
273 *err = U_BUFFER_OVERFLOW_ERROR;
274 }
275 }
276 offsetNum += i;
277 }
278 else
279 {
280 cnv->toULength = (int8_t)i;
281 *err = U_ILLEGAL_CHAR_FOUND;
282 break;
283 }
284 }
285 }
286
287donefornow:
288 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
289 { /* End of target buffer */
290 *err = U_BUFFER_OVERFLOW_ERROR;
291 }
292
293 args->target = myTarget;
294 args->source = (const char *) mySource;
295 args->offsets = myOffsets;
296}
Jungshik Shin87232d82017-05-13 21:10:13 -0700297U_CDECL_END
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000298
Jungshik Shin87232d82017-05-13 21:10:13 -0700299U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000300 UErrorCode * err)
301{
302 UConverter *cnv = args->converter;
303 const UChar *mySource = args->source;
304 const UChar *sourceLimit = args->sourceLimit;
305 uint8_t *myTarget = (uint8_t *) args->target;
306 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
307 uint8_t *tempPtr;
308 UChar32 ch;
309 uint8_t tempBuf[4];
310 int32_t indexToWrite;
Jungshik Shin (jungshik at google)afd723b2015-01-21 13:24:04 -0800311 UBool isNotCESU8 = !hasCESU8Data(cnv);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000312
313 if (cnv->fromUChar32 && myTarget < targetLimit)
314 {
315 ch = cnv->fromUChar32;
316 cnv->fromUChar32 = 0;
317 goto lowsurrogate;
318 }
319
320 while (mySource < sourceLimit && myTarget < targetLimit)
321 {
322 ch = *(mySource++);
323
324 if (ch < 0x80) /* Single byte */
325 {
326 *(myTarget++) = (uint8_t) ch;
327 }
328 else if (ch < 0x800) /* Double byte */
329 {
330 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
331 if (myTarget < targetLimit)
332 {
333 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
334 }
335 else
336 {
337 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
338 cnv->charErrorBufferLength = 1;
339 *err = U_BUFFER_OVERFLOW_ERROR;
340 }
341 }
342 else {
343 /* Check for surrogates */
344 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
345lowsurrogate:
346 if (mySource < sourceLimit) {
347 /* test both code units */
348 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
349 /* convert and consume this supplementary code point */
350 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
351 ++mySource;
352 /* exit this condition tree */
353 }
354 else {
355 /* this is an unpaired trail or lead code unit */
356 /* callback(illegal) */
357 cnv->fromUChar32 = ch;
358 *err = U_ILLEGAL_CHAR_FOUND;
359 break;
360 }
361 }
362 else {
363 /* no more input */
364 cnv->fromUChar32 = ch;
365 break;
366 }
367 }
368
369 /* Do we write the buffer directly for speed,
370 or do we have to be careful about target buffer space? */
371 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
372
373 if (ch <= MAXIMUM_UCS2) {
374 indexToWrite = 2;
375 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
376 }
377 else {
378 indexToWrite = 3;
379 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
380 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
381 }
382 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
383 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
384
385 if (tempPtr == myTarget) {
386 /* There was enough space to write the codepoint directly. */
387 myTarget += (indexToWrite + 1);
388 }
389 else {
390 /* We might run out of room soon. Write it slowly. */
391 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
392 if (myTarget < targetLimit) {
393 *(myTarget++) = *tempPtr;
394 }
395 else {
396 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
397 *err = U_BUFFER_OVERFLOW_ERROR;
398 }
399 }
400 }
401 }
402 }
403
404 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
405 {
406 *err = U_BUFFER_OVERFLOW_ERROR;
407 }
408
409 args->target = (char *) myTarget;
410 args->source = mySource;
411}
412
Jungshik Shin87232d82017-05-13 21:10:13 -0700413U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000414 UErrorCode * err)
415{
416 UConverter *cnv = args->converter;
417 const UChar *mySource = args->source;
418 int32_t *myOffsets = args->offsets;
419 const UChar *sourceLimit = args->sourceLimit;
420 uint8_t *myTarget = (uint8_t *) args->target;
421 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
422 uint8_t *tempPtr;
423 UChar32 ch;
424 int32_t offsetNum, nextSourceIndex;
425 int32_t indexToWrite;
426 uint8_t tempBuf[4];
Jungshik Shin (jungshik at google)afd723b2015-01-21 13:24:04 -0800427 UBool isNotCESU8 = !hasCESU8Data(cnv);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000428
429 if (cnv->fromUChar32 && myTarget < targetLimit)
430 {
431 ch = cnv->fromUChar32;
432 cnv->fromUChar32 = 0;
433 offsetNum = -1;
434 nextSourceIndex = 0;
435 goto lowsurrogate;
436 } else {
437 offsetNum = 0;
438 }
439
440 while (mySource < sourceLimit && myTarget < targetLimit)
441 {
442 ch = *(mySource++);
443
444 if (ch < 0x80) /* Single byte */
445 {
446 *(myOffsets++) = offsetNum++;
447 *(myTarget++) = (char) ch;
448 }
449 else if (ch < 0x800) /* Double byte */
450 {
451 *(myOffsets++) = offsetNum;
452 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
453 if (myTarget < targetLimit)
454 {
455 *(myOffsets++) = offsetNum++;
456 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
457 }
458 else
459 {
460 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
461 cnv->charErrorBufferLength = 1;
462 *err = U_BUFFER_OVERFLOW_ERROR;
463 }
464 }
465 else
466 /* Check for surrogates */
467 {
468 nextSourceIndex = offsetNum + 1;
469
470 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
471lowsurrogate:
472 if (mySource < sourceLimit) {
473 /* test both code units */
474 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
475 /* convert and consume this supplementary code point */
476 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
477 ++mySource;
478 ++nextSourceIndex;
479 /* exit this condition tree */
480 }
481 else {
482 /* this is an unpaired trail or lead code unit */
483 /* callback(illegal) */
484 cnv->fromUChar32 = ch;
485 *err = U_ILLEGAL_CHAR_FOUND;
486 break;
487 }
488 }
489 else {
490 /* no more input */
491 cnv->fromUChar32 = ch;
492 break;
493 }
494 }
495
496 /* Do we write the buffer directly for speed,
497 or do we have to be careful about target buffer space? */
498 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
499
500 if (ch <= MAXIMUM_UCS2) {
501 indexToWrite = 2;
502 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
503 }
504 else {
505 indexToWrite = 3;
506 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
507 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
508 }
509 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
510 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
511
512 if (tempPtr == myTarget) {
513 /* There was enough space to write the codepoint directly. */
514 myTarget += (indexToWrite + 1);
515 myOffsets[0] = offsetNum;
516 myOffsets[1] = offsetNum;
517 myOffsets[2] = offsetNum;
518 if (indexToWrite >= 3) {
519 myOffsets[3] = offsetNum;
520 }
521 myOffsets += (indexToWrite + 1);
522 }
523 else {
524 /* We might run out of room soon. Write it slowly. */
525 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
526 if (myTarget < targetLimit)
527 {
528 *(myOffsets++) = offsetNum;
529 *(myTarget++) = *tempPtr;
530 }
531 else
532 {
533 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
534 *err = U_BUFFER_OVERFLOW_ERROR;
535 }
536 }
537 }
538 offsetNum = nextSourceIndex;
539 }
540 }
541
542 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
543 {
544 *err = U_BUFFER_OVERFLOW_ERROR;
545 }
546
547 args->target = (char *) myTarget;
548 args->source = mySource;
549 args->offsets = myOffsets;
550}
551
Jungshik Shin87232d82017-05-13 21:10:13 -0700552U_CDECL_BEGIN
553static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000554 UErrorCode *err) {
555 UConverter *cnv;
556 const uint8_t *sourceInitial;
557 const uint8_t *source;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000558 uint8_t myByte;
559 UChar32 ch;
Jungshik Shinb3189662017-11-07 11:18:34 -0800560 int8_t i;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000561
562 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
563
564 cnv = args->converter;
565 sourceInitial = source = (const uint8_t *)args->source;
566 if (source >= (const uint8_t *)args->sourceLimit)
567 {
568 /* no input */
569 *err = U_INDEX_OUTOFBOUNDS_ERROR;
570 return 0xffff;
571 }
572
573 myByte = (uint8_t)*(source++);
Jungshik Shinb3189662017-11-07 11:18:34 -0800574 if (U8_IS_SINGLE(myByte))
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000575 {
576 args->source = (const char *)source;
577 return (UChar32)myByte;
578 }
579
Jungshik Shinb3189662017-11-07 11:18:34 -0800580 uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
581 if (countTrailBytes == 0) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000582 cnv->toUBytes[0] = myByte;
583 cnv->toULength = 1;
584 *err = U_ILLEGAL_CHAR_FOUND;
585 args->source = (const char *)source;
586 return 0xffff;
587 }
588
589 /*The byte sequence is longer than the buffer area passed*/
Jungshik Shinb3189662017-11-07 11:18:34 -0800590 if (((const char *)source + countTrailBytes) > args->sourceLimit)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000591 {
592 /* check if all of the remaining bytes are trail bytes */
Jungshik Shinb3189662017-11-07 11:18:34 -0800593 uint16_t extraBytesToWrite = countTrailBytes + 1;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000594 cnv->toUBytes[0] = myByte;
595 i = 1;
596 *err = U_TRUNCATED_CHAR_FOUND;
597 while(source < (const uint8_t *)args->sourceLimit) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800598 uint8_t b = *source;
599 if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
600 cnv->toUBytes[i++] = b;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000601 ++source;
602 } else {
603 /* error even before we run out of input */
604 *err = U_ILLEGAL_CHAR_FOUND;
605 break;
606 }
607 }
608 cnv->toULength = i;
609 args->source = (const char *)source;
610 return 0xffff;
611 }
612
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000613 ch = myByte << 6;
Jungshik Shinb3189662017-11-07 11:18:34 -0800614 if(countTrailBytes == 2) {
615 uint8_t t1 = *source, t2;
616 if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
617 args->source = (const char *)(source + 1);
618 return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000619 }
Jungshik Shinb3189662017-11-07 11:18:34 -0800620 } else if(countTrailBytes == 1) {
621 uint8_t t1 = *source;
622 if(U8_IS_TRAIL(t1)) {
623 args->source = (const char *)(source + 1);
624 return (ch + t1) - offsetsFromUTF8[2];
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000625 }
Jungshik Shinb3189662017-11-07 11:18:34 -0800626 } else { // countTrailBytes == 3
627 uint8_t t1 = *source, t2, t3;
628 if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
629 U8_IS_TRAIL(t3 = *++source)) {
630 args->source = (const char *)(source + 1);
631 return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000632 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000633 }
Jungshik Shinb3189662017-11-07 11:18:34 -0800634 args->source = (const char *)source;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000635
636 for(i = 0; sourceInitial < source; ++i) {
637 cnv->toUBytes[i] = *sourceInitial++;
638 }
639 cnv->toULength = i;
640 *err = U_ILLEGAL_CHAR_FOUND;
641 return 0xffff;
642}
Jungshik Shin87232d82017-05-13 21:10:13 -0700643U_CDECL_END
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000644
645/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
646
Jungshik Shin87232d82017-05-13 21:10:13 -0700647U_CDECL_BEGIN
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000648/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
Jungshik Shin87232d82017-05-13 21:10:13 -0700649static void U_CALLCONV
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000650ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
651 UConverterToUnicodeArgs *pToUArgs,
652 UErrorCode *pErrorCode) {
653 UConverter *utf8;
654 const uint8_t *source, *sourceLimit;
655 uint8_t *target;
656 int32_t targetCapacity;
657 int32_t count;
658
659 int8_t oldToULength, toULength, toULimit;
660
661 UChar32 c;
662 uint8_t b, t1, t2;
663
664 /* set up the local pointers */
665 utf8=pToUArgs->converter;
666 source=(uint8_t *)pToUArgs->source;
667 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
668 target=(uint8_t *)pFromUArgs->target;
669 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
670
671 /* get the converter state from the UTF-8 UConverter */
672 c=(UChar32)utf8->toUnicodeStatus;
673 if(c!=0) {
674 toULength=oldToULength=utf8->toULength;
675 toULimit=(int8_t)utf8->mode;
676 } else {
677 toULength=oldToULength=toULimit=0;
678 }
679
680 count=(int32_t)(sourceLimit-source)+oldToULength;
681 if(count<toULimit) {
682 /*
683 * Not enough input to complete the partial character.
684 * Jump to moreBytes below - it will not output to target.
685 */
686 } else if(targetCapacity<toULimit) {
687 /*
688 * Not enough target capacity to output the partial character.
689 * Let the standard converter handle this.
690 */
691 *pErrorCode=U_USING_DEFAULT_WARNING;
692 return;
693 } else {
Jungshik Shinb3189662017-11-07 11:18:34 -0800694 // Use a single counter for source and target, counting the minimum of
695 // the source length and the target capacity.
696 // Let the standard converter handle edge cases.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000697 if(count>targetCapacity) {
698 count=targetCapacity;
699 }
700
Jungshik Shinb3189662017-11-07 11:18:34 -0800701 // The conversion loop checks count>0 only once per 1/2/3-byte character.
702 // If the buffer ends with a truncated 2- or 3-byte sequence,
703 // then we reduce the count to stop before that,
704 // and collect the remaining bytes after the conversion loop.
705 {
706 // Do not go back into the bytes that will be read for finishing a partial
707 // sequence from the previous buffer.
708 int32_t length=count-toULimit;
709 if(length>0) {
710 uint8_t b1=*(sourceLimit-1);
711 if(U8_IS_SINGLE(b1)) {
712 // common ASCII character
713 } else if(U8_IS_TRAIL(b1) && length>=2) {
714 uint8_t b2=*(sourceLimit-2);
715 if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
716 // truncated 3-byte sequence
717 count-=2;
718 }
719 } else if(0xc2<=b1 && b1<0xf0) {
720 // truncated 2- or 3-byte sequence
721 --count;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000722 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000723 }
724 }
725 }
726
727 if(c!=0) {
728 utf8->toUnicodeStatus=0;
729 utf8->toULength=0;
730 goto moreBytes;
731 /* See note in ucnv_SBCSFromUTF8() about this goto. */
732 }
733
734 /* conversion loop */
735 while(count>0) {
736 b=*source++;
Jungshik Shinb3189662017-11-07 11:18:34 -0800737 if(U8_IS_SINGLE(b)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000738 /* convert ASCII */
739 *target++=b;
740 --count;
741 continue;
742 } else {
Jungshik Shinb3189662017-11-07 11:18:34 -0800743 if(b>=0xe0) {
744 if( /* handle U+0800..U+FFFF inline */
745 b<0xf0 &&
746 U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
747 U8_IS_TRAIL(t2=source[1])
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000748 ) {
749 source+=2;
750 *target++=b;
751 *target++=t1;
752 *target++=t2;
753 count-=3;
754 continue;
755 }
Jungshik Shinb3189662017-11-07 11:18:34 -0800756 } else {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000757 if( /* handle U+0080..U+07FF inline */
758 b>=0xc2 &&
Jungshik Shinb3189662017-11-07 11:18:34 -0800759 U8_IS_TRAIL(t1=*source)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000760 ) {
761 ++source;
762 *target++=b;
763 *target++=t1;
764 count-=2;
765 continue;
766 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000767 }
768
769 /* handle "complicated" and error cases, and continuing partial characters */
770 oldToULength=0;
771 toULength=1;
Jungshik Shinb3189662017-11-07 11:18:34 -0800772 toULimit=U8_COUNT_BYTES_NON_ASCII(b);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000773 c=b;
774moreBytes:
775 while(toULength<toULimit) {
776 if(source<sourceLimit) {
777 b=*source;
Jungshik Shinb3189662017-11-07 11:18:34 -0800778 if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000779 ++source;
780 ++toULength;
781 c=(c<<6)+b;
782 } else {
783 break; /* sequence too short, stop with toULength<toULimit */
784 }
785 } else {
786 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
787 source-=(toULength-oldToULength);
788 while(oldToULength<toULength) {
789 utf8->toUBytes[oldToULength++]=*source++;
790 }
791 utf8->toUnicodeStatus=c;
792 utf8->toULength=toULength;
793 utf8->mode=toULimit;
794 pToUArgs->source=(char *)source;
795 pFromUArgs->target=(char *)target;
796 return;
797 }
798 }
799
Jungshik Shinb3189662017-11-07 11:18:34 -0800800 if(toULength!=toULimit) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000801 /* error handling: illegal UTF-8 byte sequence */
802 source-=(toULength-oldToULength);
803 while(oldToULength<toULength) {
804 utf8->toUBytes[oldToULength++]=*source++;
805 }
806 utf8->toULength=toULength;
807 pToUArgs->source=(char *)source;
808 pFromUArgs->target=(char *)target;
809 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
810 return;
811 }
812
813 /* copy the legal byte sequence to the target */
814 {
815 int8_t i;
816
817 for(i=0; i<oldToULength; ++i) {
818 *target++=utf8->toUBytes[i];
819 }
820 source-=(toULength-oldToULength);
821 for(; i<toULength; ++i) {
822 *target++=*source++;
823 }
824 count-=toULength;
825 }
826 }
827 }
828
829 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
830 if(target==(const uint8_t *)pFromUArgs->targetLimit) {
831 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
832 } else {
833 b=*source;
Jungshik Shinb3189662017-11-07 11:18:34 -0800834 toULimit=U8_COUNT_BYTES(b);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000835 if(toULimit>(sourceLimit-source)) {
836 /* collect a truncated byte sequence */
837 toULength=0;
838 c=b;
839 for(;;) {
840 utf8->toUBytes[toULength++]=b;
841 if(++source==sourceLimit) {
842 /* partial byte sequence at end of source */
843 utf8->toUnicodeStatus=c;
844 utf8->toULength=toULength;
845 utf8->mode=toULimit;
846 break;
847 } else if(!U8_IS_TRAIL(b=*source)) {
848 /* lead byte in trail byte position */
849 utf8->toULength=toULength;
850 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
851 break;
852 }
853 c=(c<<6)+b;
854 }
855 } else {
856 /* partial-sequence target overflow: fall back to the pivoting implementation */
857 *pErrorCode=U_USING_DEFAULT_WARNING;
858 }
859 }
860 }
861
862 /* write back the updated pointers */
863 pToUArgs->source=(char *)source;
864 pFromUArgs->target=(char *)target;
865}
866
Jungshik Shin87232d82017-05-13 21:10:13 -0700867U_CDECL_END
868
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000869/* UTF-8 converter data ----------------------------------------------------- */
870
871static const UConverterImpl _UTF8Impl={
872 UCNV_UTF8,
873
874 NULL,
875 NULL,
876
877 NULL,
878 NULL,
879 NULL,
880
881 ucnv_toUnicode_UTF8,
882 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
883 ucnv_fromUnicode_UTF8,
884 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
885 ucnv_getNextUChar_UTF8,
886
887 NULL,
888 NULL,
889 NULL,
890 NULL,
891 ucnv_getNonSurrogateUnicodeSet,
892
893 ucnv_UTF8FromUTF8,
894 ucnv_UTF8FromUTF8
895};
896
897/* The 1208 CCSID refers to any version of Unicode of UTF-8 */
898static const UConverterStaticData _UTF8StaticData={
899 sizeof(UConverterStaticData),
900 "UTF-8",
901 1208, UCNV_IBM, UCNV_UTF8,
902 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
903 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
904 0,
905 0,
906 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
907};
908
909
Jungshik Shina05f4122015-06-09 15:33:54 -0700910const UConverterSharedData _UTF8Data=
911 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000912
913/* CESU-8 converter data ---------------------------------------------------- */
914
915static const UConverterImpl _CESU8Impl={
916 UCNV_CESU8,
917
918 NULL,
919 NULL,
920
921 NULL,
922 NULL,
923 NULL,
924
925 ucnv_toUnicode_UTF8,
926 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
927 ucnv_fromUnicode_UTF8,
928 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
929 NULL,
930
931 NULL,
932 NULL,
933 NULL,
934 NULL,
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700935 ucnv_getCompleteUnicodeSet,
936
937 NULL,
938 NULL
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000939};
940
941static const UConverterStaticData _CESU8StaticData={
942 sizeof(UConverterStaticData),
943 "CESU-8",
944 9400, /* CCSID for CESU-8 */
945 UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
946 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
947 0,
948 0,
949 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
950};
951
952
Jungshik Shina05f4122015-06-09 15:33:54 -0700953const UConverterSharedData _CESU8Data=
954 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000955
956#endif