/****************************************************************************
**
** Copyright (C) 2017 Intel Corporation
**
** Permission is hereby granted, free of charge, to any person obtaining a copy
** of this software and associated documentation files (the "Software"), to deal
** in the Software without restriction, including without limitation the rights
** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
** copies of the Software, and to permit persons to whom the Software is
** furnished to do so, subject to the following conditions:
**
** The above copyright notice and this permission notice shall be included in
** all copies or substantial portions of the Software.
**
** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
** THE SOFTWARE.
**
****************************************************************************/
24
25#include "compilersupport_p.h"
26
27#include <stdint.h>
28
/*
 * Decodes a single UTF-8 encoded code point from the byte range
 * [*buffer, end).
 *
 * buffer  In/out read cursor. It is advanced past every byte this function
 *         reads, on success and on failure alike: after a rejected lead byte
 *         or a truncated sequence it points just past the lead byte; after a
 *         malformed continuation byte it points just past that byte.
 * end     One past the last readable byte.
 *
 * Returns the decoded code point, or ~0U when the range is empty, the lead
 * byte is not a valid UTF-8 lead byte, the sequence is truncated by `end`,
 * a continuation byte is malformed, or the decoded value is an overlong
 * encoding, a surrogate (U+D800..U+DFFF) or above U+10FFFF.
 */
static inline uint32_t get_utf8(const uint8_t **buffer, const uint8_t *end)
{
    uint32_t uc;
    /* Bytes available, measured BEFORE the lead byte is consumed. */
    ptrdiff_t n = end - *buffer;
    if (n <= 0)
        return ~0U;

    uc = *(*buffer)++;
    if (uc < 0x80) {
        /* single-byte UTF-8 */
        return uc;
    }

    /* multi-byte UTF-8, decode it */
    int charsNeeded;    /* total sequence length, lead byte included */
    uint32_t min_uc;    /* smallest value that legitimately needs this length */
    if (uc <= 0xC1) {
        /* 0x80..0xBF is a stray continuation byte; 0xC0 and 0xC1 could only
         * introduce an overlong two-byte encoding */
        return ~0U;
    } else if (uc < 0xE0) {
        /* two-byte UTF-8 */
        charsNeeded = 2;
        min_uc = 0x80;
        uc &= 0x1f;
    } else if (uc < 0xF0) {
        /* three-byte UTF-8 */
        charsNeeded = 3;
        min_uc = 0x800;
        uc &= 0x0f;
    } else if (uc < 0xF5) {
        /* four-byte UTF-8 */
        charsNeeded = 4;
        min_uc = 0x10000;
        uc &= 0x07;
    } else {
        return ~0U;
    }

    /* n still counts the lead byte, so the whole sequence must fit in n.
     * Comparing against charsNeeded - 1 would let the last continuation
     * byte be read from *end, one past the buffer. */
    if (n < charsNeeded)
        return ~0U;

    /* fold in the continuation bytes, each of the form 10xxxxxx */
    for (int i = 1; i < charsNeeded; i++) {
        uint8_t b = *(*buffer)++;
        if ((b & 0xc0) != 0x80)
            return ~0U;
        uc = (uc << 6) | (b & 0x3f);
    }

    /* overlong sequence? surrogate pair? out of range? */
    if (uc < min_uc || uc - 0xd800U < 2048U || uc > 0x10ffff)
        return ~0U;

    return uc;
}
99}