// Copyright 2020 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

14
pub struct decoder? implements base.token_decoder(
	// One enabled/disabled flag per quirk. set_quirk_enabled! stores the flag
	// for a given quirk at index (quirk - QUIRKS_BASE).
	quirks : array[QUIRKS_COUNT] base.bool,

	// NOTE(review): presumably maintained by decode_leading? (leading ASCII
	// Record Separator / Unicode Byte Order Mark handling) - confirm against
	// that function, which is not part of this view.
	allow_leading_ars  : base.bool,
	allow_leading_ubom : base.bool,

	// When true, decode_tokens? returns base."@end of data" straight away.
	end_of_data : base.bool,

	trailer_stop : base.u8,

	// comment_type is updated as a side-effect of calling decode_comment?:
	//  - 0 means no comment.
	//  - 1 means a block comment.
	//  - 2 means a line comment.
	comment_type : base.u8,

	util : base.utility,
)(
	// stack is, conceptually, an array of bits that is stored as an array of
	// u32. Bit N (where N is the recursion depth) being 0 or 1 records
	// whether, at that depth, we are inside an array or an object.
	//
	// JSON parsing is inherently recursive: containers (arrays and objects)
	// may hold further containers. Whenever a child element is finished, the
	// parser must recall 1 bit of state per recursion depth, namely whether
	// the parent container was an array or an object. That matters when
	// resuming the parent's elements, as `, "key": value` is valid only
	// inside objects.
	//
	// The stack and the depth are tracked explicitly here. The call stack is
	// not used to hold this state, and the decoder.decode_tokens function is
	// not recursive per se.
	//
	// Wuffs code cannot dynamically allocate memory, so the maximum depth is
	// fixed at compile time. Here, that maximum is 1024 (so stack is 1024
	// bits or 128 bytes), also known as DECODER_DEPTH_MAX_INCL.
	//
	// The [JSON spec](https://www.ietf.org/rfc/rfc8259.txt) clearly states,
	// "an implementation may set limits on the maximum depth of nesting".
	//
	// For comparison, as of February 2020, the maximum recursion depth of
	// the Chromium web browser's JSON parser is 200:
	// https://source.chromium.org/chromium/chromium/src/+/3dece34cde622faa0daac07156c25d92c9897d1e:base/json/json_common.h;l=18
	//
	// Empirically determined maximum depths for other languages and
	// libraries are listed at
	// https://github.com/lovasoa/bad_json_parsers#results
	stack : array[1024 / 32] base.u32,
)
63
// set_quirk_enabled! records whether the given quirk is enabled.
//
// args.quirk is rebased by QUIRKS_BASE to form an index into this.quirks. A
// quirk value below QUIRKS_BASE, or one whose rebased index is not less than
// QUIRKS_COUNT, is silently ignored: no state changes and nothing is
// reported to the caller.
pub func decoder.set_quirk_enabled!(quirk: base.u32, enabled: base.bool) {
	var index : base.u32

	if args.quirk >= QUIRKS_BASE {
		// The enclosing condition guarantees that this cannot underflow.
		index = args.quirk - QUIRKS_BASE
		if index < QUIRKS_COUNT {
			this.quirks[index] = args.enabled
		}
	}
}
72
// workbuf_len returns the range produced by this.util.empty_range_ii_u64().
//
// NOTE(review): presumably this means the decoder needs no work buffer -
// confirm against the base.utility and base.token_decoder documentation.
pub func decoder.workbuf_len() base.range_ii_u64 {
	return this.util.empty_range_ii_u64()
}
76
77pub func decoder.decode_tokens?(dst: base.token_writer, src: base.io_reader, workbuf: slice base.u8) {
Nigel Tao88a449e2020-04-04 11:41:09 +110078 // This is a very, very long function, and it is tempting to refactor it.
79 // Be careful of performance impacts when doing so. For example, commit
80 // 86d3b89f "Factor out json.decoder.decode_string" pulled out a 500 line
81 // decode_string function, which was certainly cleaner structurally, but
82 // also regressed performance by 1.1x to 1.2x. For details, see
83 // https://github.com/google/wuffs/commit/86d3b89f9a6578d964a4b6d71e21dfc9bb702b44
84
Nigel Taod1c928a2020-02-28 12:43:53 +110085 var vminor : base.u32[..= 0xFF_FFFF]
Nigel Tao84bb3af2020-07-07 23:29:30 +100086 var number_length : base.u32[..= 0x3FF]
Nigel Tao9d35cf02020-02-19 21:36:23 +110087 var number_status : base.u32[..= 0x3]
Nigel Tao695a72a2020-04-04 11:22:48 +110088 var string_length : base.u32[..= 0xFFFB]
Nigel Taoea91e5a2020-02-13 12:52:53 +110089 var whitespace_length : base.u32[..= 0xFFFE]
90 var depth : base.u32[..= 1024]
91 var stack_byte : base.u32[..= (1024 / 32) - 1]
92 var stack_bit : base.u32[..= 31]
Nigel Taoc92c0f52020-02-21 11:16:27 +110093 var match : base.u32[..= 2]
Nigel Tao695a72a2020-04-04 11:22:48 +110094 var c4 : base.u32
Nigel Taoea91e5a2020-02-13 12:52:53 +110095 var c : base.u8
Nigel Tao695a72a2020-04-04 11:22:48 +110096 var backslash : base.u8
97 var char : base.u8
Nigel Tao3b17d5b2020-02-26 12:53:10 +110098 var class : base.u8[..= 0x0F]
Nigel Tao695a72a2020-04-04 11:22:48 +110099 var multi_byte_utf8 : base.u32
100
Nigel Tao695a72a2020-04-04 11:22:48 +1100101 var backslash_x_ok : base.u8
Nigel Tao1adbc952020-08-06 23:28:07 +1000102 var backslash_x_value : base.u8
Nigel Tao695a72a2020-04-04 11:22:48 +1100103 var backslash_x_string : base.u32
104
105 var uni4_ok : base.u8
106 var uni4_string : base.u64
107 var uni4_value : base.u32[..= 0xFFFF]
108 var uni4_high_surrogate : base.u32[..= 0x10_FC00]
109
110 var uni8_ok : base.u8
111 var uni8_string : base.u64
112 var uni8_value : base.u32[..= 0xFFFF_FFFF]
Nigel Tao83e7da02020-03-19 23:20:45 +1100113
Nigel Taod1a4abe2020-02-22 09:03:07 +1100114 // expect is a bitmask of what the next character class can be.
Nigel Taoea91e5a2020-02-13 12:52:53 +1100115 //
Nigel Taod1a4abe2020-02-22 09:03:07 +1100116 // expect_after_value is what to expect after seeing a value (a literal,
117 // number, string, array or object). For depth 0, this is ignored.
118 // Otherwise, it should be (EXPECT_CLOSE_FOO | EXPECT_COMMA), for some
119 // value of FOO.
120 var expect : base.u32
121 var expect_after_value : base.u32
Nigel Taoea91e5a2020-02-13 12:52:53 +1100122
Nigel Taoecd32852020-04-03 12:35:18 +1100123 if this.end_of_data {
Nigel Tao791437b2020-03-17 14:14:16 +1100124 return base."@end of data"
Nigel Taoecd32852020-04-03 12:35:18 +1100125 }
Nigel Tao791437b2020-03-17 14:14:16 +1100126
Nigel Taocd4cbc92020-09-22 22:22:15 +1000127 if this.quirks[QUIRK_EXPECT_TRAILING_NEW_LINE_OR_EOF - QUIRKS_BASE] {
128 if this.quirks[QUIRK_ALLOW_COMMENT_BLOCK - QUIRKS_BASE] or
129 this.quirks[QUIRK_ALLOW_COMMENT_LINE - QUIRKS_BASE] or
130 this.quirks[QUIRK_ALLOW_TRAILING_FILLER - QUIRKS_BASE] {
131 return "#bad quirk combination"
132 }
133 }
134
Nigel Taoe39f3cb2020-04-14 23:03:18 +1000135 if this.quirks[QUIRK_ALLOW_LEADING_ASCII_RECORD_SEPARATOR - QUIRKS_BASE] or
136 this.quirks[QUIRK_ALLOW_LEADING_UNICODE_BYTE_ORDER_MARK - QUIRKS_BASE] {
Nigel Tao3a75d972020-03-17 22:04:56 +1100137 this.decode_leading?(dst: args.dst, src: args.src)
138 }
139
Nigel Tao66b0a122020-06-09 23:48:54 +1000140 expect = EXPECT_VALUE
Nigel Taoea91e5a2020-02-13 12:52:53 +1100141
142 while.outer true {
Nigel Tao3fe3f842020-04-02 21:56:53 +1100143 while.goto_parsed_a_leaf_value true {{
Nigel Taobe8542e2020-08-13 23:26:08 +1000144 if args.dst.length() <= 0 {
Nigel Tao3fe3f842020-04-02 21:56:53 +1100145 yield? base."$short write"
146 continue.outer
147 }
Nigel Taoea91e5a2020-02-13 12:52:53 +1100148
Nigel Tao3fe3f842020-04-02 21:56:53 +1100149 // Consume whitespace.
150 whitespace_length = 0
151 c = 0
152 class = 0
153 while.ws true,
Nigel Taobe8542e2020-08-13 23:26:08 +1000154 inv args.dst.length() > 0,
155 post args.src.length() > 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +1100156 {
Nigel Taobe8542e2020-08-13 23:26:08 +1000157 if args.src.length() <= 0 {
Nigel Tao3fe3f842020-04-02 21:56:53 +1100158 if whitespace_length > 0 {
Nigel Tao3456f392020-04-02 12:26:57 +1100159 args.dst.write_simple_token_fast!(
Nigel Taod1c928a2020-02-28 12:43:53 +1100160 value_major: 0,
161 value_minor: 0,
Nigel Tao496e88b2020-04-09 22:10:08 +1000162 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +1100163 length: whitespace_length)
Nigel Tao93b09672020-02-21 11:42:33 +1100164 whitespace_length = 0
Nigel Tao93b09672020-02-21 11:42:33 +1100165 }
Nigel Tao3fe3f842020-04-02 21:56:53 +1100166 if args.src.is_closed() {
167 return "#bad input"
168 }
169 yield? base."$short read"
Nigel Tao3fe3f842020-04-02 21:56:53 +1100170 continue.outer
171 }
Nigel Tao93b09672020-02-21 11:42:33 +1100172
Nigel Tao3fe3f842020-04-02 21:56:53 +1100173 c = args.src.peek_u8()
Nigel Tao74871342020-04-13 15:52:36 +1000174 class = LUT_CLASSES[c]
Nigel Tao80071732020-04-13 16:06:16 +1000175 if class <> CLASS_WHITESPACE {
Nigel Tao3fe3f842020-04-02 21:56:53 +1100176 break.ws
177 }
Nigel Tao8b70ad02020-05-27 23:28:44 +1000178 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao3fe3f842020-04-02 21:56:53 +1100179
180 if whitespace_length >= 0xFFFE {
Nigel Tao3456f392020-04-02 12:26:57 +1100181 args.dst.write_simple_token_fast!(
Nigel Taod1c928a2020-02-28 12:43:53 +1100182 value_major: 0,
183 value_minor: 0,
Nigel Tao496e88b2020-04-09 22:10:08 +1000184 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +1100185 length: 0xFFFF)
Nigel Tao93b09672020-02-21 11:42:33 +1100186 whitespace_length = 0
Nigel Tao3fe3f842020-04-02 21:56:53 +1100187 continue.outer
188 }
189 whitespace_length += 1
190 } endwhile.ws
191
192 // Emit whitespace.
193 if whitespace_length > 0 {
194 args.dst.write_simple_token_fast!(
195 value_major: 0,
196 value_minor: 0,
Nigel Tao496e88b2020-04-09 22:10:08 +1000197 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +1100198 length: whitespace_length)
199 whitespace_length = 0
Nigel Taobe8542e2020-08-13 23:26:08 +1000200 if args.dst.length() <= 0 {
Nigel Tao3fe3f842020-04-02 21:56:53 +1100201 continue.outer
202 }
203 }
204
205 // Check expected character classes.
206 if 0 == (expect & ((1 as base.u32) << class)) {
207 return "#bad input"
208 }
209
210 // These assertions are redundant (the Wuffs compiler should already
211 // know these facts; deleting these assertions should still compile)
212 // but are listed explicitly to guard against future edits to the code
213 // above inadvertently invalidating these assertions.
Nigel Taobe8542e2020-08-13 23:26:08 +1000214 assert args.dst.length() > 0
215 assert args.src.length() > 0
Nigel Tao3fe3f842020-04-02 21:56:53 +1100216
Nigel Tao80071732020-04-13 16:06:16 +1000217 if class == CLASS_STRING {
Nigel Tao3fe3f842020-04-02 21:56:53 +1100218 // -------- BEGIN parse strings.
219 // Emit the leading '"'.
220 args.dst.write_simple_token_fast!(
221 value_major: 0,
Nigel Tao6c138772020-08-30 16:15:35 +1000222 value_minor: (base.TOKEN__VBC__STRING << 21) |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000223 base.TOKEN__VBD__STRING__DEFINITELY_UTF_8 |
Nigel Tao6c138772020-08-30 16:15:35 +1000224 base.TOKEN__VBD__STRING__CHAIN_MUST_BE_UTF_8 |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000225 base.TOKEN__VBD__STRING__DEFINITELY_ASCII |
226 base.TOKEN__VBD__STRING__CONVERT_0_DST_1_SRC_DROP,
Nigel Tao496e88b2020-04-09 22:10:08 +1000227 continued: 1,
Nigel Tao3fe3f842020-04-02 21:56:53 +1100228 length: 1)
Nigel Tao8b70ad02020-05-27 23:28:44 +1000229 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao3fe3f842020-04-02 21:56:53 +1100230
Nigel Tao695a72a2020-04-04 11:22:48 +1100231 while.string_loop_outer true {
Nigel Taobe8542e2020-08-13 23:26:08 +1000232 if args.dst.length() <= 0 {
Nigel Tao695a72a2020-04-04 11:22:48 +1100233 yield? base."$short write"
234 continue.string_loop_outer
235 }
236
237 string_length = 0
238 while.string_loop_inner true,
Nigel Taobe8542e2020-08-13 23:26:08 +1000239 pre args.dst.length() > 0,
Nigel Tao695a72a2020-04-04 11:22:48 +1100240 {
Nigel Taobe8542e2020-08-13 23:26:08 +1000241 if args.src.length() <= 0 {
Nigel Tao695a72a2020-04-04 11:22:48 +1100242 if string_length > 0 {
243 args.dst.write_simple_token_fast!(
244 value_major: 0,
Nigel Tao6c138772020-08-30 16:15:35 +1000245 value_minor: (base.TOKEN__VBC__STRING << 21) |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000246 base.TOKEN__VBD__STRING__DEFINITELY_UTF_8 |
Nigel Tao6c138772020-08-30 16:15:35 +1000247 base.TOKEN__VBD__STRING__CHAIN_MUST_BE_UTF_8 |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000248 base.TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY,
Nigel Tao496e88b2020-04-09 22:10:08 +1000249 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100250 length: string_length)
251 string_length = 0
252 }
253 if args.src.is_closed() {
254 return "#bad input"
255 }
256 yield? base."$short read"
Nigel Tao695a72a2020-04-04 11:22:48 +1100257 continue.string_loop_outer
258 }
259
260 // As an optimization, consume non-special ASCII 4 bytes at
261 // a time.
Nigel Taobe8542e2020-08-13 23:26:08 +1000262 while args.src.length() > 4,
263 inv args.dst.length() > 0,
264 inv args.src.length() > 0,
Nigel Tao695a72a2020-04-04 11:22:48 +1100265 {
266 c4 = args.src.peek_u32le()
Nigel Tao74871342020-04-13 15:52:36 +1000267 if 0x00 <> (LUT_CHARS[0xFF & (c4 >> 0)] |
268 LUT_CHARS[0xFF & (c4 >> 8)] |
269 LUT_CHARS[0xFF & (c4 >> 16)] |
270 LUT_CHARS[0xFF & (c4 >> 24)]) {
Nigel Tao695a72a2020-04-04 11:22:48 +1100271 break
272 }
Nigel Tao8b70ad02020-05-27 23:28:44 +1000273 args.src.skip_u32_fast!(actual: 4, worst_case: 4)
Nigel Tao695a72a2020-04-04 11:22:48 +1100274 if string_length > (0xFFFB - 4) {
275 args.dst.write_simple_token_fast!(
276 value_major: 0,
Nigel Tao6c138772020-08-30 16:15:35 +1000277 value_minor: (base.TOKEN__VBC__STRING << 21) |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000278 base.TOKEN__VBD__STRING__DEFINITELY_UTF_8 |
Nigel Tao6c138772020-08-30 16:15:35 +1000279 base.TOKEN__VBD__STRING__CHAIN_MUST_BE_UTF_8 |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000280 base.TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY,
Nigel Tao496e88b2020-04-09 22:10:08 +1000281 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100282 length: string_length + 4)
283 string_length = 0
284 continue.string_loop_outer
285 }
286 string_length += 4
287 } endwhile
288
289 c = args.src.peek_u8()
Nigel Tao74871342020-04-13 15:52:36 +1000290 char = LUT_CHARS[c]
Nigel Tao695a72a2020-04-04 11:22:48 +1100291
292 if char == 0x00 { // Non-special ASCII.
Nigel Tao8b70ad02020-05-27 23:28:44 +1000293 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao695a72a2020-04-04 11:22:48 +1100294 if string_length >= 0xFFFB {
295 args.dst.write_simple_token_fast!(
296 value_major: 0,
Nigel Tao6c138772020-08-30 16:15:35 +1000297 value_minor: (base.TOKEN__VBC__STRING << 21) |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000298 base.TOKEN__VBD__STRING__DEFINITELY_UTF_8 |
Nigel Tao6c138772020-08-30 16:15:35 +1000299 base.TOKEN__VBD__STRING__CHAIN_MUST_BE_UTF_8 |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000300 base.TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY,
Nigel Tao496e88b2020-04-09 22:10:08 +1000301 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100302 length: 0xFFFC)
303 string_length = 0
304 continue.string_loop_outer
305 }
306 string_length += 1
307 continue.string_loop_inner
308
309 } else if char == 0x01 { // '"'
310 if string_length <> 0 {
311 args.dst.write_simple_token_fast!(
312 value_major: 0,
Nigel Tao6c138772020-08-30 16:15:35 +1000313 value_minor: (base.TOKEN__VBC__STRING << 21) |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000314 base.TOKEN__VBD__STRING__DEFINITELY_UTF_8 |
Nigel Tao6c138772020-08-30 16:15:35 +1000315 base.TOKEN__VBD__STRING__CHAIN_MUST_BE_UTF_8 |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000316 base.TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY,
Nigel Tao496e88b2020-04-09 22:10:08 +1000317 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100318 length: string_length)
319 string_length = 0
320 }
321 break.string_loop_outer
322
323 } else if char == 0x02 { // '\\'.
324 if string_length > 0 {
325 args.dst.write_simple_token_fast!(
326 value_major: 0,
Nigel Tao6c138772020-08-30 16:15:35 +1000327 value_minor: (base.TOKEN__VBC__STRING << 21) |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000328 base.TOKEN__VBD__STRING__DEFINITELY_UTF_8 |
Nigel Tao6c138772020-08-30 16:15:35 +1000329 base.TOKEN__VBD__STRING__CHAIN_MUST_BE_UTF_8 |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000330 base.TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY,
Nigel Tao496e88b2020-04-09 22:10:08 +1000331 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100332 length: string_length)
333 string_length = 0
Nigel Taobe8542e2020-08-13 23:26:08 +1000334 if args.dst.length() <= 0 {
Nigel Tao695a72a2020-04-04 11:22:48 +1100335 continue.string_loop_outer
336 }
337 }
Nigel Taobe8542e2020-08-13 23:26:08 +1000338 assert args.dst.length() > 0
Nigel Tao695a72a2020-04-04 11:22:48 +1100339
Nigel Taobe8542e2020-08-13 23:26:08 +1000340 if args.src.length() < 2 {
Nigel Tao695a72a2020-04-04 11:22:48 +1100341 if args.src.is_closed() {
342 return "#bad backslash-escape"
343 }
344 yield? base."$short read"
Nigel Tao695a72a2020-04-04 11:22:48 +1100345 continue.string_loop_outer
346 }
347 c = (args.src.peek_u16le() >> 8) as base.u8
Nigel Tao74871342020-04-13 15:52:36 +1000348 backslash = LUT_BACKSLASHES[c]
Nigel Tao695a72a2020-04-04 11:22:48 +1100349 if (backslash & 0x80) <> 0 {
Nigel Tao8b70ad02020-05-27 23:28:44 +1000350 args.src.skip_u32_fast!(actual: 2, worst_case: 2)
Nigel Tao695a72a2020-04-04 11:22:48 +1100351 args.dst.write_simple_token_fast!(
352 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +1000353 value_minor: (base.TOKEN__VBC__UNICODE_CODE_POINT << 21) |
354 ((backslash & 0x7F) as base.u32),
Nigel Tao496e88b2020-04-09 22:10:08 +1000355 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100356 length: 2)
357 continue.string_loop_outer
358
359 } else if backslash <> 0 {
Nigel Taoe39f3cb2020-04-14 23:03:18 +1000360 if this.quirks[LUT_QUIRKY_BACKSLASHES_QUIRKS[backslash & 7]] {
Nigel Tao8b70ad02020-05-27 23:28:44 +1000361 args.src.skip_u32_fast!(actual: 2, worst_case: 2)
Nigel Tao695a72a2020-04-04 11:22:48 +1100362 args.dst.write_simple_token_fast!(
363 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +1000364 value_minor: (base.TOKEN__VBC__UNICODE_CODE_POINT << 21) |
365 (LUT_QUIRKY_BACKSLASHES_CHARS[backslash & 7] as base.u32),
Nigel Tao496e88b2020-04-09 22:10:08 +1000366 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100367 length: 2)
368 continue.string_loop_outer
369 }
370
371 } else if c == 'u' {
372 // -------- BEGIN backslash-u.
Nigel Taobe8542e2020-08-13 23:26:08 +1000373 if args.src.length() < 6 {
Nigel Tao695a72a2020-04-04 11:22:48 +1100374 if args.src.is_closed() {
375 return "#bad backslash-escape"
376 }
377 yield? base."$short read"
Nigel Tao695a72a2020-04-04 11:22:48 +1100378 continue.string_loop_outer
379 }
380
381 uni4_string = args.src.peek_u48le_as_u64() >> 16
382 uni4_value = 0
383 uni4_ok = 0x80
384
Nigel Tao74871342020-04-13 15:52:36 +1000385 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni4_string >> 0)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100386 uni4_ok &= c
387 uni4_value |= ((c & 0x0F) as base.u32) << 12
Nigel Tao74871342020-04-13 15:52:36 +1000388 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni4_string >> 8)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100389 uni4_ok &= c
390 uni4_value |= ((c & 0x0F) as base.u32) << 8
Nigel Tao74871342020-04-13 15:52:36 +1000391 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni4_string >> 16)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100392 uni4_ok &= c
393 uni4_value |= ((c & 0x0F) as base.u32) << 4
Nigel Tao74871342020-04-13 15:52:36 +1000394 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni4_string >> 24)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100395 uni4_ok &= c
396 uni4_value |= ((c & 0x0F) as base.u32) << 0
397
398 if uni4_ok == 0 {
399 // It wasn't 4 hexadecimal digits. No-op (and
400 // fall through to "#bad backslash-escape").
401
402 } else if (uni4_value < 0xD800) or (0xDFFF < uni4_value) {
403 // Not a Unicode surrogate. We're good.
Nigel Tao8b70ad02020-05-27 23:28:44 +1000404 args.src.skip_u32_fast!(actual: 6, worst_case: 6)
Nigel Tao695a72a2020-04-04 11:22:48 +1100405 args.dst.write_simple_token_fast!(
406 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +1000407 value_minor: (base.TOKEN__VBC__UNICODE_CODE_POINT << 21) |
408 uni4_value,
Nigel Tao496e88b2020-04-09 22:10:08 +1000409 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100410 length: 6)
411 continue.string_loop_outer
412
413 } else if uni4_value >= 0xDC00 {
414 // Low surrogate. No-op (and fall through to
415 // "#bad backslash-escape").
416
417 } else {
418 // High surrogate, which needs to be followed
419 // by a "\\u1234" low surrogate. We've already
420 // peeked 6 bytes for the high surrogate. We
421 // need 12 in total: another 8 bytes at an
422 // offset of 4.
Nigel Taobe8542e2020-08-13 23:26:08 +1000423 if args.src.length() < 12 {
Nigel Tao695a72a2020-04-04 11:22:48 +1100424 if args.src.is_closed() {
Nigel Taoe39f3cb2020-04-14 23:03:18 +1000425 if this.quirks[QUIRK_REPLACE_INVALID_UNICODE - QUIRKS_BASE] {
Nigel Tao8b70ad02020-05-27 23:28:44 +1000426 args.src.skip_u32_fast!(actual: 6, worst_case: 6)
Nigel Tao695a72a2020-04-04 11:22:48 +1100427 args.dst.write_simple_token_fast!(
428 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +1000429 value_minor: (base.TOKEN__VBC__UNICODE_CODE_POINT << 21) |
430 base.UNICODE__REPLACEMENT_CHARACTER,
Nigel Tao496e88b2020-04-09 22:10:08 +1000431 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100432 length: 6)
433 continue.string_loop_outer
434 }
435 return "#bad backslash-escape"
436 }
437 yield? base."$short read"
Nigel Tao695a72a2020-04-04 11:22:48 +1100438 continue.string_loop_outer
439 }
440 uni4_string = args.src.peek_u64le_at(offset: 4) >> 16
441
442 // Look for the low surrogate's "\\u".
443 if ((0xFF & (uni4_string >> 0)) <> '\\') or
444 ((0xFF & (uni4_string >> 8)) <> 'u') {
445 uni4_high_surrogate = 0
446 uni4_value = 0
447 uni4_ok = 0
448 } else {
449 uni4_high_surrogate =
450 0x1_0000 + ((uni4_value - 0xD800) << 10)
451 uni4_value = 0
452 uni4_ok = 0x80
453 uni4_string >>= 16
454
Nigel Tao74871342020-04-13 15:52:36 +1000455 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni4_string >> 0)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100456 uni4_ok &= c
457 uni4_value |= ((c & 0x0F) as base.u32) << 12
Nigel Tao74871342020-04-13 15:52:36 +1000458 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni4_string >> 8)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100459 uni4_ok &= c
460 uni4_value |= ((c & 0x0F) as base.u32) << 8
Nigel Tao74871342020-04-13 15:52:36 +1000461 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni4_string >> 16)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100462 uni4_ok &= c
463 uni4_value |= ((c & 0x0F) as base.u32) << 4
Nigel Tao74871342020-04-13 15:52:36 +1000464 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni4_string >> 24)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100465 uni4_ok &= c
466 uni4_value |= ((c & 0x0F) as base.u32) << 0
467 }
468
469 if (uni4_ok <> 0) and
470 (0xDC00 <= uni4_value) and (uni4_value <= 0xDFFF) {
471
472 // Emit a single token for the surrogate
473 // pair.
474 uni4_value -= 0xDC00
Nigel Tao8b70ad02020-05-27 23:28:44 +1000475 args.src.skip_u32_fast!(actual: 12, worst_case: 12)
Nigel Tao695a72a2020-04-04 11:22:48 +1100476 args.dst.write_simple_token_fast!(
477 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +1000478 value_minor: (base.TOKEN__VBC__UNICODE_CODE_POINT << 21) |
479 uni4_high_surrogate | uni4_value,
Nigel Tao496e88b2020-04-09 22:10:08 +1000480 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100481 length: 12)
482 continue.string_loop_outer
483 }
484 }
485
Nigel Taoe39f3cb2020-04-14 23:03:18 +1000486 if this.quirks[QUIRK_REPLACE_INVALID_UNICODE - QUIRKS_BASE] {
Nigel Taobe8542e2020-08-13 23:26:08 +1000487 if args.src.length() < 6 {
Nigel Tao695a72a2020-04-04 11:22:48 +1100488 return "#internal error: inconsistent I/O"
489 }
Nigel Tao8b70ad02020-05-27 23:28:44 +1000490 args.src.skip_u32_fast!(actual: 6, worst_case: 6)
Nigel Tao695a72a2020-04-04 11:22:48 +1100491 args.dst.write_simple_token_fast!(
492 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +1000493 value_minor: (base.TOKEN__VBC__UNICODE_CODE_POINT << 21) |
494 base.UNICODE__REPLACEMENT_CHARACTER,
Nigel Tao496e88b2020-04-09 22:10:08 +1000495 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100496 length: 6)
497 continue.string_loop_outer
498 }
499 // -------- END backslash-u.
500
501 } else if (c == 'U') and
Nigel Taoe39f3cb2020-04-14 23:03:18 +1000502 this.quirks[QUIRK_ALLOW_BACKSLASH_CAPITAL_U - QUIRKS_BASE] {
Nigel Tao695a72a2020-04-04 11:22:48 +1100503 // -------- BEGIN backslash-capital-u.
Nigel Taobe8542e2020-08-13 23:26:08 +1000504 if args.src.length() < 10 {
Nigel Tao695a72a2020-04-04 11:22:48 +1100505 if args.src.is_closed() {
506 return "#bad backslash-escape"
507 }
508 yield? base."$short read"
Nigel Tao695a72a2020-04-04 11:22:48 +1100509 continue.string_loop_outer
510 }
511 uni8_string = args.src.peek_u64le_at(offset: 2)
512 uni8_value = 0
513 uni8_ok = 0x80
514
Nigel Tao74871342020-04-13 15:52:36 +1000515 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni8_string >> 0)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100516 uni8_ok &= c
517 uni8_value |= ((c & 0x0F) as base.u32) << 28
Nigel Tao74871342020-04-13 15:52:36 +1000518 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni8_string >> 8)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100519 uni8_ok &= c
520 uni8_value |= ((c & 0x0F) as base.u32) << 24
Nigel Tao74871342020-04-13 15:52:36 +1000521 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni8_string >> 16)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100522 uni8_ok &= c
523 uni8_value |= ((c & 0x0F) as base.u32) << 20
Nigel Tao74871342020-04-13 15:52:36 +1000524 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni8_string >> 24)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100525 uni8_ok &= c
526 uni8_value |= ((c & 0x0F) as base.u32) << 16
Nigel Tao74871342020-04-13 15:52:36 +1000527 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni8_string >> 32)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100528 uni8_ok &= c
529 uni8_value |= ((c & 0x0F) as base.u32) << 12
Nigel Tao74871342020-04-13 15:52:36 +1000530 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni8_string >> 40)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100531 uni8_ok &= c
532 uni8_value |= ((c & 0x0F) as base.u32) << 8
Nigel Tao74871342020-04-13 15:52:36 +1000533 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni8_string >> 48)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100534 uni8_ok &= c
535 uni8_value |= ((c & 0x0F) as base.u32) << 4
Nigel Tao74871342020-04-13 15:52:36 +1000536 c = LUT_HEXADECIMAL_DIGITS[0xFF & (uni8_string >> 56)]
Nigel Tao695a72a2020-04-04 11:22:48 +1100537 uni8_ok &= c
538 uni8_value |= ((c & 0x0F) as base.u32) << 0
539
540 if uni8_ok == 0 {
541 // It wasn't 8 hexadecimal digits. No-op (and
542 // fall through to "#bad backslash-escape").
543
544 } else if (uni8_value < 0xD800) or (
545 (0xDFFF < uni8_value) and (uni8_value <= 0x10_FFFF)) {
546 // Not a Unicode surrogate. We're good.
Nigel Tao8b70ad02020-05-27 23:28:44 +1000547 args.src.skip_u32_fast!(actual: 10, worst_case: 10)
Nigel Tao695a72a2020-04-04 11:22:48 +1100548 args.dst.write_simple_token_fast!(
549 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +1000550 value_minor: (base.TOKEN__VBC__UNICODE_CODE_POINT << 21) |
551 (uni8_value & 0x1F_FFFF),
Nigel Tao496e88b2020-04-09 22:10:08 +1000552 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100553 length: 10)
554 continue.string_loop_outer
Nigel Taoe39f3cb2020-04-14 23:03:18 +1000555 } else if this.quirks[QUIRK_REPLACE_INVALID_UNICODE - QUIRKS_BASE] {
Nigel Tao8b70ad02020-05-27 23:28:44 +1000556 args.src.skip_u32_fast!(actual: 10, worst_case: 10)
Nigel Tao695a72a2020-04-04 11:22:48 +1100557 args.dst.write_simple_token_fast!(
558 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +1000559 value_minor: (base.TOKEN__VBC__UNICODE_CODE_POINT << 21) |
560 base.UNICODE__REPLACEMENT_CHARACTER,
Nigel Tao496e88b2020-04-09 22:10:08 +1000561 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100562 length: 10)
563 continue.string_loop_outer
564 }
565 // -------- END backslash-capital-u.
566
Nigel Tao6c138772020-08-30 16:15:35 +1000567 } else if (c == 'x') and
568 this.quirks[QUIRK_ALLOW_BACKSLASH_X_AS_CODE_POINTS - QUIRKS_BASE] {
Nigel Tao695a72a2020-04-04 11:22:48 +1100569 // -------- BEGIN backslash-x
Nigel Tao6c138772020-08-30 16:15:35 +1000570 if args.src.length() < 4 {
571 if args.src.is_closed() {
Nigel Tao1adbc952020-08-06 23:28:07 +1000572 return "#bad backslash-escape"
Nigel Tao695a72a2020-04-04 11:22:48 +1100573 }
Nigel Tao6c138772020-08-30 16:15:35 +1000574 yield? base."$short read"
Nigel Tao1adbc952020-08-06 23:28:07 +1000575 continue.string_loop_outer
Nigel Tao695a72a2020-04-04 11:22:48 +1100576 }
Nigel Tao6c138772020-08-30 16:15:35 +1000577
578 backslash_x_string = args.src.peek_u32le()
579 backslash_x_ok = 0x80
580
581 c = LUT_HEXADECIMAL_DIGITS[0xFF & (backslash_x_string >> 16)]
582 backslash_x_ok &= c
583 backslash_x_value = ((c & 0x0F) << 4) as base.u8
584 c = LUT_HEXADECIMAL_DIGITS[0xFF & (backslash_x_string >> 24)]
585 backslash_x_ok &= c
586 backslash_x_value = (backslash_x_value | (c & 0x0F)) as base.u8
587
588 if (backslash_x_ok == 0) or
589 ((backslash_x_string & 0xFFFF) <> 0x785C) {
590 // It wasn't "\\x34", for some hexadecimal
591 // digits "34".
592 return "#bad backslash-escape"
593 }
594 args.src.skip_u32_fast!(actual: 4, worst_case: 4)
595 args.dst.write_simple_token_fast!(
596 value_major: 0,
597 value_minor: (base.TOKEN__VBC__UNICODE_CODE_POINT << 21) |
598 (backslash_x_value as base.u32),
599 continued: 1,
600 length: 4)
601 continue.string_loop_outer
Nigel Tao695a72a2020-04-04 11:22:48 +1100602 // -------- END backslash-x
603 }
604
605 return "#bad backslash-escape"
606
607 } else if char == 0x03 { // 2-byte UTF-8.
Nigel Taobe8542e2020-08-13 23:26:08 +1000608 if args.src.length() < 2 {
Nigel Tao695a72a2020-04-04 11:22:48 +1100609 if string_length > 0 {
610 args.dst.write_simple_token_fast!(
611 value_major: 0,
Nigel Tao6c138772020-08-30 16:15:35 +1000612 value_minor: (base.TOKEN__VBC__STRING << 21) |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000613 base.TOKEN__VBD__STRING__DEFINITELY_UTF_8 |
Nigel Tao6c138772020-08-30 16:15:35 +1000614 base.TOKEN__VBD__STRING__CHAIN_MUST_BE_UTF_8 |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000615 base.TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY,
Nigel Tao496e88b2020-04-09 22:10:08 +1000616 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100617 length: string_length)
618 string_length = 0
Nigel Taobe8542e2020-08-13 23:26:08 +1000619 if args.dst.length() <= 0 {
Nigel Tao695a72a2020-04-04 11:22:48 +1100620 continue.string_loop_outer
621 }
622 }
623 if args.src.is_closed() {
Nigel Taoe39f3cb2020-04-14 23:03:18 +1000624 if this.quirks[QUIRK_REPLACE_INVALID_UNICODE - QUIRKS_BASE] {
Nigel Tao695a72a2020-04-04 11:22:48 +1100625 args.dst.write_simple_token_fast!(
626 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +1000627 value_minor: (base.TOKEN__VBC__UNICODE_CODE_POINT << 21) |
628 base.UNICODE__REPLACEMENT_CHARACTER,
Nigel Tao496e88b2020-04-09 22:10:08 +1000629 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100630 length: 1)
Nigel Tao8b70ad02020-05-27 23:28:44 +1000631 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao695a72a2020-04-04 11:22:48 +1100632 continue.string_loop_outer
633 }
634 return "#bad UTF-8"
635 }
636 yield? base."$short read"
Nigel Tao695a72a2020-04-04 11:22:48 +1100637 continue.string_loop_outer
638 }
639 multi_byte_utf8 = args.src.peek_u16le_as_u32()
640 if (multi_byte_utf8 & 0xC000) == 0x8000 {
641 multi_byte_utf8 = (0x00_07C0 & (multi_byte_utf8 ~mod<< 6)) |
642 (0x00_003F & (multi_byte_utf8 >> 8))
Nigel Tao8b70ad02020-05-27 23:28:44 +1000643 args.src.skip_u32_fast!(actual: 2, worst_case: 2)
Nigel Tao695a72a2020-04-04 11:22:48 +1100644 if string_length >= 0xFFF8 {
645 args.dst.write_simple_token_fast!(
646 value_major: 0,
Nigel Tao6c138772020-08-30 16:15:35 +1000647 value_minor: (base.TOKEN__VBC__STRING << 21) |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000648 base.TOKEN__VBD__STRING__DEFINITELY_UTF_8 |
Nigel Tao6c138772020-08-30 16:15:35 +1000649 base.TOKEN__VBD__STRING__CHAIN_MUST_BE_UTF_8 |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000650 base.TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY,
Nigel Tao496e88b2020-04-09 22:10:08 +1000651 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100652 length: string_length + 2)
653 string_length = 0
654 continue.string_loop_outer
655 }
656 string_length += 2
657 continue.string_loop_inner
658 }
659
660 } else if char == 0x04 { // 3-byte UTF-8.
Nigel Taobe8542e2020-08-13 23:26:08 +1000661 if args.src.length() < 3 {
Nigel Tao695a72a2020-04-04 11:22:48 +1100662 if string_length > 0 {
663 args.dst.write_simple_token_fast!(
664 value_major: 0,
Nigel Tao6c138772020-08-30 16:15:35 +1000665 value_minor: (base.TOKEN__VBC__STRING << 21) |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000666 base.TOKEN__VBD__STRING__DEFINITELY_UTF_8 |
Nigel Tao6c138772020-08-30 16:15:35 +1000667 base.TOKEN__VBD__STRING__CHAIN_MUST_BE_UTF_8 |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000668 base.TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY,
Nigel Tao496e88b2020-04-09 22:10:08 +1000669 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100670 length: string_length)
671 string_length = 0
Nigel Taobe8542e2020-08-13 23:26:08 +1000672 if args.dst.length() <= 0 {
Nigel Tao695a72a2020-04-04 11:22:48 +1100673 continue.string_loop_outer
674 }
675 }
676 if args.src.is_closed() {
Nigel Taoe39f3cb2020-04-14 23:03:18 +1000677 if this.quirks[QUIRK_REPLACE_INVALID_UNICODE - QUIRKS_BASE] {
Nigel Tao695a72a2020-04-04 11:22:48 +1100678 args.dst.write_simple_token_fast!(
679 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +1000680 value_minor: (base.TOKEN__VBC__UNICODE_CODE_POINT << 21) |
681 base.UNICODE__REPLACEMENT_CHARACTER,
Nigel Tao496e88b2020-04-09 22:10:08 +1000682 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100683 length: 1)
Nigel Tao8b70ad02020-05-27 23:28:44 +1000684 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao695a72a2020-04-04 11:22:48 +1100685 continue.string_loop_outer
686 }
687 return "#bad UTF-8"
688 }
689 yield? base."$short read"
Nigel Tao695a72a2020-04-04 11:22:48 +1100690 continue.string_loop_outer
691 }
692 multi_byte_utf8 = args.src.peek_u24le_as_u32()
693 if (multi_byte_utf8 & 0xC0_C000) == 0x80_8000 {
694 multi_byte_utf8 = (0x00_F000 & (multi_byte_utf8 ~mod<< 12)) |
695 (0x00_0FC0 & (multi_byte_utf8 >> 2)) |
696 (0x00_003F & (multi_byte_utf8 >> 16))
697 if (0x07FF < multi_byte_utf8) and
698 ((multi_byte_utf8 < 0xD800) or (0xDFFF < multi_byte_utf8)) {
699
Nigel Tao8b70ad02020-05-27 23:28:44 +1000700 args.src.skip_u32_fast!(actual: 3, worst_case: 3)
Nigel Tao695a72a2020-04-04 11:22:48 +1100701 if string_length >= 0xFFF8 {
702 args.dst.write_simple_token_fast!(
703 value_major: 0,
Nigel Tao6c138772020-08-30 16:15:35 +1000704 value_minor: (base.TOKEN__VBC__STRING << 21) |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000705 base.TOKEN__VBD__STRING__DEFINITELY_UTF_8 |
Nigel Tao6c138772020-08-30 16:15:35 +1000706 base.TOKEN__VBD__STRING__CHAIN_MUST_BE_UTF_8 |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000707 base.TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY,
Nigel Tao496e88b2020-04-09 22:10:08 +1000708 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100709 length: string_length + 3)
710 string_length = 0
711 continue.string_loop_outer
712 }
713 string_length += 3
714 continue.string_loop_inner
715 }
716 }
717
718 } else if char == 0x05 { // 4-byte UTF-8.
Nigel Taobe8542e2020-08-13 23:26:08 +1000719 if args.src.length() < 4 {
Nigel Tao695a72a2020-04-04 11:22:48 +1100720 if string_length > 0 {
721 args.dst.write_simple_token_fast!(
722 value_major: 0,
Nigel Tao6c138772020-08-30 16:15:35 +1000723 value_minor: (base.TOKEN__VBC__STRING << 21) |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000724 base.TOKEN__VBD__STRING__DEFINITELY_UTF_8 |
Nigel Tao6c138772020-08-30 16:15:35 +1000725 base.TOKEN__VBD__STRING__CHAIN_MUST_BE_UTF_8 |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000726 base.TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY,
Nigel Tao496e88b2020-04-09 22:10:08 +1000727 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100728 length: string_length)
729 string_length = 0
Nigel Taobe8542e2020-08-13 23:26:08 +1000730 if args.dst.length() <= 0 {
Nigel Tao695a72a2020-04-04 11:22:48 +1100731 continue.string_loop_outer
732 }
733 }
734 if args.src.is_closed() {
Nigel Taoe39f3cb2020-04-14 23:03:18 +1000735 if this.quirks[QUIRK_REPLACE_INVALID_UNICODE - QUIRKS_BASE] {
Nigel Tao695a72a2020-04-04 11:22:48 +1100736 args.dst.write_simple_token_fast!(
737 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +1000738 value_minor: (base.TOKEN__VBC__UNICODE_CODE_POINT << 21) |
739 base.UNICODE__REPLACEMENT_CHARACTER,
Nigel Tao496e88b2020-04-09 22:10:08 +1000740 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100741 length: 1)
Nigel Tao8b70ad02020-05-27 23:28:44 +1000742 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao695a72a2020-04-04 11:22:48 +1100743 continue.string_loop_outer
744 }
745 return "#bad UTF-8"
746 }
747 yield? base."$short read"
Nigel Tao695a72a2020-04-04 11:22:48 +1100748 continue.string_loop_outer
749 }
750 multi_byte_utf8 = args.src.peek_u32le()
751 if (multi_byte_utf8 & 0xC0C0_C000) == 0x8080_8000 {
752 multi_byte_utf8 = (0x1C_0000 & (multi_byte_utf8 ~mod<< 18)) |
753 (0x03_F000 & (multi_byte_utf8 ~mod<< 4)) |
754 (0x00_0FC0 & (multi_byte_utf8 >> 10)) |
755 (0x00_003F & (multi_byte_utf8 >> 24))
756 if (0xFFFF < multi_byte_utf8) and (multi_byte_utf8 <= 0x10_FFFF) {
Nigel Tao8b70ad02020-05-27 23:28:44 +1000757 args.src.skip_u32_fast!(actual: 4, worst_case: 4)
Nigel Tao695a72a2020-04-04 11:22:48 +1100758 if string_length >= 0xFFF8 {
759 args.dst.write_simple_token_fast!(
760 value_major: 0,
Nigel Tao6c138772020-08-30 16:15:35 +1000761 value_minor: (base.TOKEN__VBC__STRING << 21) |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000762 base.TOKEN__VBD__STRING__DEFINITELY_UTF_8 |
Nigel Tao6c138772020-08-30 16:15:35 +1000763 base.TOKEN__VBD__STRING__CHAIN_MUST_BE_UTF_8 |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000764 base.TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY,
Nigel Tao496e88b2020-04-09 22:10:08 +1000765 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100766 length: string_length + 4)
767 string_length = 0
768 continue.string_loop_outer
769 }
770 string_length += 4
771 continue.string_loop_inner
772 }
773 }
774 }
775
776 if string_length > 0 {
777 args.dst.write_simple_token_fast!(
778 value_major: 0,
Nigel Tao6c138772020-08-30 16:15:35 +1000779 value_minor: (base.TOKEN__VBC__STRING << 21) |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000780 base.TOKEN__VBD__STRING__DEFINITELY_UTF_8 |
Nigel Tao6c138772020-08-30 16:15:35 +1000781 base.TOKEN__VBD__STRING__CHAIN_MUST_BE_UTF_8 |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000782 base.TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY,
Nigel Tao496e88b2020-04-09 22:10:08 +1000783 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100784 length: string_length)
785 string_length = 0
Nigel Taobe8542e2020-08-13 23:26:08 +1000786 if args.dst.length() <= 0 {
Nigel Tao695a72a2020-04-04 11:22:48 +1100787 continue.string_loop_outer
788 }
789 }
Nigel Taod83bd8d2020-04-07 17:50:47 +1000790 if (char & 0x80) <> 0 {
Nigel Taoe39f3cb2020-04-14 23:03:18 +1000791 if this.quirks[QUIRK_ALLOW_ASCII_CONTROL_CODES - QUIRKS_BASE] {
Nigel Taod83bd8d2020-04-07 17:50:47 +1000792 args.dst.write_simple_token_fast!(
793 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +1000794 value_minor: (base.TOKEN__VBC__UNICODE_CODE_POINT << 21) |
795 ((char & 0x7F) as base.u32),
Nigel Tao496e88b2020-04-09 22:10:08 +1000796 continued: 1,
Nigel Taod83bd8d2020-04-07 17:50:47 +1000797 length: 1)
Nigel Tao8b70ad02020-05-27 23:28:44 +1000798 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Taod83bd8d2020-04-07 17:50:47 +1000799 continue.string_loop_outer
800 }
Nigel Tao6bc3f572020-08-29 23:23:15 +1000801 if char == 0x8A {
802 return "#bad new-line in a string"
803 }
Nigel Tao695a72a2020-04-04 11:22:48 +1100804 return "#bad C0 control code"
805 }
Nigel Taoe39f3cb2020-04-14 23:03:18 +1000806 if this.quirks[QUIRK_REPLACE_INVALID_UNICODE - QUIRKS_BASE] {
Nigel Tao695a72a2020-04-04 11:22:48 +1100807 args.dst.write_simple_token_fast!(
808 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +1000809 value_minor: (base.TOKEN__VBC__UNICODE_CODE_POINT << 21) |
810 base.UNICODE__REPLACEMENT_CHARACTER,
Nigel Tao496e88b2020-04-09 22:10:08 +1000811 continued: 1,
Nigel Tao695a72a2020-04-04 11:22:48 +1100812 length: 1)
Nigel Tao8b70ad02020-05-27 23:28:44 +1000813 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao695a72a2020-04-04 11:22:48 +1100814 continue.string_loop_outer
815 }
816 return "#bad UTF-8"
817 } endwhile.string_loop_inner
818 } endwhile.string_loop_outer
Nigel Taof6843712020-02-22 08:17:03 +1100819
Nigel Tao3fe3f842020-04-02 21:56:53 +1100820 // Emit the trailing '"'.
821 while true {
Nigel Taobe8542e2020-08-13 23:26:08 +1000822 if args.src.length() <= 0 {
Nigel Tao3fe3f842020-04-02 21:56:53 +1100823 if args.src.is_closed() {
Nigel Taof6843712020-02-22 08:17:03 +1100824 return "#bad input"
Nigel Taof6843712020-02-22 08:17:03 +1100825 }
Nigel Tao3fe3f842020-04-02 21:56:53 +1100826 yield? base."$short read"
827 continue
Nigel Tao01bab822020-02-23 09:03:10 +1100828 }
Nigel Taobe8542e2020-08-13 23:26:08 +1000829 if args.dst.length() <= 0 {
Nigel Tao3fe3f842020-04-02 21:56:53 +1100830 yield? base."$short write"
831 continue
Nigel Taof6843712020-02-22 08:17:03 +1100832 }
Nigel Tao8b70ad02020-05-27 23:28:44 +1000833 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao3456f392020-04-02 12:26:57 +1100834 args.dst.write_simple_token_fast!(
Nigel Taod1c928a2020-02-28 12:43:53 +1100835 value_major: 0,
Nigel Tao6c138772020-08-30 16:15:35 +1000836 value_minor: (base.TOKEN__VBC__STRING << 21) |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000837 base.TOKEN__VBD__STRING__DEFINITELY_UTF_8 |
Nigel Tao6c138772020-08-30 16:15:35 +1000838 base.TOKEN__VBD__STRING__CHAIN_MUST_BE_UTF_8 |
Nigel Tao3d31ac02020-06-09 23:34:42 +1000839 base.TOKEN__VBD__STRING__DEFINITELY_ASCII |
840 base.TOKEN__VBD__STRING__CONVERT_0_DST_1_SRC_DROP,
Nigel Tao496e88b2020-04-09 22:10:08 +1000841 continued: 0,
Nigel Taof6843712020-02-22 08:17:03 +1100842 length: 1)
Nigel Tao3fe3f842020-04-02 21:56:53 +1100843 break
844 } endwhile
Nigel Taof6843712020-02-22 08:17:03 +1100845
Nigel Tao3fe3f842020-04-02 21:56:53 +1100846 // As above, expect must have contained EXPECT_STRING. If it didn't
Nigel Tao66b0a122020-06-09 23:48:54 +1000847 // also contain EXPECT_NUMBER (excluding EXPECT_COMMENT) then we
848 // were parsing an object key and the next token should be ':'.
Nigel Tao80071732020-04-13 16:06:16 +1000849 if 0 == (expect & ((1 as base.u32) << CLASS_NUMBER)) {
Nigel Tao66b0a122020-06-09 23:48:54 +1000850 expect = EXPECT_COLON
Nigel Tao3fe3f842020-04-02 21:56:53 +1100851 continue.outer
852 }
853 break.goto_parsed_a_leaf_value
854 // -------- END parse strings.
855
Nigel Tao80071732020-04-13 16:06:16 +1000856 } else if class == CLASS_COMMA {
Nigel Tao8b70ad02020-05-27 23:28:44 +1000857 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao42843692020-08-17 16:48:30 +1000858 // The ',' is punctuation (filler).
Nigel Tao3fe3f842020-04-02 21:56:53 +1100859 args.dst.write_simple_token_fast!(
860 value_major: 0,
Nigel Tao42843692020-08-17 16:48:30 +1000861 value_minor: (base.TOKEN__VBC__FILLER << 21) |
862 base.TOKEN__VBD__FILLER__PUNCTUATION,
Nigel Tao496e88b2020-04-09 22:10:08 +1000863 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +1100864 length: 1)
865 // What's valid after a comma depends on whether or not we're in an
866 // array or an object.
Nigel Tao80071732020-04-13 16:06:16 +1000867 if 0 == (expect & ((1 as base.u32) << CLASS_CLOSE_SQUARE_BRACKET)) {
Nigel Taoe39f3cb2020-04-14 23:03:18 +1000868 if this.quirks[QUIRK_ALLOW_EXTRA_COMMA - QUIRKS_BASE] {
Nigel Tao66b0a122020-06-09 23:48:54 +1000869 expect = EXPECT_STRING | EXPECT_CLOSE_CURLY_BRACE
Nigel Taof6843712020-02-22 08:17:03 +1100870 } else {
Nigel Tao66b0a122020-06-09 23:48:54 +1000871 expect = EXPECT_STRING
Nigel Taof6843712020-02-22 08:17:03 +1100872 }
Nigel Tao3fe3f842020-04-02 21:56:53 +1100873 } else {
Nigel Taoe39f3cb2020-04-14 23:03:18 +1000874 if this.quirks[QUIRK_ALLOW_EXTRA_COMMA - QUIRKS_BASE] {
Nigel Tao66b0a122020-06-09 23:48:54 +1000875 expect = EXPECT_VALUE | EXPECT_CLOSE_SQUARE_BRACKET
Nigel Tao01bab822020-02-23 09:03:10 +1100876 } else {
Nigel Tao66b0a122020-06-09 23:48:54 +1000877 expect = EXPECT_VALUE
Nigel Taoeb06ed72020-03-22 21:07:12 +1100878 }
Nigel Taoc92c0f52020-02-21 11:16:27 +1100879 }
Nigel Tao3fe3f842020-04-02 21:56:53 +1100880 continue.outer
Nigel Taoc92c0f52020-02-21 11:16:27 +1100881
Nigel Tao80071732020-04-13 16:06:16 +1000882 } else if class == CLASS_COLON {
Nigel Tao8b70ad02020-05-27 23:28:44 +1000883 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao42843692020-08-17 16:48:30 +1000884 // The ':' is punctuation (filler).
Nigel Tao3fe3f842020-04-02 21:56:53 +1100885 args.dst.write_simple_token_fast!(
886 value_major: 0,
Nigel Tao42843692020-08-17 16:48:30 +1000887 value_minor: (base.TOKEN__VBC__FILLER << 21) |
888 base.TOKEN__VBD__FILLER__PUNCTUATION,
Nigel Tao496e88b2020-04-09 22:10:08 +1000889 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +1100890 length: 1)
Nigel Tao66b0a122020-06-09 23:48:54 +1000891 expect = EXPECT_VALUE
Nigel Tao3fe3f842020-04-02 21:56:53 +1100892 continue.outer
893
Nigel Tao80071732020-04-13 16:06:16 +1000894 } else if class == CLASS_NUMBER {
Nigel Tao3fe3f842020-04-02 21:56:53 +1100895 // -------- BEGIN parse numbers.
896 while true,
Nigel Taobe8542e2020-08-13 23:26:08 +1000897 pre args.dst.length() > 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +1100898 {
899 number_length = this.decode_number!(src: args.src)
Nigel Tao84bb3af2020-07-07 23:29:30 +1000900 number_status = number_length >> 8
Nigel Tao3d31ac02020-06-09 23:34:42 +1000901 vminor = (base.TOKEN__VBC__NUMBER << 21) |
902 base.TOKEN__VBD__NUMBER__CONTENT_FLOATING_POINT |
903 base.TOKEN__VBD__NUMBER__CONTENT_INTEGER_SIGNED |
904 base.TOKEN__VBD__NUMBER__FORMAT_TEXT
Nigel Tao84bb3af2020-07-07 23:29:30 +1000905 if (number_length & 0x80) <> 0 {
Nigel Tao3d31ac02020-06-09 23:34:42 +1000906 vminor = (base.TOKEN__VBC__NUMBER << 21) |
907 base.TOKEN__VBD__NUMBER__CONTENT_FLOATING_POINT |
908 base.TOKEN__VBD__NUMBER__FORMAT_TEXT
Nigel Tao3fe3f842020-04-02 21:56:53 +1100909 }
Nigel Tao84bb3af2020-07-07 23:29:30 +1000910 number_length = number_length & 0x7F
Nigel Tao3fe3f842020-04-02 21:56:53 +1100911 if number_status == 0 {
912 args.dst.write_simple_token_fast!(
913 value_major: 0,
914 value_minor: vminor,
Nigel Tao496e88b2020-04-09 22:10:08 +1000915 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +1100916 length: number_length)
917 break
918 }
919
920 while number_length > 0 {
921 number_length -= 1
922 if args.src.can_undo_byte() {
923 args.src.undo_byte!()
924 } else {
925 return "#internal error: inconsistent I/O"
926 }
927 } endwhile
928
929 if number_status == 1 {
Nigel Taoe39f3cb2020-04-14 23:03:18 +1000930 if this.quirks[QUIRK_ALLOW_INF_NAN_NUMBERS - QUIRKS_BASE] {
Nigel Tao3fe3f842020-04-02 21:56:53 +1100931 this.decode_inf_nan?(dst: args.dst, src: args.src)
932 break
933 }
934 return "#bad input"
935 } else if number_status == 2 {
936 return "#unsupported number length"
937 } else {
938 yield? base."$short read"
Nigel Taobe8542e2020-08-13 23:26:08 +1000939 while args.dst.length() <= 0,
940 post args.dst.length() > 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +1100941 {
942 yield? base."$short write"
943 } endwhile
944 }
945 } endwhile
946 break.goto_parsed_a_leaf_value
947 // -------- END parse numbers.
948
Nigel Tao80071732020-04-13 16:06:16 +1000949 } else if class == CLASS_OPEN_CURLY_BRACE {
Nigel Tao3d31ac02020-06-09 23:34:42 +1000950 vminor = (base.TOKEN__VBC__STRUCTURE << 21) |
951 base.TOKEN__VBD__STRUCTURE__PUSH |
952 base.TOKEN__VBD__STRUCTURE__FROM_NONE |
953 base.TOKEN__VBD__STRUCTURE__TO_DICT
Nigel Tao3fe3f842020-04-02 21:56:53 +1100954 if depth == 0 {
955 // No-op.
Nigel Tao80071732020-04-13 16:06:16 +1000956 } else if 0 <> (expect_after_value & ((1 as base.u32) << CLASS_CLOSE_CURLY_BRACE)) {
Nigel Tao3d31ac02020-06-09 23:34:42 +1000957 vminor = (base.TOKEN__VBC__STRUCTURE << 21) |
958 base.TOKEN__VBD__STRUCTURE__PUSH |
959 base.TOKEN__VBD__STRUCTURE__FROM_DICT |
960 base.TOKEN__VBD__STRUCTURE__TO_DICT
Nigel Tao3fe3f842020-04-02 21:56:53 +1100961 } else {
Nigel Tao3d31ac02020-06-09 23:34:42 +1000962 vminor = (base.TOKEN__VBC__STRUCTURE << 21) |
963 base.TOKEN__VBD__STRUCTURE__PUSH |
964 base.TOKEN__VBD__STRUCTURE__FROM_LIST |
965 base.TOKEN__VBD__STRUCTURE__TO_DICT
Nigel Tao3fe3f842020-04-02 21:56:53 +1100966 }
967 if depth >= 1024 {
968 return "#unsupported recursion depth"
969 }
970 stack_byte = depth / 32
971 stack_bit = depth & 31
972 this.stack[stack_byte] |= (1 as base.u32) << stack_bit
973 depth += 1
974
Nigel Tao8b70ad02020-05-27 23:28:44 +1000975 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao3fe3f842020-04-02 21:56:53 +1100976 args.dst.write_simple_token_fast!(
977 value_major: 0,
978 value_minor: vminor,
Nigel Tao496e88b2020-04-09 22:10:08 +1000979 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +1100980 length: 1)
Nigel Tao66b0a122020-06-09 23:48:54 +1000981 expect = EXPECT_CLOSE_CURLY_BRACE | EXPECT_STRING
982 expect_after_value = EXPECT_CLOSE_CURLY_BRACE | EXPECT_COMMA
Nigel Tao3fe3f842020-04-02 21:56:53 +1100983 continue.outer
984
Nigel Tao80071732020-04-13 16:06:16 +1000985 } else if class == CLASS_CLOSE_CURLY_BRACE {
Nigel Tao8b70ad02020-05-27 23:28:44 +1000986 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao3fe3f842020-04-02 21:56:53 +1100987 if depth <= 1 {
988 args.dst.write_simple_token_fast!(
989 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +1000990 value_minor: (base.TOKEN__VBC__STRUCTURE << 21) |
991 base.TOKEN__VBD__STRUCTURE__POP |
992 base.TOKEN__VBD__STRUCTURE__FROM_DICT |
993 base.TOKEN__VBD__STRUCTURE__TO_NONE,
Nigel Tao496e88b2020-04-09 22:10:08 +1000994 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +1100995 length: 1)
996 break.outer
997 }
998 depth -= 1
999 stack_byte = (depth - 1) / 32
1000 stack_bit = (depth - 1) & 31
1001 if 0 == (this.stack[stack_byte] & ((1 as base.u32) << stack_bit)) {
1002 args.dst.write_simple_token_fast!(
1003 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +10001004 value_minor: (base.TOKEN__VBC__STRUCTURE << 21) |
1005 base.TOKEN__VBD__STRUCTURE__POP |
1006 base.TOKEN__VBD__STRUCTURE__FROM_DICT |
1007 base.TOKEN__VBD__STRUCTURE__TO_LIST,
Nigel Tao496e88b2020-04-09 22:10:08 +10001008 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +11001009 length: 1)
Nigel Tao66b0a122020-06-09 23:48:54 +10001010 expect = EXPECT_CLOSE_SQUARE_BRACKET | EXPECT_COMMA
1011 expect_after_value = EXPECT_CLOSE_SQUARE_BRACKET | EXPECT_COMMA
Nigel Tao3fe3f842020-04-02 21:56:53 +11001012 } else {
1013 args.dst.write_simple_token_fast!(
1014 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +10001015 value_minor: (base.TOKEN__VBC__STRUCTURE << 21) |
1016 base.TOKEN__VBD__STRUCTURE__POP |
1017 base.TOKEN__VBD__STRUCTURE__FROM_DICT |
1018 base.TOKEN__VBD__STRUCTURE__TO_DICT,
Nigel Tao496e88b2020-04-09 22:10:08 +10001019 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +11001020 length: 1)
Nigel Tao66b0a122020-06-09 23:48:54 +10001021 expect = EXPECT_CLOSE_CURLY_BRACE | EXPECT_COMMA
1022 expect_after_value = EXPECT_CLOSE_CURLY_BRACE | EXPECT_COMMA
Nigel Tao3fe3f842020-04-02 21:56:53 +11001023 }
1024 continue.outer
1025
Nigel Tao80071732020-04-13 16:06:16 +10001026 } else if class == CLASS_OPEN_SQUARE_BRACKET {
Nigel Tao3d31ac02020-06-09 23:34:42 +10001027 vminor = (base.TOKEN__VBC__STRUCTURE << 21) |
1028 base.TOKEN__VBD__STRUCTURE__PUSH |
1029 base.TOKEN__VBD__STRUCTURE__FROM_NONE |
1030 base.TOKEN__VBD__STRUCTURE__TO_LIST
Nigel Tao3fe3f842020-04-02 21:56:53 +11001031 if depth == 0 {
1032 // No-op.
Nigel Tao80071732020-04-13 16:06:16 +10001033 } else if 0 <> (expect_after_value & ((1 as base.u32) << CLASS_CLOSE_CURLY_BRACE)) {
Nigel Tao3d31ac02020-06-09 23:34:42 +10001034 vminor = (base.TOKEN__VBC__STRUCTURE << 21) |
1035 base.TOKEN__VBD__STRUCTURE__PUSH |
1036 base.TOKEN__VBD__STRUCTURE__FROM_DICT |
1037 base.TOKEN__VBD__STRUCTURE__TO_LIST
Nigel Tao3fe3f842020-04-02 21:56:53 +11001038 } else {
Nigel Tao3d31ac02020-06-09 23:34:42 +10001039 vminor = (base.TOKEN__VBC__STRUCTURE << 21) |
1040 base.TOKEN__VBD__STRUCTURE__PUSH |
1041 base.TOKEN__VBD__STRUCTURE__FROM_LIST |
1042 base.TOKEN__VBD__STRUCTURE__TO_LIST
Nigel Tao3fe3f842020-04-02 21:56:53 +11001043 }
1044 if depth >= 1024 {
1045 return "#unsupported recursion depth"
1046 }
1047 stack_byte = depth / 32
1048 stack_bit = depth & 31
1049 this.stack[stack_byte] &= 0xFFFF_FFFF ^ ((1 as base.u32) << stack_bit)
1050 depth += 1
1051
Nigel Tao8b70ad02020-05-27 23:28:44 +10001052 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao3fe3f842020-04-02 21:56:53 +11001053 args.dst.write_simple_token_fast!(
1054 value_major: 0,
1055 value_minor: vminor,
Nigel Tao496e88b2020-04-09 22:10:08 +10001056 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +11001057 length: 1)
Nigel Tao66b0a122020-06-09 23:48:54 +10001058 expect = EXPECT_CLOSE_SQUARE_BRACKET | EXPECT_VALUE
1059 expect_after_value = EXPECT_CLOSE_SQUARE_BRACKET | EXPECT_COMMA
Nigel Tao3fe3f842020-04-02 21:56:53 +11001060 continue.outer
1061
Nigel Tao80071732020-04-13 16:06:16 +10001062 } else if class == CLASS_CLOSE_SQUARE_BRACKET {
Nigel Tao8b70ad02020-05-27 23:28:44 +10001063 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao3fe3f842020-04-02 21:56:53 +11001064 if depth <= 1 {
1065 args.dst.write_simple_token_fast!(
1066 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +10001067 value_minor: (base.TOKEN__VBC__STRUCTURE << 21) |
1068 base.TOKEN__VBD__STRUCTURE__POP |
1069 base.TOKEN__VBD__STRUCTURE__FROM_LIST |
1070 base.TOKEN__VBD__STRUCTURE__TO_NONE,
Nigel Tao496e88b2020-04-09 22:10:08 +10001071 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +11001072 length: 1)
1073 break.outer
1074 }
1075 depth -= 1
1076 stack_byte = (depth - 1) / 32
1077 stack_bit = (depth - 1) & 31
1078 if 0 == (this.stack[stack_byte] & ((1 as base.u32) << stack_bit)) {
1079 args.dst.write_simple_token_fast!(
1080 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +10001081 value_minor: (base.TOKEN__VBC__STRUCTURE << 21) |
1082 base.TOKEN__VBD__STRUCTURE__POP |
1083 base.TOKEN__VBD__STRUCTURE__FROM_LIST |
1084 base.TOKEN__VBD__STRUCTURE__TO_LIST,
Nigel Tao496e88b2020-04-09 22:10:08 +10001085 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +11001086 length: 1)
Nigel Tao66b0a122020-06-09 23:48:54 +10001087 expect = EXPECT_CLOSE_SQUARE_BRACKET | EXPECT_COMMA
1088 expect_after_value = EXPECT_CLOSE_SQUARE_BRACKET | EXPECT_COMMA
Nigel Tao3fe3f842020-04-02 21:56:53 +11001089 } else {
1090 args.dst.write_simple_token_fast!(
1091 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +10001092 value_minor: (base.TOKEN__VBC__STRUCTURE << 21) |
1093 base.TOKEN__VBD__STRUCTURE__POP |
1094 base.TOKEN__VBD__STRUCTURE__FROM_LIST |
1095 base.TOKEN__VBD__STRUCTURE__TO_DICT,
Nigel Tao496e88b2020-04-09 22:10:08 +10001096 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +11001097 length: 1)
Nigel Tao66b0a122020-06-09 23:48:54 +10001098 expect = EXPECT_CLOSE_CURLY_BRACE | EXPECT_COMMA
1099 expect_after_value = EXPECT_CLOSE_CURLY_BRACE | EXPECT_COMMA
Nigel Tao3fe3f842020-04-02 21:56:53 +11001100 }
1101 continue.outer
1102
Nigel Tao80071732020-04-13 16:06:16 +10001103 } else if class == CLASS_FALSE {
Nigel Tao3fe3f842020-04-02 21:56:53 +11001104 match = args.src.match7(a: '\x05false'le)
1105 if match == 0 {
1106 args.dst.write_simple_token_fast!(
1107 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +10001108 value_minor: (base.TOKEN__VBC__LITERAL << 21) |
1109 base.TOKEN__VBD__LITERAL__FALSE,
Nigel Tao496e88b2020-04-09 22:10:08 +10001110 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +11001111 length: 5)
Nigel Taobe8542e2020-08-13 23:26:08 +10001112 if args.src.length() < 5 {
Nigel Tao3fe3f842020-04-02 21:56:53 +11001113 return "#internal error: inconsistent I/O"
1114 }
Nigel Tao8b70ad02020-05-27 23:28:44 +10001115 args.src.skip_u32_fast!(actual: 5, worst_case: 5)
Nigel Tao3fe3f842020-04-02 21:56:53 +11001116 break.goto_parsed_a_leaf_value
1117 } else if match == 1 {
1118 yield? base."$short read"
1119 continue.outer
1120 }
1121
Nigel Tao80071732020-04-13 16:06:16 +10001122 } else if class == CLASS_TRUE {
Nigel Tao3fe3f842020-04-02 21:56:53 +11001123 match = args.src.match7(a: '\x04true'le)
1124 if match == 0 {
1125 args.dst.write_simple_token_fast!(
1126 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +10001127 value_minor: (base.TOKEN__VBC__LITERAL << 21) |
1128 base.TOKEN__VBD__LITERAL__TRUE,
Nigel Tao496e88b2020-04-09 22:10:08 +10001129 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +11001130 length: 4)
Nigel Taobe8542e2020-08-13 23:26:08 +10001131 if args.src.length() < 4 {
Nigel Tao3fe3f842020-04-02 21:56:53 +11001132 return "#internal error: inconsistent I/O"
1133 }
Nigel Tao8b70ad02020-05-27 23:28:44 +10001134 args.src.skip_u32_fast!(actual: 4, worst_case: 4)
Nigel Tao3fe3f842020-04-02 21:56:53 +11001135 break.goto_parsed_a_leaf_value
1136 } else if match == 1 {
1137 yield? base."$short read"
1138 continue.outer
1139 }
1140
Nigel Tao80071732020-04-13 16:06:16 +10001141 } else if class == CLASS_NULL_NAN_INF {
Nigel Tao3fe3f842020-04-02 21:56:53 +11001142 match = args.src.match7(a: '\x04null'le)
1143 if match == 0 {
1144 args.dst.write_simple_token_fast!(
1145 value_major: 0,
Nigel Tao3d31ac02020-06-09 23:34:42 +10001146 value_minor: (base.TOKEN__VBC__LITERAL << 21) |
1147 base.TOKEN__VBD__LITERAL__NULL,
Nigel Tao496e88b2020-04-09 22:10:08 +10001148 continued: 0,
Nigel Tao3fe3f842020-04-02 21:56:53 +11001149 length: 4)
Nigel Taobe8542e2020-08-13 23:26:08 +10001150 if args.src.length() < 4 {
Nigel Tao3fe3f842020-04-02 21:56:53 +11001151 return "#internal error: inconsistent I/O"
1152 }
Nigel Tao8b70ad02020-05-27 23:28:44 +10001153 args.src.skip_u32_fast!(actual: 4, worst_case: 4)
Nigel Tao3fe3f842020-04-02 21:56:53 +11001154 break.goto_parsed_a_leaf_value
1155 } else if match == 1 {
1156 yield? base."$short read"
1157 continue.outer
1158 }
1159
Nigel Taoe39f3cb2020-04-14 23:03:18 +10001160 if this.quirks[QUIRK_ALLOW_INF_NAN_NUMBERS - QUIRKS_BASE] {
Nigel Tao3fe3f842020-04-02 21:56:53 +11001161 this.decode_inf_nan?(dst: args.dst, src: args.src)
1162 break.goto_parsed_a_leaf_value
1163 }
1164
Nigel Tao80071732020-04-13 16:06:16 +10001165 } else if class == CLASS_COMMENT {
Nigel Taoe39f3cb2020-04-14 23:03:18 +10001166 if this.quirks[QUIRK_ALLOW_COMMENT_BLOCK - QUIRKS_BASE] or
1167 this.quirks[QUIRK_ALLOW_COMMENT_LINE - QUIRKS_BASE] {
Nigel Tao3fe3f842020-04-02 21:56:53 +11001168 this.decode_comment?(dst: args.dst, src: args.src)
Nigel Taocd4cbc92020-09-22 22:22:15 +10001169 if this.comment_type > 0 {
1170 continue.outer
1171 }
Nigel Tao3fe3f842020-04-02 21:56:53 +11001172 }
1173 }
1174
1175 return "#bad input"
1176 }} endwhile.goto_parsed_a_leaf_value
Nigel Taoea91e5a2020-02-13 12:52:53 +11001177
Nigel Tao93b09672020-02-21 11:42:33 +11001178 // We've just parsed a leaf (non-container) value: literal (null,
1179 // false, true), number or string.
Nigel Taoea91e5a2020-02-13 12:52:53 +11001180 if depth == 0 {
1181 break.outer
1182 }
Nigel Tao40778f02020-02-13 14:19:07 +11001183 expect = expect_after_value
Nigel Taob1e3a152020-02-22 13:06:12 +11001184 } endwhile.outer
Nigel Tao791437b2020-03-17 14:14:16 +11001185
Nigel Taocd4cbc92020-09-22 22:22:15 +10001186 if this.quirks[QUIRK_ALLOW_TRAILING_FILLER - QUIRKS_BASE] or
1187 this.quirks[QUIRK_EXPECT_TRAILING_NEW_LINE_OR_EOF - QUIRKS_BASE] {
1188 this.decode_trailer?(dst: args.dst, src: args.src)
Nigel Tao4b186b02020-03-18 14:25:21 +11001189 }
1190
Nigel Tao791437b2020-03-17 14:14:16 +11001191 this.end_of_data = true
Nigel Tao7a142162020-02-11 10:05:53 +11001192}
Nigel Tao8850d382020-02-19 12:25:00 +11001193
Nigel Tao84bb3af2020-07-07 23:29:30 +10001194pri func decoder.decode_number!(src: base.io_reader) base.u32[..= 0x3FF] {
Nigel Tao9a5b6352020-02-20 22:25:04 +11001195 var c : base.u8
Nigel Tao84bb3af2020-07-07 23:29:30 +10001196 var n : base.u32[..= 0x3FF]
1197 var floating_point : base.u32[..= 0x80]
Nigel Tao8850d382020-02-19 12:25:00 +11001198
Nigel Tao3fe3f842020-04-02 21:56:53 +11001199 while.goto_done true {{
1200 n = 0
1201
1202 // Peek.
Nigel Taobe8542e2020-08-13 23:26:08 +10001203 if args.src.length() <= 0 {
Nigel Tao3fe3f842020-04-02 21:56:53 +11001204 if not args.src.is_closed() {
Nigel Tao84bb3af2020-07-07 23:29:30 +10001205 n |= 0x300
Nigel Tao3fe3f842020-04-02 21:56:53 +11001206 }
1207 break.goto_done
1208 }
1209 c = args.src.peek_u8()
1210
1211 // Scan the optional minus sign.
1212 if c <> '-' {
Nigel Taobe8542e2020-08-13 23:26:08 +10001213 assert args.src.length() > 0
Nigel Tao3fe3f842020-04-02 21:56:53 +11001214 assert n <= 1
1215 } else {
1216 n += 1
Nigel Tao8b70ad02020-05-27 23:28:44 +10001217 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao8850d382020-02-19 12:25:00 +11001218
Nigel Tao7acb35f2020-02-20 11:41:39 +11001219 // Peek.
Nigel Taobe8542e2020-08-13 23:26:08 +10001220 if args.src.length() <= 0 {
Nigel Tao7acb35f2020-02-20 11:41:39 +11001221 if not args.src.is_closed() {
Nigel Tao84bb3af2020-07-07 23:29:30 +10001222 n |= 0x300
Nigel Tao7acb35f2020-02-20 11:41:39 +11001223 }
Nigel Tao84bb3af2020-07-07 23:29:30 +10001224 n |= 0x100 // A '-' without digits is invalid.
Nigel Tao7acb35f2020-02-20 11:41:39 +11001225 break.goto_done
1226 }
1227 c = args.src.peek_u8()
1228
Nigel Taobe8542e2020-08-13 23:26:08 +10001229 assert args.src.length() > 0
Nigel Tao3fe3f842020-04-02 21:56:53 +11001230 assert n <= 1
1231 }
Nigel Tao7acb35f2020-02-20 11:41:39 +11001232
Nigel Tao3fe3f842020-04-02 21:56:53 +11001233 // Scan the opening digits.
1234 if c == '0' {
1235 n += 1
Nigel Tao8b70ad02020-05-27 23:28:44 +10001236 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao84bb3af2020-07-07 23:29:30 +10001237 assert n <= 99
Nigel Tao3fe3f842020-04-02 21:56:53 +11001238 } else {
1239 n = this.decode_digits!(src: args.src, n: n)
Nigel Tao84bb3af2020-07-07 23:29:30 +10001240 if n > 99 {
Nigel Tao7acb35f2020-02-20 11:41:39 +11001241 break.goto_done
1242 }
Nigel Tao84bb3af2020-07-07 23:29:30 +10001243 assert n <= 99
Nigel Tao3fe3f842020-04-02 21:56:53 +11001244 }
Nigel Tao7acb35f2020-02-20 11:41:39 +11001245
Nigel Tao3fe3f842020-04-02 21:56:53 +11001246 // Peek.
Nigel Taobe8542e2020-08-13 23:26:08 +10001247 if args.src.length() <= 0 {
Nigel Tao3fe3f842020-04-02 21:56:53 +11001248 if not args.src.is_closed() {
Nigel Tao84bb3af2020-07-07 23:29:30 +10001249 n |= 0x300
Nigel Tao7acb35f2020-02-20 11:41:39 +11001250 }
Nigel Tao3fe3f842020-04-02 21:56:53 +11001251 break.goto_done
1252 }
1253 c = args.src.peek_u8()
Nigel Tao7acb35f2020-02-20 11:41:39 +11001254
Nigel Tao3fe3f842020-04-02 21:56:53 +11001255 // Scan the optional fraction.
1256 if c <> '.' {
Nigel Taobe8542e2020-08-13 23:26:08 +10001257 assert args.src.length() > 0
Nigel Tao84bb3af2020-07-07 23:29:30 +10001258 assert n <= 99
Nigel Tao3fe3f842020-04-02 21:56:53 +11001259 } else {
Nigel Tao84bb3af2020-07-07 23:29:30 +10001260 if n >= 99 {
1261 n |= 0x200
Nigel Taoa5184ed2020-03-06 21:05:44 +11001262 break.goto_done
1263 }
Nigel Tao7acb35f2020-02-20 11:41:39 +11001264 n += 1
Nigel Tao8b70ad02020-05-27 23:28:44 +10001265 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao84bb3af2020-07-07 23:29:30 +10001266 floating_point = 0x80
Nigel Tao3fe3f842020-04-02 21:56:53 +11001267
1268 n = this.decode_digits!(src: args.src, n: n)
Nigel Tao84bb3af2020-07-07 23:29:30 +10001269 if n > 99 {
Nigel Tao3fe3f842020-04-02 21:56:53 +11001270 break.goto_done
1271 }
Nigel Tao7acb35f2020-02-20 11:41:39 +11001272
1273 // Peek.
Nigel Taobe8542e2020-08-13 23:26:08 +10001274 if args.src.length() <= 0 {
Nigel Tao7acb35f2020-02-20 11:41:39 +11001275 if not args.src.is_closed() {
Nigel Tao84bb3af2020-07-07 23:29:30 +10001276 n |= 0x300
Nigel Tao7acb35f2020-02-20 11:41:39 +11001277 }
Nigel Tao7acb35f2020-02-20 11:41:39 +11001278 break.goto_done
1279 }
1280 c = args.src.peek_u8()
1281
Nigel Taobe8542e2020-08-13 23:26:08 +10001282 assert args.src.length() > 0
Nigel Tao84bb3af2020-07-07 23:29:30 +10001283 assert n <= 99
Nigel Tao3fe3f842020-04-02 21:56:53 +11001284 }
Nigel Tao7acb35f2020-02-20 11:41:39 +11001285
Nigel Tao3fe3f842020-04-02 21:56:53 +11001286 // Scan the optional 'E' or 'e'.
1287 if (c <> 'E') and (c <> 'e') {
Nigel Tao7acb35f2020-02-20 11:41:39 +11001288 break.goto_done
Nigel Tao3fe3f842020-04-02 21:56:53 +11001289 }
Nigel Tao84bb3af2020-07-07 23:29:30 +10001290 if n >= 99 {
1291 n |= 0x200
Nigel Tao3fe3f842020-04-02 21:56:53 +11001292 break.goto_done
1293 }
1294 n += 1
Nigel Tao8b70ad02020-05-27 23:28:44 +10001295 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao84bb3af2020-07-07 23:29:30 +10001296 floating_point = 0x80
1297 assert n <= 99
Nigel Tao3fe3f842020-04-02 21:56:53 +11001298
1299 // Peek.
Nigel Taobe8542e2020-08-13 23:26:08 +10001300 if args.src.length() <= 0 {
Nigel Tao3fe3f842020-04-02 21:56:53 +11001301 if not args.src.is_closed() {
Nigel Tao84bb3af2020-07-07 23:29:30 +10001302 n |= 0x300
Nigel Tao3fe3f842020-04-02 21:56:53 +11001303 }
Nigel Tao84bb3af2020-07-07 23:29:30 +10001304 n |= 0x100 // An 'E' or 'e' without digits is invalid.
Nigel Tao3fe3f842020-04-02 21:56:53 +11001305 break.goto_done
1306 }
1307 c = args.src.peek_u8()
1308
1309 // Scan the optional '+' or '-'.
1310 if (c <> '+') and (c <> '-') {
Nigel Tao84bb3af2020-07-07 23:29:30 +10001311 assert n <= 99
Nigel Tao3fe3f842020-04-02 21:56:53 +11001312 } else {
Nigel Tao84bb3af2020-07-07 23:29:30 +10001313 if n >= 99 {
1314 n |= 0x200
Nigel Tao3fe3f842020-04-02 21:56:53 +11001315 break.goto_done
1316 }
1317 n += 1
Nigel Tao8b70ad02020-05-27 23:28:44 +10001318 args.src.skip_u32_fast!(actual: 1, worst_case: 1)
Nigel Tao84bb3af2020-07-07 23:29:30 +10001319 assert n <= 99
Nigel Tao3fe3f842020-04-02 21:56:53 +11001320 }
1321
1322 // Scan the exponent digits.
1323 n = this.decode_digits!(src: args.src, n: n)
1324
1325 break.goto_done
1326 }} endwhile.goto_done
1327
Nigel Tao9a5b6352020-02-20 22:25:04 +11001328 return n | floating_point
Nigel Tao7acb35f2020-02-20 11:41:39 +11001329}
1330
// decode_digits advances args.src past a run of decimal digits (bytes whose
// LUT_DECIMAL_DIGITS entry is non-zero) and returns args.n plus the number of
// bytes skipped, possibly or'ed with flag bits above the 0..=99 count range:
//  - 0x300 is or'ed in when src runs dry but is not yet closed (more input
//    may still arrive).
//  - 0x200 is or'ed in when another digit is available but the running count
//    has already reached the cap of 99.
//  - 0x100 is or'ed in when the loop ends with the count still equal to
//    args.n. The caller treats 0x100 as "invalid" (e.g. an exponent marker
//    without digits).
pri func decoder.decode_digits!(src: base.io_reader, n: base.u32[..= 99]) base.u32[..= 0x3FF] {
    var digit : base.u8
    var total : base.u32[..= 0x3FF]

    total = args.n
    while true {
        // Out of input: flag it unless this really is the end of the stream.
        if args.src.length() <= 0 {
            if not args.src.is_closed() {
                total |= 0x300
            }
            break
        }

        // Stop at the first byte that is not a decimal digit.
        digit = args.src.peek_u8()
        if LUT_DECIMAL_DIGITS[digit] == 0x00 {
            break
        }

        // DECODER_NUMBER_LENGTH_MAX_INCL is capped at 99, an arbitrary limit.
        // It follows that the caller's src.data.len should be 100 or more,
        // which is DECODER_SRC_IO_BUFFER_LENGTH_MIN_INCL.
        //
        // For reference, here is a valid JSON number that is 81 bytes long:
        // https://github.com/nst/JSONTestSuite/blob/master/test_parsing/y_number_double_close_to_zero.json
        //
        // Since 99 is 0x63, which is below 0x80, func decoder.decode_number
        // is free to use 0x80 as a flag bit.
        if total >= 99 {
            total |= 0x200
            break
        }
        total += 1
        args.src.skip_u32_fast!(actual: 1, worst_case: 1)
    } endwhile

    // Nothing was added to the count passed in.
    if total == args.n {
        total |= 0x100
    }
    return total
}
Nigel Tao3a75d972020-03-17 22:04:56 +11001368
// decode_leading consumes the optional bytes that the two "allow leading"
// quirks permit before the JSON value proper: a single ASCII Record Separator
// (0x1E) and a single UTF-8 encoded Unicode Byte Order Mark (0xEF 0xBB 0xBF).
// They may come in either order and each is accepted at most once. Every
// accepted prefix is reported as one token with a zero value_major and a zero
// value_minor.
//
// The loop stops at the first byte that is not an allowed prefix, or when src
// is closed before such a byte is fully available. It suspends with "$short
// write" or "$short read" when dst has no room or src needs more bytes.
pri func decoder.decode_leading?(dst: base.token_writer, src: base.io_reader) {
    var b   : base.u8
    var bom : base.u32

    // Snapshot the quirks into per-call flags. A flag is cleared once its
    // prefix has been seen, which is what enforces "at most once".
    this.allow_leading_ubom =
            this.quirks[QUIRK_ALLOW_LEADING_UNICODE_BYTE_ORDER_MARK - QUIRKS_BASE]
    this.allow_leading_ars =
            this.quirks[QUIRK_ALLOW_LEADING_ASCII_RECORD_SEPARATOR - QUIRKS_BASE]

    while this.allow_leading_ars or this.allow_leading_ubom {
        if args.dst.length() <= 0 {
            yield? base."$short write"
            continue
        }
        if args.src.length() <= 0 {
            if args.src.is_closed() {
                break
            }
            yield? base."$short read"
            continue
        }

        b = args.src.peek_u8()
        if (b == 0x1E) and this.allow_leading_ars {
            // ASCII Record Separator.
            this.allow_leading_ars = false
            args.src.skip_u32_fast!(actual: 1, worst_case: 1)
            args.dst.write_simple_token_fast!(
                    value_major: 0, value_minor: 0, continued: 0, length: 1)
            continue
        } else if (b == 0xEF) and this.allow_leading_ubom {
            // Possibly a BOM: all three bytes are needed to decide.
            if args.src.length() < 3 {
                if args.src.is_closed() {
                    break
                }
                yield? base."$short read"
                continue
            }
            // 0xBF_BBEF is the byte sequence EF BB BF, read little-endian.
            bom = args.src.peek_u24le_as_u32()
            if bom == 0xBF_BBEF {
                this.allow_leading_ubom = false
                args.src.skip_u32_fast!(actual: 3, worst_case: 3)
                args.dst.write_simple_token_fast!(
                        value_major: 0, value_minor: 0, continued: 0, length: 3)
                continue
            }
        }
        break
    } endwhile
}
Nigel Tao4b186b02020-03-18 14:25:21 +11001417
// decode_comment tries to consume one comment at the current src position,
// emitting filler tokens for it, and records the outcome in this.comment_type:
//  - 0 if no comment was consumed (including when src is closed with fewer
//    than two bytes left, or when the relevant quirk is not enabled).
//  - 1 if a "/* ... */" block comment was consumed.
//  - 2 if a "// ..." line comment was consumed.
//
// A comment too long for a single token, or one that straddles a short read,
// is emitted as a chain of tokens: all but the last have continued set to 1.
//
// A block comment that is still unterminated when src is closed is an error
// ("#bad input"). A line comment ends at a '\n' byte (which is left in src,
// not consumed) or at the end of a closed src.
pri func decoder.decode_comment?(dst: base.token_writer, src: base.io_reader) {
    var b    : base.u8
    var pair : base.u16
    var span : base.u32[..= 0xFFFD]

    this.comment_type = 0

    // Wait until there is room for at least one token and at least two source
    // bytes to inspect.
    while (args.dst.length() <= 0) or (args.src.length() <= 1),
            post args.dst.length() > 0,
            post args.src.length() > 1,
    {
        if args.dst.length() <= 0 {
            yield? base."$short write"
            continue
        }
        if args.src.is_closed() {
            // Too few bytes remain to start a comment.
            return ok
        }
        yield? base."$short read"
    } endwhile
    pair = args.src.peek_u16le()

    if (pair == '/*'le) and this.quirks[QUIRK_ALLOW_COMMENT_BLOCK - QUIRKS_BASE] {
        // span counts the bytes consumed since the last emitted token. It
        // starts at 2 for the opening "/*".
        args.src.skip_u32_fast!(actual: 2, worst_case: 2)
        span = 2

        while.comment_block true,
                pre args.dst.length() > 0,
        {
            // Two bytes are needed to recognize the closing "*/".
            if args.src.length() <= 1 {
                // Flush what has been consumed so far before suspending.
                if span > 0 {
                    args.dst.write_simple_token_fast!(
                            value_major: 0,
                            value_minor: (base.TOKEN__VBC__FILLER << 21) |
                            base.TOKEN__VBD__FILLER__COMMENT_BLOCK,
                            continued: 1,
                            length: span)
                }
                if args.src.is_closed() {
                    // Unterminated block comment.
                    return "#bad input"
                }
                yield? base."$short read"
                // Re-establish the loop's pre-condition.
                while args.dst.length() <= 0,
                        post args.dst.length() > 0,
                {
                    yield? base."$short write"
                } endwhile
                span = 0
                continue.comment_block
            }

            pair = args.src.peek_u16le()
            if pair == '*/'le {
                // Final token of the chain, covering the closing "*/" too.
                args.src.skip_u32_fast!(actual: 2, worst_case: 2)
                args.dst.write_simple_token_fast!(
                        value_major: 0,
                        value_minor: (base.TOKEN__VBC__FILLER << 21) |
                        base.TOKEN__VBD__FILLER__COMMENT_BLOCK,
                        continued: 0,
                        length: span + 2)
                this.comment_type = 1
                return ok
            }

            // An ordinary comment byte. If span is already at its maximum,
            // emit a continued token that includes this byte and start over.
            args.src.skip_u32_fast!(actual: 1, worst_case: 1)
            if span >= 0xFFFD {
                args.dst.write_simple_token_fast!(
                        value_major: 0,
                        value_minor: (base.TOKEN__VBC__FILLER << 21) |
                        base.TOKEN__VBD__FILLER__COMMENT_BLOCK,
                        continued: 1,
                        length: span + 1)
                while args.dst.length() <= 0,
                        post args.dst.length() > 0,
                {
                    yield? base."$short write"
                } endwhile
                span = 0
                continue.comment_block
            }
            span += 1
        } endwhile.comment_block

    } else if (pair == '//'le) and this.quirks[QUIRK_ALLOW_COMMENT_LINE - QUIRKS_BASE] {
        // As above, span starts at 2 for the opening "//".
        args.src.skip_u32_fast!(actual: 2, worst_case: 2)
        span = 2

        while.comment_line true,
                pre args.dst.length() > 0,
        {
            if args.src.length() <= 0 {
                if args.src.is_closed() {
                    // End of input also ends a line comment. This final
                    // token (possibly zero length) closes the chain.
                    args.dst.write_simple_token_fast!(
                            value_major: 0,
                            value_minor: (base.TOKEN__VBC__FILLER << 21) |
                            base.TOKEN__VBD__FILLER__COMMENT_LINE,
                            continued: 0,
                            length: span)
                    this.comment_type = 2
                    return ok
                } else if span > 0 {
                    // Flush what has been consumed so far before suspending.
                    args.dst.write_simple_token_fast!(
                            value_major: 0,
                            value_minor: (base.TOKEN__VBC__FILLER << 21) |
                            base.TOKEN__VBD__FILLER__COMMENT_LINE,
                            continued: 1,
                            length: span)
                }
                yield? base."$short read"
                // Re-establish the loop's pre-condition.
                while args.dst.length() <= 0,
                        post args.dst.length() > 0,
                {
                    yield? base."$short write"
                } endwhile
                span = 0
                continue.comment_line
            }

            b = args.src.peek_u8()
            if b == '\n' {
                // The new line byte itself is only peeked at, not skipped.
                args.dst.write_simple_token_fast!(
                        value_major: 0,
                        value_minor: (base.TOKEN__VBC__FILLER << 21) |
                        base.TOKEN__VBD__FILLER__COMMENT_LINE,
                        continued: 0,
                        length: span)
                this.comment_type = 2
                return ok
            }

            // An ordinary comment byte. If span is already at its maximum,
            // emit a continued token that includes this byte and start over.
            args.src.skip_u32_fast!(actual: 1, worst_case: 1)
            if span >= 0xFFFD {
                args.dst.write_simple_token_fast!(
                        value_major: 0,
                        value_minor: (base.TOKEN__VBC__FILLER << 21) |
                        base.TOKEN__VBD__FILLER__COMMENT_LINE,
                        continued: 1,
                        length: span + 1)
                while args.dst.length() <= 0,
                        post args.dst.length() > 0,
                {
                    yield? base."$short write"
                } endwhile
                span = 0
                continue.comment_line
            }
            span += 1
        } endwhile.comment_line
    }
}
1568
// decode_inf_nan recognizes, case-insensitively, "inf", "infinity" and "nan",
// each optionally preceded by a single '+' or '-', and emits one number token
// spanning the matched bytes. Anything else, including src being closed
// before enough bytes are available, is "#bad input".
//
// No bytes are skipped until a token is written, so after a "$short read"
// suspension the loop simply starts its inspection again from the top.
pri func decoder.decode_inf_nan?(dst: base.token_writer, src: base.io_reader) {
    var word     : base.u32
    var negative : base.u32[..= 1]

    while true {
        if args.dst.length() <= 0 {
            yield? base."$short write"
            continue
        }
        // At least three bytes are needed for the shortest spelling.
        if args.src.length() <= 2 {
            if args.src.is_closed() {
                return "#bad input"
            }
            yield? base."$short read"
            continue
        }

        // Setting the 0x20 bit of an upper case ASCII letter yields its lower
        // case form, so or'ing with 0x20 per byte gives case-insensitivity.

        // Unsigned forms.
        word = args.src.peek_u24le_as_u32()
        if (word | 0x20_2020) == 'inf'le {
            if args.src.length() > 7 {
                // Prefer the longer spelling when all eight bytes match.
                if (args.src.peek_u64le() | 0x2020_2020_2020_2020) == 'infinity'le {
                    args.dst.write_simple_token_fast!(
                            value_major: 0,
                            value_minor: (base.TOKEN__VBC__NUMBER << 21) |
                            base.TOKEN__VBD__NUMBER__CONTENT_POS_INF,
                            continued: 0,
                            length: 8)
                    args.src.skip_u32_fast!(actual: 8, worst_case: 8)
                    return ok
                }
            } else if not args.src.is_closed() {
                // Cannot yet tell "inf" from "infinity": ask for more bytes.
                yield? base."$short read"
                continue
            }
            args.dst.write_simple_token_fast!(
                    value_major: 0,
                    value_minor: (base.TOKEN__VBC__NUMBER << 21) |
                    base.TOKEN__VBD__NUMBER__CONTENT_POS_INF,
                    continued: 0,
                    length: 3)
            args.src.skip_u32_fast!(actual: 3, worst_case: 3)
            return ok

        } else if (word | 0x20_2020) == 'nan'le {
            args.dst.write_simple_token_fast!(
                    value_major: 0,
                    value_minor: (base.TOKEN__VBC__NUMBER << 21) |
                    base.TOKEN__VBD__NUMBER__CONTENT_POS_NAN,
                    continued: 0,
                    length: 3)
            args.src.skip_u32_fast!(actual: 3, worst_case: 3)
            return ok
        } else if (word & 0xFF) == '+' {
            negative = 0
        } else if (word & 0xFF) == '-' {
            negative = 1
        } else {
            return "#bad input"
        }

        // Signed forms: one sign byte followed by at least three letters.
        if args.src.length() <= 3 {
            if args.src.is_closed() {
                return "#bad input"
            }
            yield? base."$short read"
            continue
        }

        // Drop the sign byte (the low 8 bits) to leave the next three bytes.
        // In the tokens below, the content bit is shifted right by negative.
        // NOTE(review): presumably the NEG_ content bit sits one place below
        // its POS_ counterpart - confirm against the base token constants.
        word = args.src.peek_u32le() >> 8
        if (word | 0x20_2020) == 'inf'le {
            if args.src.length() > 8 {
                if (args.src.peek_u64le_at(offset: 1) | 0x2020_2020_2020_2020) == 'infinity'le {
                    args.dst.write_simple_token_fast!(
                            value_major: 0,
                            value_minor: (base.TOKEN__VBC__NUMBER << 21) |
                            (base.TOKEN__VBD__NUMBER__CONTENT_POS_INF >> negative),
                            continued: 0,
                            length: 9)
                    args.src.skip_u32_fast!(actual: 9, worst_case: 9)
                    return ok
                }
            } else if not args.src.is_closed() {
                // Cannot yet tell "inf" from "infinity": ask for more bytes.
                yield? base."$short read"
                continue
            }
            args.dst.write_simple_token_fast!(
                    value_major: 0,
                    value_minor: (base.TOKEN__VBC__NUMBER << 21) |
                    (base.TOKEN__VBD__NUMBER__CONTENT_POS_INF >> negative),
                    continued: 0,
                    length: 4)
            args.src.skip_u32_fast!(actual: 4, worst_case: 4)
            return ok

        } else if (word | 0x20_2020) == 'nan'le {
            args.dst.write_simple_token_fast!(
                    value_major: 0,
                    value_minor: (base.TOKEN__VBC__NUMBER << 21) |
                    (base.TOKEN__VBD__NUMBER__CONTENT_POS_NAN >> negative),
                    continued: 0,
                    length: 4)
            args.src.skip_u32_fast!(actual: 4, worst_case: 4)
            return ok
        }

        return "#bad input"
    } endwhile
}
1679
// decode_trailer consumes what follows the top-level JSON value: whitespace
// and, when trailer_stop is zero, any comments that decode_comment accepts.
// Each run of whitespace is emitted as tokens with a zero value_major and a
// zero value_minor.
//
// With QUIRK_EXPECT_TRAILING_NEW_LINE_OR_EOF enabled, this.trailer_stop is
// '\n': decoding finishes right after the first new line byte (or at the end
// of a closed src) and any non-whitespace byte before that is "#bad input".
// Otherwise this.trailer_stop is 0 and decoding finishes at the end of a
// closed src or at the first byte that is neither whitespace nor the start of
// a comment.
pri func decoder.decode_trailer?(dst: base.token_writer, src: base.io_reader) {
    var b      : base.u8
    var ws_len : base.u32[..= 0xFFFE]

    this.trailer_stop = 0
    if this.quirks[QUIRK_EXPECT_TRAILING_NEW_LINE_OR_EOF - QUIRKS_BASE] {
        this.trailer_stop = '\n'
    }

    while.outer true {
        if args.dst.length() <= 0 {
            yield? base."$short write"
            continue.outer
        }

        // ws_len counts the whitespace bytes consumed since the last token.
        ws_len = 0
        while.inner true,
                pre args.dst.length() > 0,
        {
            if args.src.length() <= 0 {
                // Flush pending whitespace before finishing or suspending.
                if ws_len > 0 {
                    args.dst.write_simple_token_fast!(
                            value_major: 0, value_minor: 0, continued: 0, length: ws_len)
                }
                if args.src.is_closed() {
                    break.outer
                }
                yield? base."$short read"
                continue.outer
            }

            b = args.src.peek_u8()
            if LUT_CLASSES[b] <> CLASS_WHITESPACE {
                // Flush pending whitespace, then deal with the other byte.
                if ws_len > 0 {
                    args.dst.write_simple_token_fast!(
                            value_major: 0, value_minor: 0, continued: 0, length: ws_len)
                }
                if this.trailer_stop > 0 {
                    return "#bad input"
                }
                // A comment, if there was one, may be followed by more
                // trailer. Anything else ends the trailer here.
                this.decode_comment?(dst: args.dst, src: args.src)
                if this.comment_type > 0 {
                    continue.outer
                }
                return ok
            }

            // A whitespace byte. Emit a token, including this byte, when
            // either ws_len is at its maximum or this is the stop byte.
            args.src.skip_u32_fast!(actual: 1, worst_case: 1)
            if (ws_len >= 0xFFFE) or (b == this.trailer_stop) {
                args.dst.write_simple_token_fast!(
                        value_major: 0, value_minor: 0, continued: 0, length: ws_len + 1)
                if b == this.trailer_stop {
                    return ok
                }
                continue.outer
            }
            ws_len += 1
        } endwhile.inner
    } endwhile.outer
}