// Copyright 2020 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// ----------------

/*
jsonptr is a JSON formatter (pretty-printer).

As of 2020-02-24, this program passes all 318 "test_parsing" cases from the
JSON test suite (https://github.com/nst/JSONTestSuite), an appendix to the
"Parsing JSON is a Minefield" article (http://seriot.ch/parsing_json.php) that
was first published on 2016-10-26 and updated on 2018-03-30.

This example program differs from most other example Wuffs programs in that it
is written in C++, not C.

    $CXX jsonptr.cc && ./a.out < ../../test/data/github-tags.json; rm -f a.out

for a C++ compiler $CXX, such as clang++ or g++.

After modifying this program, run "build-example.sh example/jsonptr/" and then
"script/run-json-test-suite.sh" to catch correctness regressions.
*/

36#include <inttypes.h>
37#include <stdio.h>
Nigel Tao9cc2c252020-02-23 17:05:49 +110038#include <string.h>
Nigel Tao1b073492020-02-16 22:11:36 +110039
40// Wuffs ships as a "single file C library" or "header file library" as per
41// https://github.com/nothings/stb/blob/master/docs/stb_howto.txt
42//
43// To use that single file as a "foo.c"-like implementation, instead of a
44// "foo.h"-like header, #define WUFFS_IMPLEMENTATION before #include'ing or
45// compiling it.
46#define WUFFS_IMPLEMENTATION
47
48// Defining the WUFFS_CONFIG__MODULE* macros are optional, but it lets users of
49// release/c/etc.c whitelist which parts of Wuffs to build. That file contains
50// the entire Wuffs standard library, implementing a variety of codecs and file
51// formats. Without this macro definition, an optimizing compiler or linker may
52// very well discard Wuffs code for unused codecs, but listing the Wuffs
53// modules we use makes that process explicit. Preprocessing means that such
54// code simply isn't compiled.
55#define WUFFS_CONFIG__MODULES
56#define WUFFS_CONFIG__MODULE__BASE
57#define WUFFS_CONFIG__MODULE__JSON
58
59// If building this program in an environment that doesn't easily accommodate
60// relative includes, you can use the script/inline-c-relative-includes.go
61// program to generate a stand-alone C++ file.
62#include "../../release/c/wuffs-unsupported-snapshot.c"
63
// TRY evaluates error_msg (an expression yielding a const char*) exactly once
// and, if the result is non-null, returns it from the enclosing function. A
// null result means "no error" and execution continues after the macro.
//
// It can only be used inside functions whose return type accepts a
// const char*.
//
// The temporary has a deliberately unusual name, and the argument is
// parenthesized, so that an argument expression that itself mentions a
// caller's variable (e.g. one called "z") is not silently captured by the
// macro's own local.
#define TRY(error_msg)                        \
  do {                                        \
    const char* try_error_msg_ = (error_msg); \
    if (try_error_msg_) {                     \
      return try_error_msg_;                  \
    }                                         \
  } while (false)

// eod ("end of data") is a sentinel message. handle_token returns it once a
// complete top-level value has been written, and main1 recognizes it by
// pointer identity (not by string comparison) to stop processing tokens.
static const char* eod = "main: end of data";

// ----

// MAX_INDENT is the length of INDENT_STRING, and therefore the largest value
// that indent may take.
#define MAX_INDENT 8

// INDENT_STRING supplies the bytes for one level of indentation. Callers emit
// its first `indent` bytes per nesting level, so it must hold at least
// MAX_INDENT characters or those writes would read past the literal.
#define INDENT_STRING "        "

static_assert((sizeof(INDENT_STRING) - 1) == MAX_INDENT,
              "INDENT_STRING must be exactly MAX_INDENT characters long");

// indent is the number of INDENT_STRING bytes written per nesting level. It
// must not exceed MAX_INDENT.
size_t indent;

// Default capacities of the three work buffers. Each one can be overridden at
// build time (e.g. -DDST_BUFFER_SIZE=...). The dst and src sizes count bytes;
// the token size counts wuffs_base__token elements.
#if !defined(DST_BUFFER_SIZE)
#define DST_BUFFER_SIZE (32 * 1024)
#endif

#if !defined(SRC_BUFFER_SIZE)
#define SRC_BUFFER_SIZE (32 * 1024)
#endif

#if !defined(TOKEN_BUFFER_SIZE)
#define TOKEN_BUFFER_SIZE (4 * 1024)
#endif

Nigel Tao2cf76db2020-02-27 22:42:01 +110090uint8_t dst_array[DST_BUFFER_SIZE];
91uint8_t src_array[SRC_BUFFER_SIZE];
92wuffs_base__token tok_array[TOKEN_BUFFER_SIZE];
Nigel Tao1b073492020-02-16 22:11:36 +110093
94wuffs_base__io_buffer dst;
95wuffs_base__io_buffer src;
96wuffs_base__token_buffer tok;
97
// curr_token_end_src_index is the index, into src.data.ptr, just past the end
// of the token currently being handled. It never exceeds src.meta.ri, i.e. the
// invariant (curr_token_end_src_index <= src.meta.ri) always holds.
size_t curr_token_end_src_index;

// prev_token_incomplete records whether the most recent string token carried
// the INCOMPLETE flag, meaning that the string continues in the next token and
// neither separators nor quotation marks should be written in between.
bool prev_token_incomplete;

// depth is the number of containers ('[' or '{') that are currently open.
uint64_t depth;

// context describes where the formatter is, relative to the innermost open
// container, which in turn decides what separator precedes the next token.
enum class context {
  // Not inside any container (or nothing has been written yet).
  none,
  // Inside a list, immediately after its '['.
  in_list_after_bracket,
  // Inside a list, after one of its elements.
  in_list_after_value,
  // Inside a dict, immediately after its '{'.
  in_dict_after_brace,
  // Inside a dict, after a key (a ": " comes next).
  in_dict_after_key,
  // Inside a dict, after a value.
  in_dict_after_value,
};

// ctx is the formatter's current context.
context ctx;

Nigel Tao1b073492020-02-16 22:11:36 +1100115wuffs_json__decoder dec;
Nigel Tao1b073492020-02-16 22:11:36 +1100116
Nigel Tao2cf76db2020-02-27 22:42:01 +1100117const char* //
118initialize_globals(int argc, char** argv) {
119 indent = 4;
Nigel Tao1b073492020-02-16 22:11:36 +1100120
Nigel Tao2cf76db2020-02-27 22:42:01 +1100121 dst = wuffs_base__make_io_buffer(
122 wuffs_base__make_slice_u8(dst_array, DST_BUFFER_SIZE),
123 wuffs_base__empty_io_buffer_meta());
Nigel Tao1b073492020-02-16 22:11:36 +1100124
Nigel Tao2cf76db2020-02-27 22:42:01 +1100125 src = wuffs_base__make_io_buffer(
126 wuffs_base__make_slice_u8(src_array, SRC_BUFFER_SIZE),
127 wuffs_base__empty_io_buffer_meta());
128
129 tok = wuffs_base__make_token_buffer(
130 wuffs_base__make_slice_token(tok_array, TOKEN_BUFFER_SIZE),
131 wuffs_base__empty_token_buffer_meta());
132
133 curr_token_end_src_index = 0;
134
135 prev_token_incomplete = false;
136
137 depth = 0;
138
139 ctx = context::none;
140
141 return dec.initialize(sizeof__wuffs_json__decoder(), WUFFS_VERSION, 0)
142 .message();
143}
Nigel Tao1b073492020-02-16 22:11:36 +1100144
145// ----
146
Nigel Tao2914bae2020-02-26 09:40:30 +1100147const char* //
148read_src() {
Nigel Taoa8406922020-02-19 12:22:00 +1100149 if (src.meta.closed) {
Nigel Tao9cc2c252020-02-23 17:05:49 +1100150 return "main: internal error: read requested on a closed source";
Nigel Taoa8406922020-02-19 12:22:00 +1100151 }
Nigel Tao1b073492020-02-16 22:11:36 +1100152 src.compact();
153 if (src.meta.wi >= src.data.len) {
154 return "main: src buffer is full";
155 }
156 size_t n = fread(src.data.ptr + src.meta.wi, sizeof(uint8_t),
157 src.data.len - src.meta.wi, stdin);
158 src.meta.wi += n;
Nigel Tao67306562020-02-19 14:04:49 +1100159 src.meta.closed = feof(stdin);
160 if ((n == 0) && !src.meta.closed) {
Nigel Taoa8406922020-02-19 12:22:00 +1100161 return "main: read error";
Nigel Tao1b073492020-02-16 22:11:36 +1100162 }
163 return nullptr;
164}
165
Nigel Tao2914bae2020-02-26 09:40:30 +1100166const char* //
167flush_dst() {
Nigel Tao1b073492020-02-16 22:11:36 +1100168 size_t n = dst.meta.wi - dst.meta.ri;
169 if (n > 0) {
170 size_t i = fwrite(dst.data.ptr + dst.meta.ri, sizeof(uint8_t), n, stdout);
171 dst.meta.ri += i;
172 if (i != n) {
173 return "main: write error";
174 }
175 dst.compact();
176 }
177 return nullptr;
178}
179
Nigel Tao2914bae2020-02-26 09:40:30 +1100180const char* //
181write_dst(const void* s, size_t n) {
Nigel Tao1b073492020-02-16 22:11:36 +1100182 const uint8_t* p = static_cast<const uint8_t*>(s);
183 while (n > 0) {
184 size_t i = dst.writer_available();
185 if (i == 0) {
186 const char* z = flush_dst();
187 if (z) {
188 return z;
189 }
190 i = dst.writer_available();
191 if (i == 0) {
192 return "main: dst buffer is full";
193 }
194 }
195
196 if (i > n) {
197 i = n;
198 }
199 memcpy(dst.data.ptr + dst.meta.wi, p, i);
200 dst.meta.wi += i;
201 p += i;
202 n -= i;
203 }
204 return nullptr;
205}
206
// ----

// hex_digit returns the upper-case ASCII hexadecimal digit ('0'-'9' or
// 'A'-'F') for the low 4 bits of nibble. The high 4 bits are ignored.
uint8_t  //
hex_digit(uint8_t nibble) {
  static const char digits[] = "0123456789ABCDEF";
  return (uint8_t)(digits[nibble & 0x0F]);
}

Nigel Tao2914bae2020-02-26 09:40:30 +1100218const char* //
Nigel Tao3b486982020-02-27 15:05:59 +1100219handle_unicode_code_point(uint32_t ucp) {
220 if (ucp < 0x0020) {
221 switch (ucp) {
222 case '\b':
223 return write_dst("\\b", 2);
224 case '\f':
225 return write_dst("\\f", 2);
226 case '\n':
227 return write_dst("\\n", 2);
228 case '\r':
229 return write_dst("\\r", 2);
230 case '\t':
231 return write_dst("\\t", 2);
232 default: {
233 // Other bytes less than 0x0020 are valid UTF-8 but not valid in a
234 // JSON string. They need to remain escaped.
235 uint8_t esc6[6];
236 esc6[0] = '\\';
237 esc6[1] = 'u';
238 esc6[2] = '0';
239 esc6[3] = '0';
240 esc6[4] = hex_digit(ucp >> 4);
241 esc6[5] = hex_digit(ucp >> 0);
242 return write_dst(&esc6[0], 6);
243 }
244 }
245
246 } else if (ucp <= 0x007F) {
247 switch (ucp) {
248 case '\"':
249 return write_dst("\\\"", 2);
250 case '\\':
251 return write_dst("\\\\", 2);
252 default: {
253 // The UTF-8 encoding takes 1 byte.
254 uint8_t esc0 = (uint8_t)(ucp);
255 return write_dst(&esc0, 1);
256 }
257 }
258
259 } else if (ucp <= 0x07FF) {
260 // The UTF-8 encoding takes 2 bytes.
261 uint8_t esc2[2];
262 esc2[0] = 0xC0 | (uint8_t)((ucp >> 6));
263 esc2[1] = 0x80 | (uint8_t)((ucp >> 0) & 0x3F);
264 return write_dst(&esc2[0], 2);
265
266 } else if (ucp <= 0xFFFF) {
267 if ((0xD800 <= ucp) && (ucp <= 0xDFFF)) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100268 return "main: internal error: unexpected Unicode surrogate";
Nigel Tao3b486982020-02-27 15:05:59 +1100269 }
270 // The UTF-8 encoding takes 3 bytes.
271 uint8_t esc3[3];
272 esc3[0] = 0xE0 | (uint8_t)((ucp >> 12));
273 esc3[1] = 0x80 | (uint8_t)((ucp >> 6) & 0x3F);
274 esc3[2] = 0x80 | (uint8_t)((ucp >> 0) & 0x3F);
275 return write_dst(&esc3[0], 3);
276
277 } else if (ucp <= 0x10FFFF) {
278 // The UTF-8 encoding takes 4 bytes.
279 uint8_t esc4[4];
280 esc4[0] = 0xF0 | (uint8_t)((ucp >> 18));
281 esc4[1] = 0x80 | (uint8_t)((ucp >> 12) & 0x3F);
282 esc4[2] = 0x80 | (uint8_t)((ucp >> 6) & 0x3F);
283 esc4[3] = 0x80 | (uint8_t)((ucp >> 0) & 0x3F);
284 return write_dst(&esc4[0], 4);
285 }
286
Nigel Tao2cf76db2020-02-27 22:42:01 +1100287 return "main: internal error: unexpected Unicode code point";
Nigel Tao3b486982020-02-27 15:05:59 +1100288}
289
290const char* //
Nigel Tao2cf76db2020-02-27 22:42:01 +1100291handle_token(wuffs_base__token t) {
292 do {
293 uint64_t vbc = t.value_base_category();
294 uint64_t vbd = t.value_base_detail();
295 uint64_t len = t.length();
Nigel Tao1b073492020-02-16 22:11:36 +1100296
297 // Handle ']' or '}'.
Nigel Tao9f7a2502020-02-23 09:42:02 +1100298 if ((vbc == WUFFS_BASE__TOKEN__VBC__STRUCTURE) &&
Nigel Tao2cf76db2020-02-27 22:42:01 +1100299 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__POP)) {
Nigel Tao1b073492020-02-16 22:11:36 +1100300 if (depth <= 0) {
301 return "main: internal error: inconsistent depth";
302 }
303 depth--;
304
305 // Write preceding whitespace.
306 if ((ctx != context::in_list_after_bracket) &&
307 (ctx != context::in_dict_after_brace)) {
308 TRY(write_dst("\n", 1));
309 for (size_t i = 0; i < depth; i++) {
310 TRY(write_dst(INDENT_STRING, indent));
311 }
312 }
313
Nigel Tao9f7a2502020-02-23 09:42:02 +1100314 TRY(write_dst(
315 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_LIST) ? "]" : "}", 1));
316 ctx = (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST)
317 ? context::in_list_after_value
318 : context::in_dict_after_key;
Nigel Tao1b073492020-02-16 22:11:36 +1100319 goto after_value;
320 }
321
322 // Write preceding whitespace and punctuation, if it wasn't ']' or '}'.
Nigel Tao2cf76db2020-02-27 22:42:01 +1100323 if (!prev_token_incomplete) {
324 if (ctx == context::in_dict_after_key) {
325 TRY(write_dst(": ", 2));
326 } else if (ctx != context::none) {
327 if ((ctx != context::in_list_after_bracket) &&
328 (ctx != context::in_dict_after_brace)) {
329 TRY(write_dst(",", 1));
330 }
331 TRY(write_dst("\n", 1));
332 for (size_t i = 0; i < depth; i++) {
333 TRY(write_dst(INDENT_STRING, indent));
334 }
Nigel Tao1b073492020-02-16 22:11:36 +1100335 }
336 }
337
338 // Handle the token itself: either a container ('[' or '{') or a simple
339 // value (number, string or literal).
340 switch (vbc) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100341 case WUFFS_BASE__TOKEN__VBC__STRING:
342 if (!prev_token_incomplete) {
343 TRY(write_dst("\"", 1));
344 }
Nigel Taocb37a562020-02-28 09:56:24 +1100345
346 if (vbd & WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_0_DST_1_SRC_DROP) {
347 // No-op.
348 } else if (vbd &
349 WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY) {
350 TRY(write_dst(src.data.ptr + curr_token_end_src_index - len, len));
351 } else {
352 return "main: internal error: unexpected string-token conversion";
353 }
354
Nigel Tao2cf76db2020-02-27 22:42:01 +1100355 prev_token_incomplete =
356 vbd & WUFFS_BASE__TOKEN__VBD__STRING__INCOMPLETE;
357 if (prev_token_incomplete) {
358 return nullptr;
359 }
360 TRY(write_dst("\"", 1));
361 goto after_value;
362
363 case WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT:
364 return handle_unicode_code_point(vbd);
365
366 case WUFFS_BASE__TOKEN__VBC__NUMBER:
367 TRY(write_dst(src.data.ptr + curr_token_end_src_index - len, len));
368 goto after_value;
369
Nigel Tao9f7a2502020-02-23 09:42:02 +1100370 case WUFFS_BASE__TOKEN__VBC__STRUCTURE:
371 TRY(write_dst(
372 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST) ? "[" : "{", 1));
Nigel Tao1b073492020-02-16 22:11:36 +1100373 depth++;
Nigel Tao9f7a2502020-02-23 09:42:02 +1100374 ctx = (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST)
375 ? context::in_list_after_bracket
376 : context::in_dict_after_brace;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100377 return nullptr;
Nigel Tao1b073492020-02-16 22:11:36 +1100378 }
379
380 // Return an error if we didn't match the (vbc, vbd) pair.
Nigel Tao2cf76db2020-02-27 22:42:01 +1100381 return "main: internal error: unexpected token";
382 } while (0);
Nigel Tao1b073492020-02-16 22:11:36 +1100383
Nigel Tao2cf76db2020-02-27 22:42:01 +1100384 // Book-keeping after completing a value (whether a container value or a
385 // simple value). Empty parent containers are no longer empty. If the parent
386 // container is a "{...}" object, toggle between keys and values.
387after_value:
388 if (depth == 0) {
389 return eod;
390 }
391 switch (ctx) {
392 case context::in_list_after_bracket:
393 ctx = context::in_list_after_value;
394 break;
395 case context::in_dict_after_brace:
396 ctx = context::in_dict_after_key;
397 break;
398 case context::in_dict_after_key:
399 ctx = context::in_dict_after_value;
400 break;
401 case context::in_dict_after_value:
402 ctx = context::in_dict_after_key;
403 break;
404 }
405 return nullptr;
406}
407
408const char* //
409main1(int argc, char** argv) {
410 TRY(initialize_globals(argc, argv));
411
412 while (true) {
413 wuffs_base__status status = dec.decode_tokens(&tok, &src);
414
415 while (tok.meta.ri < tok.meta.wi) {
416 wuffs_base__token t = tok.data.ptr[tok.meta.ri++];
417 uint64_t n = t.length();
418 if ((src.meta.ri - curr_token_end_src_index) < n) {
419 return "main: internal error: inconsistent src indexes";
420 }
421 curr_token_end_src_index += n;
422
423 if (t.value() == 0) {
424 continue;
425 }
426
427 const char* z = handle_token(t);
428 if (z == nullptr) {
429 continue;
430 } else if (z == eod) {
431 break;
432 }
433 return z;
Nigel Tao1b073492020-02-16 22:11:36 +1100434 }
Nigel Tao2cf76db2020-02-27 22:42:01 +1100435
436 if (status.repr == nullptr) {
437 break;
438 } else if (status.repr == wuffs_base__suspension__short_read) {
439 if (curr_token_end_src_index != src.meta.ri) {
440 return "main: internal error: inconsistent src indexes";
441 }
442 TRY(read_src());
443 curr_token_end_src_index = src.meta.ri;
444 } else if (status.repr == wuffs_base__suspension__short_write) {
445 tok.compact();
446 } else {
447 return status.message();
Nigel Tao1b073492020-02-16 22:11:36 +1100448 }
449 }
Nigel Tao6b161af2020-02-24 11:01:48 +1100450
Nigel Tao6b161af2020-02-24 11:01:48 +1100451 // Consume an optional whitespace trailer. This isn't part of the JSON spec,
452 // but it works better with line oriented Unix tools (such as "echo 123 |
453 // jsonptr" where it's "echo", not "echo -n") or hand-edited JSON files which
454 // can accidentally contain trailing whitespace.
455 //
456 // A whitespace trailer is zero or more ' ' and then zero or one '\n'.
457 while (true) {
458 if (src.meta.ri < src.meta.wi) {
459 uint8_t c = src.data.ptr[src.meta.ri];
460 if (c == ' ') {
461 src.meta.ri++;
462 continue;
463 } else if (c == '\n') {
464 src.meta.ri++;
465 break;
466 }
467 // The "exhausted the input" check below will fail.
468 break;
469 } else if (src.meta.closed) {
470 break;
471 }
472 TRY(read_src());
473 }
474
475 // Check that we've exhausted the input.
476 if ((src.meta.ri < src.meta.wi) || !src.meta.closed) {
477 return "main: valid JSON followed by further (unexpected) data";
478 }
479
480 // Check that we've used all of the decoded tokens, other than trailing
481 // filler tokens. For example, a bare `"foo"` string is valid JSON, but even
482 // without a trailing '\n', the Wuffs JSON parser emits a filler token for
483 // the final '\"'.
484 for (; tok.meta.ri < tok.meta.wi; tok.meta.ri++) {
485 if (tok.data.ptr[tok.meta.ri].value_base_category() !=
486 WUFFS_BASE__TOKEN__VBC__FILLER) {
487 return "main: internal error: decoded OK but unprocessed tokens remain";
488 }
489 }
490
491 return nullptr;
Nigel Tao1b073492020-02-16 22:11:36 +1100492}
493
// compute_exit_code prints status_msg (if any) to stderr, followed by a
// newline, and converts it to a process exit code.
//
// A null status_msg means success: nothing is printed and 0 is returned.
//
// A message of 2047 bytes or more is not printed. It is replaced by a fixed
// "error message is too long" internal error.
//
// Return an exit code of 1 for regular (foreseen) errors, e.g. badly
// formatted or unsupported input.
//
// Return an exit code of 2 for internal (exceptional) errors, e.g. defensive
// run-time checks found that an internal invariant did not hold. A message
// is classified as internal if it contains the text "internal error:".
//
// Automated testing, including badly formatted inputs, can therefore
// discriminate between expected failure (exit code 1) and unexpected failure
// (other non-zero exit codes). Specifically, exit code 2 for internal
// invariant violation, exit code 139 (which is 128 + SIGSEGV on x86_64
// linux) for a segmentation fault (e.g. null pointer dereference).
int  //
compute_exit_code(const char* status_msg) {
  if (!status_msg) {
    return 0;
  }

  // Measure the message, but never look further than max_len bytes. A plain
  // loop keeps this independent of the POSIX-only strnlen function.
  const size_t max_len = 2047;
  size_t n = 0;
  while ((n < max_len) && (status_msg[n] != '\0')) {
    n++;
  }
  if (n >= max_len) {
    status_msg = "main: internal error: error message is too long";
  }

  fprintf(stderr, "%s\n", status_msg);
  return strstr(status_msg, "internal error:") ? 2 : 1;
}

Nigel Tao2914bae2020-02-26 09:40:30 +1100519int //
520main(int argc, char** argv) {
Nigel Tao1b073492020-02-16 22:11:36 +1100521 const char* z0 = main1(argc, argv);
Nigel Tao2cf76db2020-02-27 22:42:01 +1100522 const char* z1 = write_dst("\n", 1);
523 const char* z2 = flush_dst();
524 int exit_code = compute_exit_code(z0 ? z0 : (z1 ? z1 : z2));
Nigel Tao9cc2c252020-02-23 17:05:49 +1100525 return exit_code;
Nigel Tao1b073492020-02-16 22:11:36 +1100526}