blob: 8bae0fcbcce1b87e12cde032fb18adfdf21ed36c [file] [log] [blame]
Nigel Tao1b073492020-02-16 22:11:36 +11001// Copyright 2020 The Wuffs Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// ----------------
16
17/*
Nigel Tao0cd2f982020-03-03 23:03:02 +110018jsonptr is a JSON formatter (pretty-printer) that supports the JSON Pointer
19(RFC 6901) query syntax. It reads UTF-8 JSON from stdin and writes
20canonicalized, formatted UTF-8 JSON to stdout.
21
22See the "const char* usage" string below for details.
23
24----
25
26JSON Pointer (and this program's implementation) is one of many JSON query
27languages and JSON tools, such as jq, jql and JMESPath. This one is relatively
28simple and fewer-featured compared to those others.
29
30One benefit of simplicity is that this program's JSON and JSON Pointer
31implementations do not dynamically allocate or free memory (yet it does not
32require that the entire input fits in memory at once). They are therefore
33trivially protected against certain bug classes: memory leaks, double-frees and
34use-after-frees.
35
36The core JSON implementation is also written in the Wuffs programming language
37(and then transpiled to C/C++), which is memory-safe but also guards against
38integer arithmetic overflows.
39
40All together, this program aims to safely handle untrusted JSON files without
41fear of security bugs such as remote code execution.
42
43----
Nigel Tao1b073492020-02-16 22:11:36 +110044
Nigel Taoc5b3a9e2020-02-24 11:54:35 +110045As of 2020-02-24, this program passes all 318 "test_parsing" cases from the
46JSON test suite (https://github.com/nst/JSONTestSuite), an appendix to the
47"Parsing JSON is a Minefield" article (http://seriot.ch/parsing_json.php) that
48was first published on 2016-10-26 and updated on 2018-03-30.
49
Nigel Tao0cd2f982020-03-03 23:03:02 +110050After modifying this program, run "build-example.sh example/jsonptr/" and then
51"script/run-json-test-suite.sh" to catch correctness regressions.
52
53----
54
Nigel Tao1b073492020-02-16 22:11:36 +110055This example program differs from most other example Wuffs programs in that it
56is written in C++, not C.
57
58$CXX jsonptr.cc && ./a.out < ../../test/data/github-tags.json; rm -f a.out
59
60for a C++ compiler $CXX, such as clang++ or g++.
61*/
62
63#include <inttypes.h>
64#include <stdio.h>
Nigel Tao9cc2c252020-02-23 17:05:49 +110065#include <string.h>
Nigel Tao1b073492020-02-16 22:11:36 +110066
67// Wuffs ships as a "single file C library" or "header file library" as per
68// https://github.com/nothings/stb/blob/master/docs/stb_howto.txt
69//
70// To use that single file as a "foo.c"-like implementation, instead of a
71// "foo.h"-like header, #define WUFFS_IMPLEMENTATION before #include'ing or
72// compiling it.
73#define WUFFS_IMPLEMENTATION
74
// Defining the WUFFS_CONFIG__MODULE* macros is optional, but it lets users of
76// release/c/etc.c whitelist which parts of Wuffs to build. That file contains
77// the entire Wuffs standard library, implementing a variety of codecs and file
78// formats. Without this macro definition, an optimizing compiler or linker may
79// very well discard Wuffs code for unused codecs, but listing the Wuffs
80// modules we use makes that process explicit. Preprocessing means that such
81// code simply isn't compiled.
82#define WUFFS_CONFIG__MODULES
83#define WUFFS_CONFIG__MODULE__BASE
84#define WUFFS_CONFIG__MODULE__JSON
85
86// If building this program in an environment that doesn't easily accommodate
87// relative includes, you can use the script/inline-c-relative-includes.go
88// program to generate a stand-alone C++ file.
89#include "../../release/c/wuffs-unsupported-snapshot.c"
90
// TRY evaluates error_msg (an expression of type const char*) exactly once.
// If the result is non-null, TRY returns it from the enclosing function.
// Otherwise, execution continues with the next statement.
#define TRY(error_msg)                      \
  do {                                      \
    const char* try_error_msg = error_msg;  \
    if (try_error_msg) {                    \
      return try_error_msg;                 \
    }                                       \
  } while (0)

// eod is a sentinel status message. Callers recognize it by comparing
// pointers (not string contents), so it is not printed as an error.
static const char* eod = "main: end of data";
100
// usage is the help text. It is also the status message returned when the
// command line flags cannot be parsed.
static const char* usage =
    "Usage: jsonptr -flags < input.json\n\n"

    "Note the \"<\". It only reads from stdin, not named files.\n\n"

    "jsonptr is a JSON formatter (pretty-printer) that supports the JSON\n"
    "Pointer (RFC 6901) query syntax. It reads UTF-8 JSON from stdin and\n"
    "writes canonicalized, formatted UTF-8 JSON to stdout.\n\n"

    "Canonicalized means that e.g. \"abc\\u000A\\tx\\u0177z\" is re-written\n"
    "as \"abc\\n\\txŷz\". It does not sort object keys, nor does it reject\n"
    "duplicate keys.\n\n"

    "Formatted means that arrays' and objects' elements are indented, each\n"
    "on its own line. Configure this with the -compact, -indent=N (for N\n"
    "ranging from 0 to 8) and -tabs flags.\n\n"

    "The -query=etc flag gives an optional JSON Pointer query, to print only\n"
    "a subset of the input. For example, given RFC 6901 section 5's [sample\n"
    "JSON value](https://tools.ietf.org/rfc/rfc6901.txt), this command:\n"
    " jsonptr -query=/foo/1 < rfc-6901-json-pointer.json\n"
    "will print:\n"
    " \"baz\"\n\n"

    "An absent query is equivalent to the empty query, which identifies the\n"
    "entire input (the root value). The \"/\" query is not equivalent to the\n"
    "root value. Instead, it identifies the child (the key-value pair) of the\n"
    "root value whose key is the empty string.\n\n"

    "If the query found a valid JSON value, this program will return a zero\n"
    "exit code even if the rest of the input isn't valid JSON. If the query\n"
    "did not find a value, or found an invalid one, this program returns a\n"
    "non-zero exit code, but may still print partial output to stdout.\n\n"

    "The [JSON specification](https://json.org/) permits implementations that\n"
    "allow duplicate keys, as this one does. This JSON Pointer implementation\n"
    "is also greedy, following the first match for each fragment without\n"
    "back-tracking. For example, the \"/foo/bar\" query will fail if the root\n"
    "object has multiple \"foo\" children but the first one doesn't have a\n"
    "\"bar\" child, even if later ones do.";
141
Nigel Tao2cf76db2020-02-27 22:42:01 +1100142// ----
143
// MAX_INDENT is the largest indentation width (per nesting level) that the
// -indent=N flag accepts.
//
// The two INDENT_ETC_STRING macros are passed to write_dst together with a
// length of flags.indent, so each string must hold at least MAX_INDENT
// characters. A shorter string would make write_dst read past the end of the
// string literal.
#define MAX_INDENT 8
#define INDENT_SPACES_STRING "        "
#define INDENT_TABS_STRING "\t\t\t\t\t\t\t\t"

static_assert(sizeof(INDENT_SPACES_STRING) - 1 == MAX_INDENT,
              "INDENT_SPACES_STRING must contain exactly MAX_INDENT spaces");
static_assert(sizeof(INDENT_TABS_STRING) - 1 == MAX_INDENT,
              "INDENT_TABS_STRING must contain exactly MAX_INDENT tabs");
147
// Default capacities of the statically allocated work buffers. Each one can
// be overridden at compile time by pre-defining the macro. The dst and src
// sizes count bytes. The token size counts wuffs_base__token elements.
#if !defined(DST_BUFFER_SIZE)
#define DST_BUFFER_SIZE (32 * 1024)
#endif

#if !defined(SRC_BUFFER_SIZE)
#define SRC_BUFFER_SIZE (32 * 1024)
#endif

#if !defined(TOKEN_BUFFER_SIZE)
#define TOKEN_BUFFER_SIZE (4 * 1024)
#endif
157
Nigel Tao2cf76db2020-02-27 22:42:01 +1100158uint8_t dst_array[DST_BUFFER_SIZE];
159uint8_t src_array[SRC_BUFFER_SIZE];
160wuffs_base__token tok_array[TOKEN_BUFFER_SIZE];
Nigel Tao1b073492020-02-16 22:11:36 +1100161
162wuffs_base__io_buffer dst;
163wuffs_base__io_buffer src;
164wuffs_base__token_buffer tok;
165
Nigel Tao2cf76db2020-02-27 22:42:01 +1100166// curr_token_end_src_index is the src.data.ptr index of the end of the current
167// token. An invariant is that (curr_token_end_src_index <= src.meta.ri).
168size_t curr_token_end_src_index;
169
Nigel Tao0cd2f982020-03-03 23:03:02 +1100170uint32_t depth;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100171
// context records what was most recently handled, relative to the innermost
// container. It determines the punctuation (and whitespace) to write before
// the next value.
enum class context {
  // At the top level: not inside any container.
  none,
  // Inside a "[...]" list, just after the '['.
  in_list_after_bracket,
  // Inside a "[...]" list, just after an element.
  in_list_after_value,
  // Inside a "{...}" dict, just after the '{'.
  in_dict_after_brace,
  // Inside a "{...}" dict, just after a key.
  in_dict_after_key,
  // Inside a "{...}" dict, just after a value.
  in_dict_after_value,
} ctx;
180
Nigel Tao0cd2f982020-03-03 23:03:02 +1100181bool //
182in_dict_before_key() {
183 return (ctx == context::in_dict_after_brace) ||
184 (ctx == context::in_dict_after_value);
185}
186
187bool suppress_write_dst;
188bool wrote_to_dst;
189
Nigel Tao1b073492020-02-16 22:11:36 +1100190wuffs_json__decoder dec;
Nigel Tao1b073492020-02-16 22:11:36 +1100191
Nigel Tao0cd2f982020-03-03 23:03:02 +1100192// ----
193
194// Query is a JSON Pointer query. After initializing with a NUL-terminated C
195// string, its multiple fragments are consumed as the program walks the JSON
196// data from stdin. For example, letting "$" denote a NUL, suppose that we
197// started with a query string of "/apple/banana/12/durian" and are currently
198// trying to match the second fragment, "banana", so that Query::depth is 2:
199//
200// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
201// / a p p l e / b a n a n a / 1 2 / d u r i a n $
202// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
203// ^ ^
204// frag_i frag_k
205//
206// The two pointers frag_i and frag_k are the start (inclusive) and end
207// (exclusive) of the fragment. They satisfy (frag_i <= frag_k) and may be
// equal if the fragment is empty (note that "" is a valid JSON object key).
209//
210// The frag_j pointer moves between these two, or is nullptr. An invariant is
211// that (((frag_i <= frag_j) && (frag_j <= frag_k)) || (frag_j == nullptr)).
212//
213// Wuffs' JSON tokenizer can portray a single JSON string as multiple Wuffs
214// tokens, as backslash-escaped values within that JSON string may each get
215// their own token.
216//
217// At the start of each object key (a JSON string), frag_j is set to frag_i.
218//
219// While frag_j remains non-nullptr, each token's unescaped contents are then
220// compared to that part of the fragment from frag_j to frag_k. If it is a
221// prefix (including the case of an exact match), then frag_j is advanced by
222// the unescaped length. Otherwise, frag_j is set to nullptr.
223//
224// Comparison accounts for JSON Pointer's escaping notation: "~0" and "~1" in
225// the query (not the JSON value) are unescaped to "~" and "/" respectively.
226//
227// The frag_j pointer therefore advances from frag_i to frag_k, or drops out,
228// as we incrementally match the object key with the query fragment. For
229// example, if we have already matched the "ban" of "banana", then we would
230// accept any of an "ana" token, an "a" token or a "\u0061" token, amongst
231// others. They would advance frag_j by 3, 1 or 1 bytes respectively.
232//
233// frag_j
234// v
235// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
236// / a p p l e / b a n a n a / 1 2 / d u r i a n $
237// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
238// ^ ^
239// frag_i frag_k
240//
241// At the end of each object key (or equivalently, at the start of each object
242// value), if frag_j is non-nullptr and equal to (but not less than) frag_k
243// then we have a fragment match: the query fragment equals the object key. If
244// there is a next fragment (in this example, "12") we move the frag_etc
245// pointers to its start and end and increment Query::depth. Otherwise, we have
246// matched the complete query, and the upcoming JSON value is the result of
247// that query.
248//
249// The discussion above centers on object keys. If the query fragment is
250// numeric then it can also match as an array index: the string fragment "12"
251// will match an array's 13th element (starting counting from zero). See RFC
252// 6901 for its precise definition of an "array index" number.
253//
254// Array index fragment match is represented by the Query::array_index field,
255// whose type (wuffs_base__result_u64) is a result type. An error result means
256// that the fragment is not an array index. A value result holds the number of
257// list elements remaining. When matching a query fragment in an array (instead
258// of in an object), each element ticks this number down towards zero. At zero,
259// the upcoming JSON value is the one that matches the query fragment.
260class Query {
261 private:
262 uint8_t* frag_i;
263 uint8_t* frag_j;
264 uint8_t* frag_k;
265
266 uint32_t depth;
267
268 wuffs_base__result_u64 array_index;
269
270 public:
271 void reset(char* query_c_string) {
272 this->frag_i = (uint8_t*)query_c_string;
273 this->frag_j = (uint8_t*)query_c_string;
274 this->frag_k = (uint8_t*)query_c_string;
275 this->depth = 0;
276 this->array_index.status.repr = "#main: not an array index query fragment";
277 this->array_index.value = 0;
278 }
279
280 void restart_fragment(bool enable) {
281 this->frag_j = enable ? this->frag_i : nullptr;
282 }
283
284 bool is_at(uint32_t depth) { return this->depth == depth; }
285
286 // tick returns whether the fragment is a valid array index whose value is
287 // zero. If valid but non-zero, it decrements it and returns false.
288 bool tick() {
289 if (this->array_index.status.is_ok()) {
290 if (this->array_index.value == 0) {
291 return true;
292 }
293 this->array_index.value--;
294 }
295 return false;
296 }
297
298 // next_fragment moves to the next fragment, returning whether it existed.
299 bool next_fragment() {
300 uint8_t* k = this->frag_k;
301 uint32_t d = this->depth;
302
303 this->reset(nullptr);
304
305 if (!k || (*k != '/')) {
306 return false;
307 }
308 k++;
309
310 bool all_digits = true;
311 uint8_t* i = k;
312 while ((*k != '\x00') && (*k != '/')) {
313 all_digits = all_digits && ('0' <= *k) && (*k <= '9');
314 k++;
315 }
316 this->frag_i = i;
317 this->frag_j = i;
318 this->frag_k = k;
319 this->depth = d + 1;
320 if (all_digits) {
321 // wuffs_base__parse_number_u64 rejects leading zeroes, e.g. "00", "07".
322 this->array_index =
323 wuffs_base__parse_number_u64(wuffs_base__make_slice_u8(i, k - i));
324 }
325 return true;
326 }
327
328 bool matched() { return this->frag_j && (this->frag_j == this->frag_k); }
329
330 void incremental_match_slice(uint8_t* ptr, size_t len) {
331 if (!this->frag_j) {
332 return;
333 }
334 uint8_t* j = this->frag_j;
335 while (true) {
336 if (len == 0) {
337 this->frag_j = j;
338 return;
339 }
340
341 if (*j == '\x00') {
342 break;
343
344 } else if (*j == '~') {
345 j++;
346 if (*j == '0') {
347 if (*ptr != '~') {
348 break;
349 }
350 } else if (*j == '1') {
351 if (*ptr != '/') {
352 break;
353 }
354 } else {
355 break;
356 }
357
358 } else if (*j != *ptr) {
359 break;
360 }
361
362 j++;
363 ptr++;
364 len--;
365 }
366 this->frag_j = nullptr;
367 }
368
369 void incremental_match_code_point(uint32_t code_point) {
370 if (!this->frag_j) {
371 return;
372 }
373 uint8_t u[WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL];
374 size_t n = wuffs_base__utf_8__encode(
375 wuffs_base__make_slice_u8(&u[0],
376 WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL),
377 code_point);
378 if (n > 0) {
379 this->incremental_match_slice(&u[0], n);
380 }
381 }
382
383 // validate returns whether the (ptr, len) arguments form a valid JSON
384 // Pointer. In particular, it must be valid UTF-8, and either be empty or
385 // start with a '/'. Any '~' within must immediately be followed by either
386 // '0' or '1'.
387 static bool validate(char* query_c_string, size_t length) {
388 if (length <= 0) {
389 return true;
390 }
391 if (query_c_string[0] != '/') {
392 return false;
393 }
394 wuffs_base__slice_u8 s =
395 wuffs_base__make_slice_u8((uint8_t*)query_c_string, length);
396 bool previous_was_tilde = false;
397 while (s.len > 0) {
398 wuffs_base__utf_8__next__output o = wuffs_base__utf_8__next(s);
399 if (!o.is_valid()) {
400 return false;
401 }
402 if (previous_was_tilde && (o.code_point != '0') &&
403 (o.code_point != '1')) {
404 return false;
405 }
406 previous_was_tilde = o.code_point == '~';
407 s.ptr += o.byte_length;
408 s.len -= o.byte_length;
409 }
410 return !previous_was_tilde;
411 }
412} query;
413
414// ----
415
// flags holds the command line flag values, as populated by parse_flags.
struct {
  // The command line arguments that parse_flags did not consume as flags.
  int remaining_argc;
  char** remaining_argv;

  // Set by -c or -compact.
  bool compact;
  // Set by -i=N or -indent=N. When absent, parse_flags picks a default.
  size_t indent;
  // Set by -q=STR or -query=STR. It points into argv, or is null when absent.
  char* query_c_string;
  // Set by -t or -tabs.
  bool tabs;
} flags = {0};
425
426const char* //
427parse_flags(int argc, char** argv) {
428 bool explicit_indent = false;
429
430 int c = (argc > 0) ? 1 : 0; // Skip argv[0], the program name.
431 for (; c < argc; c++) {
432 char* arg = argv[c];
433 if (*arg++ != '-') {
434 break;
435 }
436
437 // A double-dash "--foo" is equivalent to a single-dash "-foo". As special
438 // cases, a bare "-" is not a flag (some programs may interpret it as
439 // stdin) and a bare "--" means to stop parsing flags.
440 if (*arg == '\x00') {
441 break;
442 } else if (*arg == '-') {
443 arg++;
444 if (*arg == '\x00') {
445 c++;
446 break;
447 }
448 }
449
450 if (!strcmp(arg, "c") || !strcmp(arg, "compact")) {
451 flags.compact = true;
452 continue;
453 }
454 if (!strncmp(arg, "i=", 2) || !strncmp(arg, "indent=", 7)) {
455 while (*arg++ != '=') {
456 }
457 if (('0' <= arg[0]) && (arg[0] <= '8') && (arg[1] == '\x00')) {
458 flags.indent = arg[0] - '0';
459 explicit_indent = true;
460 continue;
461 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100462 return usage;
463 }
464 if (!strncmp(arg, "q=", 2) || !strncmp(arg, "query=", 6)) {
465 while (*arg++ != '=') {
466 }
467 if (Query::validate(arg, strlen(arg))) {
468 flags.query_c_string = arg;
469 continue;
470 }
471 return usage;
Nigel Tao68920952020-03-03 11:25:18 +1100472 }
473 if (!strcmp(arg, "t") || !strcmp(arg, "tabs")) {
474 flags.tabs = true;
475 continue;
476 }
477
Nigel Tao0cd2f982020-03-03 23:03:02 +1100478 return usage;
Nigel Tao68920952020-03-03 11:25:18 +1100479 }
480
481 flags.remaining_argc = argc - c;
482 flags.remaining_argv = argv + c;
483 if (!explicit_indent) {
484 flags.indent = flags.tabs ? 1 : 4;
485 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100486 return nullptr;
Nigel Tao68920952020-03-03 11:25:18 +1100487}
488
Nigel Tao2cf76db2020-02-27 22:42:01 +1100489const char* //
490initialize_globals(int argc, char** argv) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100491 dst = wuffs_base__make_io_buffer(
492 wuffs_base__make_slice_u8(dst_array, DST_BUFFER_SIZE),
493 wuffs_base__empty_io_buffer_meta());
Nigel Tao1b073492020-02-16 22:11:36 +1100494
Nigel Tao2cf76db2020-02-27 22:42:01 +1100495 src = wuffs_base__make_io_buffer(
496 wuffs_base__make_slice_u8(src_array, SRC_BUFFER_SIZE),
497 wuffs_base__empty_io_buffer_meta());
498
499 tok = wuffs_base__make_token_buffer(
500 wuffs_base__make_slice_token(tok_array, TOKEN_BUFFER_SIZE),
501 wuffs_base__empty_token_buffer_meta());
502
503 curr_token_end_src_index = 0;
504
Nigel Tao2cf76db2020-02-27 22:42:01 +1100505 depth = 0;
506
507 ctx = context::none;
508
Nigel Tao68920952020-03-03 11:25:18 +1100509 TRY(parse_flags(argc, argv));
510 if (flags.remaining_argc > 0) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100511 return usage;
Nigel Tao107f0ef2020-03-01 21:35:02 +1100512 }
513
Nigel Tao0cd2f982020-03-03 23:03:02 +1100514 query.reset(flags.query_c_string);
515
516 // If the query is non-empty, suprress writing to stdout until we've
517 // completed the query.
518 suppress_write_dst = query.next_fragment();
519 wrote_to_dst = false;
520
Nigel Tao2cf76db2020-02-27 22:42:01 +1100521 return dec.initialize(sizeof__wuffs_json__decoder(), WUFFS_VERSION, 0)
522 .message();
523}
Nigel Tao1b073492020-02-16 22:11:36 +1100524
525// ----
526
Nigel Tao2914bae2020-02-26 09:40:30 +1100527const char* //
528read_src() {
Nigel Taoa8406922020-02-19 12:22:00 +1100529 if (src.meta.closed) {
Nigel Tao9cc2c252020-02-23 17:05:49 +1100530 return "main: internal error: read requested on a closed source";
Nigel Taoa8406922020-02-19 12:22:00 +1100531 }
Nigel Tao1b073492020-02-16 22:11:36 +1100532 src.compact();
533 if (src.meta.wi >= src.data.len) {
534 return "main: src buffer is full";
535 }
536 size_t n = fread(src.data.ptr + src.meta.wi, sizeof(uint8_t),
537 src.data.len - src.meta.wi, stdin);
538 src.meta.wi += n;
Nigel Tao67306562020-02-19 14:04:49 +1100539 src.meta.closed = feof(stdin);
540 if ((n == 0) && !src.meta.closed) {
Nigel Taoa8406922020-02-19 12:22:00 +1100541 return "main: read error";
Nigel Tao1b073492020-02-16 22:11:36 +1100542 }
543 return nullptr;
544}
545
Nigel Tao2914bae2020-02-26 09:40:30 +1100546const char* //
547flush_dst() {
Nigel Tao1b073492020-02-16 22:11:36 +1100548 size_t n = dst.meta.wi - dst.meta.ri;
549 if (n > 0) {
550 size_t i = fwrite(dst.data.ptr + dst.meta.ri, sizeof(uint8_t), n, stdout);
551 dst.meta.ri += i;
552 if (i != n) {
553 return "main: write error";
554 }
555 dst.compact();
556 }
557 return nullptr;
558}
559
Nigel Tao2914bae2020-02-26 09:40:30 +1100560const char* //
561write_dst(const void* s, size_t n) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100562 if (suppress_write_dst) {
563 return nullptr;
564 }
Nigel Tao1b073492020-02-16 22:11:36 +1100565 const uint8_t* p = static_cast<const uint8_t*>(s);
566 while (n > 0) {
567 size_t i = dst.writer_available();
568 if (i == 0) {
569 const char* z = flush_dst();
570 if (z) {
571 return z;
572 }
573 i = dst.writer_available();
574 if (i == 0) {
575 return "main: dst buffer is full";
576 }
577 }
578
579 if (i > n) {
580 i = n;
581 }
582 memcpy(dst.data.ptr + dst.meta.wi, p, i);
583 dst.meta.wi += i;
584 p += i;
585 n -= i;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100586 wrote_to_dst = true;
Nigel Tao1b073492020-02-16 22:11:36 +1100587 }
588 return nullptr;
589}
590
591// ----
592
// hex_digit returns the upper-case ASCII hexadecimal digit for the low four
// bits of nibble. The high four bits are ignored.
uint8_t  //
hex_digit(uint8_t nibble) {
  static const char digits[] = "0123456789ABCDEF";
  return (uint8_t)(digits[nibble & 0x0F]);
}
601
Nigel Tao2914bae2020-02-26 09:40:30 +1100602const char* //
Nigel Tao3b486982020-02-27 15:05:59 +1100603handle_unicode_code_point(uint32_t ucp) {
604 if (ucp < 0x0020) {
605 switch (ucp) {
606 case '\b':
607 return write_dst("\\b", 2);
608 case '\f':
609 return write_dst("\\f", 2);
610 case '\n':
611 return write_dst("\\n", 2);
612 case '\r':
613 return write_dst("\\r", 2);
614 case '\t':
615 return write_dst("\\t", 2);
616 default: {
617 // Other bytes less than 0x0020 are valid UTF-8 but not valid in a
618 // JSON string. They need to remain escaped.
619 uint8_t esc6[6];
620 esc6[0] = '\\';
621 esc6[1] = 'u';
622 esc6[2] = '0';
623 esc6[3] = '0';
624 esc6[4] = hex_digit(ucp >> 4);
625 esc6[5] = hex_digit(ucp >> 0);
626 return write_dst(&esc6[0], 6);
627 }
628 }
629
Nigel Taob9ad34f2020-03-03 12:44:01 +1100630 } else if (ucp == '\"') {
631 return write_dst("\\\"", 2);
632
633 } else if (ucp == '\\') {
634 return write_dst("\\\\", 2);
635
636 } else {
637 uint8_t u[WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL];
638 size_t n = wuffs_base__utf_8__encode(
639 wuffs_base__make_slice_u8(&u[0],
640 WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL),
641 ucp);
642 if (n > 0) {
643 return write_dst(&u[0], n);
Nigel Tao3b486982020-02-27 15:05:59 +1100644 }
Nigel Tao3b486982020-02-27 15:05:59 +1100645 }
646
Nigel Tao2cf76db2020-02-27 22:42:01 +1100647 return "main: internal error: unexpected Unicode code point";
Nigel Tao3b486982020-02-27 15:05:59 +1100648}
649
650const char* //
Nigel Tao2cf76db2020-02-27 22:42:01 +1100651handle_token(wuffs_base__token t) {
652 do {
653 uint64_t vbc = t.value_base_category();
654 uint64_t vbd = t.value_base_detail();
655 uint64_t len = t.length();
Nigel Tao1b073492020-02-16 22:11:36 +1100656
657 // Handle ']' or '}'.
Nigel Tao9f7a2502020-02-23 09:42:02 +1100658 if ((vbc == WUFFS_BASE__TOKEN__VBC__STRUCTURE) &&
Nigel Tao2cf76db2020-02-27 22:42:01 +1100659 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__POP)) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100660 if (query.is_at(depth)) {
661 return "main: no match for query";
662 }
Nigel Tao1b073492020-02-16 22:11:36 +1100663 if (depth <= 0) {
664 return "main: internal error: inconsistent depth";
665 }
666 depth--;
667
668 // Write preceding whitespace.
669 if ((ctx != context::in_list_after_bracket) &&
Nigel Tao68920952020-03-03 11:25:18 +1100670 (ctx != context::in_dict_after_brace) && !flags.compact) {
Nigel Tao1b073492020-02-16 22:11:36 +1100671 TRY(write_dst("\n", 1));
Nigel Tao0cd2f982020-03-03 23:03:02 +1100672 for (uint32_t i = 0; i < depth; i++) {
Nigel Tao68920952020-03-03 11:25:18 +1100673 TRY(write_dst(flags.tabs ? INDENT_TABS_STRING : INDENT_SPACES_STRING,
674 flags.indent));
Nigel Tao1b073492020-02-16 22:11:36 +1100675 }
676 }
677
Nigel Tao9f7a2502020-02-23 09:42:02 +1100678 TRY(write_dst(
679 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_LIST) ? "]" : "}", 1));
680 ctx = (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST)
681 ? context::in_list_after_value
682 : context::in_dict_after_key;
Nigel Tao1b073492020-02-16 22:11:36 +1100683 goto after_value;
684 }
685
Nigel Taod1c928a2020-02-28 12:43:53 +1100686 // Write preceding whitespace and punctuation, if it wasn't ']', '}' or a
687 // continuation of a multi-token chain.
Nigel Tao0cd2f982020-03-03 23:03:02 +1100688 if (!t.link_prev()) {
689 if (ctx == context::in_dict_after_key) {
690 TRY(write_dst(": ", flags.compact ? 1 : 2));
691 } else if (ctx != context::none) {
692 if ((ctx != context::in_list_after_bracket) &&
693 (ctx != context::in_dict_after_brace)) {
694 TRY(write_dst(",", 1));
Nigel Tao107f0ef2020-03-01 21:35:02 +1100695 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100696 if (!flags.compact) {
697 TRY(write_dst("\n", 1));
698 for (size_t i = 0; i < depth; i++) {
699 TRY(write_dst(
700 flags.tabs ? INDENT_TABS_STRING : INDENT_SPACES_STRING,
701 flags.indent));
702 }
703 }
704 }
705
706 bool query_matched = false;
707 if (query.is_at(depth)) {
708 switch (ctx) {
709 case context::in_list_after_bracket:
710 case context::in_list_after_value:
711 query_matched = query.tick();
712 break;
713 case context::in_dict_after_key:
714 query_matched = query.matched();
715 break;
716 }
717 }
718 if (!query_matched) {
719 // No-op.
720 } else if (!query.next_fragment()) {
721 // There is no next fragment. We have matched the complete query, and
722 // the upcoming JSON value is the result of that query.
723 //
724 // Un-suppress writing to stdout and reset the ctx and depth as if we
725 // were about to decode a top-level value. This makes any subsequent
726 // indentation be relative to this point, and we will return eod after
727 // the upcoming JSON value is complete.
728 suppress_write_dst = false;
729 ctx = context::none;
730 depth = 0;
731 } else if ((vbc != WUFFS_BASE__TOKEN__VBC__STRUCTURE) ||
732 !(vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__PUSH)) {
733 // The query has moved on to the next fragment but the upcoming JSON
734 // value is not a container.
735 return "main: no match for query";
Nigel Tao1b073492020-02-16 22:11:36 +1100736 }
737 }
738
739 // Handle the token itself: either a container ('[' or '{') or a simple
Nigel Tao85fba7f2020-02-29 16:28:06 +1100740 // value: string (a chain of raw or escaped parts), literal or number.
Nigel Tao1b073492020-02-16 22:11:36 +1100741 switch (vbc) {
Nigel Tao85fba7f2020-02-29 16:28:06 +1100742 case WUFFS_BASE__TOKEN__VBC__STRUCTURE:
743 TRY(write_dst(
744 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST) ? "[" : "{", 1));
745 depth++;
746 ctx = (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST)
747 ? context::in_list_after_bracket
748 : context::in_dict_after_brace;
749 return nullptr;
750
Nigel Tao2cf76db2020-02-27 22:42:01 +1100751 case WUFFS_BASE__TOKEN__VBC__STRING:
Nigel Taod1c928a2020-02-28 12:43:53 +1100752 if (!t.link_prev()) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100753 TRY(write_dst("\"", 1));
Nigel Tao0cd2f982020-03-03 23:03:02 +1100754 query.restart_fragment(in_dict_before_key() && query.is_at(depth));
Nigel Tao2cf76db2020-02-27 22:42:01 +1100755 }
Nigel Taocb37a562020-02-28 09:56:24 +1100756
757 if (vbd & WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_0_DST_1_SRC_DROP) {
758 // No-op.
759 } else if (vbd &
760 WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100761 uint8_t* ptr = src.data.ptr + curr_token_end_src_index - len;
762 TRY(write_dst(ptr, len));
763 query.incremental_match_slice(ptr, len);
Nigel Taocb37a562020-02-28 09:56:24 +1100764 } else {
765 return "main: internal error: unexpected string-token conversion";
766 }
767
Nigel Taod1c928a2020-02-28 12:43:53 +1100768 if (t.link_next()) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100769 return nullptr;
770 }
771 TRY(write_dst("\"", 1));
772 goto after_value;
773
774 case WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT:
Nigel Tao0cd2f982020-03-03 23:03:02 +1100775 if (!t.link_prev() || !t.link_next()) {
776 return "main: internal error: unexpected unlinked token";
777 }
778 TRY(handle_unicode_code_point(vbd));
779 query.incremental_match_code_point(vbd);
780 return nullptr;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100781
Nigel Tao85fba7f2020-02-29 16:28:06 +1100782 case WUFFS_BASE__TOKEN__VBC__LITERAL:
Nigel Tao2cf76db2020-02-27 22:42:01 +1100783 case WUFFS_BASE__TOKEN__VBC__NUMBER:
784 TRY(write_dst(src.data.ptr + curr_token_end_src_index - len, len));
785 goto after_value;
Nigel Tao1b073492020-02-16 22:11:36 +1100786 }
787
788 // Return an error if we didn't match the (vbc, vbd) pair.
Nigel Tao2cf76db2020-02-27 22:42:01 +1100789 return "main: internal error: unexpected token";
790 } while (0);
Nigel Tao1b073492020-02-16 22:11:36 +1100791
Nigel Tao2cf76db2020-02-27 22:42:01 +1100792 // Book-keeping after completing a value (whether a container value or a
793 // simple value). Empty parent containers are no longer empty. If the parent
794 // container is a "{...}" object, toggle between keys and values.
795after_value:
796 if (depth == 0) {
797 return eod;
798 }
799 switch (ctx) {
800 case context::in_list_after_bracket:
801 ctx = context::in_list_after_value;
802 break;
803 case context::in_dict_after_brace:
804 ctx = context::in_dict_after_key;
805 break;
806 case context::in_dict_after_key:
807 ctx = context::in_dict_after_value;
808 break;
809 case context::in_dict_after_value:
810 ctx = context::in_dict_after_key;
811 break;
812 }
813 return nullptr;
814}
815
816const char* //
817main1(int argc, char** argv) {
818 TRY(initialize_globals(argc, argv));
819
820 while (true) {
821 wuffs_base__status status = dec.decode_tokens(&tok, &src);
822
823 while (tok.meta.ri < tok.meta.wi) {
824 wuffs_base__token t = tok.data.ptr[tok.meta.ri++];
825 uint64_t n = t.length();
826 if ((src.meta.ri - curr_token_end_src_index) < n) {
827 return "main: internal error: inconsistent src indexes";
828 }
829 curr_token_end_src_index += n;
830
831 if (t.value() == 0) {
832 continue;
833 }
834
835 const char* z = handle_token(t);
836 if (z == nullptr) {
837 continue;
838 } else if (z == eod) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100839 goto end_of_data;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100840 }
841 return z;
Nigel Tao1b073492020-02-16 22:11:36 +1100842 }
Nigel Tao2cf76db2020-02-27 22:42:01 +1100843
844 if (status.repr == nullptr) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100845 return "main: internal error: unexpected end of token stream";
Nigel Tao2cf76db2020-02-27 22:42:01 +1100846 } else if (status.repr == wuffs_base__suspension__short_read) {
847 if (curr_token_end_src_index != src.meta.ri) {
848 return "main: internal error: inconsistent src indexes";
849 }
850 TRY(read_src());
851 curr_token_end_src_index = src.meta.ri;
852 } else if (status.repr == wuffs_base__suspension__short_write) {
853 tok.compact();
854 } else {
855 return status.message();
Nigel Tao1b073492020-02-16 22:11:36 +1100856 }
857 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100858end_of_data:
859
860 // With a non-empty query, don't try to consume trailing whitespace or
861 // confirm that we've processed all the tokens.
862 if (flags.query_c_string && *flags.query_c_string) {
863 return nullptr;
864 }
Nigel Tao6b161af2020-02-24 11:01:48 +1100865
Nigel Tao6b161af2020-02-24 11:01:48 +1100866 // Consume an optional whitespace trailer. This isn't part of the JSON spec,
867 // but it works better with line oriented Unix tools (such as "echo 123 |
868 // jsonptr" where it's "echo", not "echo -n") or hand-edited JSON files which
869 // can accidentally contain trailing whitespace.
870 //
871 // A whitespace trailer is zero or more ' ' and then zero or one '\n'.
872 while (true) {
873 if (src.meta.ri < src.meta.wi) {
874 uint8_t c = src.data.ptr[src.meta.ri];
875 if (c == ' ') {
876 src.meta.ri++;
877 continue;
878 } else if (c == '\n') {
879 src.meta.ri++;
880 break;
881 }
882 // The "exhausted the input" check below will fail.
883 break;
884 } else if (src.meta.closed) {
885 break;
886 }
887 TRY(read_src());
888 }
889
890 // Check that we've exhausted the input.
891 if ((src.meta.ri < src.meta.wi) || !src.meta.closed) {
892 return "main: valid JSON followed by further (unexpected) data";
893 }
894
895 // Check that we've used all of the decoded tokens, other than trailing
896 // filler tokens. For example, a bare `"foo"` string is valid JSON, but even
897 // without a trailing '\n', the Wuffs JSON parser emits a filler token for
898 // the final '\"'.
899 for (; tok.meta.ri < tok.meta.wi; tok.meta.ri++) {
900 if (tok.data.ptr[tok.meta.ri].value_base_category() !=
901 WUFFS_BASE__TOKEN__VBC__FILLER) {
902 return "main: internal error: decoded OK but unprocessed tokens remain";
903 }
904 }
905
906 return nullptr;
Nigel Tao1b073492020-02-16 22:11:36 +1100907}
908
// compute_exit_code converts a status message to a process exit code. A null
// status_msg means success: nothing is printed and the exit code is zero.
// Otherwise, the message (followed by a '\n') is printed to stderr and the
// exit code is non-zero.
//
// A message that is 2047 bytes or longer is replaced by a fixed "error
// message is too long" internal error.
int  //
compute_exit_code(const char* status_msg) {
  if (!status_msg) {
    return 0;
  }

  // Measure the message with a bounded loop. This avoids both an unbounded
  // scan and a dependency on strnlen, which is POSIX but not ISO C or C++.
  const size_t max_len = 2047;
  size_t n = 0;
  while ((n < max_len) && (status_msg[n] != '\x00')) {
    n++;
  }
  if (n >= max_len) {
    status_msg = "main: internal error: error message is too long";
  }
  fprintf(stderr, "%s\n", status_msg);

  // Return an exit code of 1 for regular (foreseen) errors, e.g. badly
  // formatted or unsupported input.
  //
  // Return an exit code of 2 for internal (exceptional) errors, e.g. defensive
  // run-time checks found that an internal invariant did not hold.
  //
  // Automated testing, including badly formatted inputs, can therefore
  // discriminate between expected failure (exit code 1) and unexpected failure
  // (other non-zero exit codes). Specifically, exit code 2 for internal
  // invariant violation, exit code 139 (which is 128 + SIGSEGV on x86_64
  // linux) for a segmentation fault (e.g. null pointer dereference).
  return strstr(status_msg, "internal error:") ? 2 : 1;
}
933
Nigel Tao2914bae2020-02-26 09:40:30 +1100934int //
935main(int argc, char** argv) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100936 const char* z = main1(argc, argv);
937 if (wrote_to_dst) {
938 const char* z1 = write_dst("\n", 1);
939 const char* z2 = flush_dst();
940 z = z ? z : (z1 ? z1 : z2);
941 }
942 int exit_code = compute_exit_code(z);
Nigel Tao9cc2c252020-02-23 17:05:49 +1100943 return exit_code;
Nigel Tao1b073492020-02-16 22:11:36 +1100944}