blob: fadbadbd11398f080bde791858a05fbaa289a9a7 [file] [log] [blame]
Nigel Tao1b073492020-02-16 22:11:36 +11001// Copyright 2020 The Wuffs Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// ----------------
16
17/*
Nigel Tao0cd2f982020-03-03 23:03:02 +110018jsonptr is a JSON formatter (pretty-printer) that supports the JSON Pointer
19(RFC 6901) query syntax. It reads UTF-8 JSON from stdin and writes
20canonicalized, formatted UTF-8 JSON to stdout.
21
22See the "const char* usage" string below for details.
23
24----
25
26JSON Pointer (and this program's implementation) is one of many JSON query
27languages and JSON tools, such as jq, jql and JMESPath. This one is relatively
28simple and fewer-featured compared to those others.
29
30One benefit of simplicity is that this program's JSON and JSON Pointer
31implementations do not dynamically allocate or free memory (yet it does not
32require that the entire input fits in memory at once). They are therefore
33trivially protected against certain bug classes: memory leaks, double-frees and
34use-after-frees.
35
36The core JSON implementation is also written in the Wuffs programming language
(and then transpiled to C/C++), which is memory-safe and also guards against
38integer arithmetic overflows.
39
Nigel Taofe0cbbd2020-03-05 22:01:30 +110040For defense in depth, on Linux, this program also self-imposes a
41SECCOMP_MODE_STRICT sandbox before reading (or otherwise processing) its input
42or writing its output. Under this sandbox, the only permitted system calls are
43read, write, exit and sigreturn.
44
Nigel Tao0cd2f982020-03-03 23:03:02 +110045All together, this program aims to safely handle untrusted JSON files without
46fear of security bugs such as remote code execution.
47
48----
Nigel Tao1b073492020-02-16 22:11:36 +110049
Nigel Taoc5b3a9e2020-02-24 11:54:35 +110050As of 2020-02-24, this program passes all 318 "test_parsing" cases from the
51JSON test suite (https://github.com/nst/JSONTestSuite), an appendix to the
52"Parsing JSON is a Minefield" article (http://seriot.ch/parsing_json.php) that
53was first published on 2016-10-26 and updated on 2018-03-30.
54
Nigel Tao0cd2f982020-03-03 23:03:02 +110055After modifying this program, run "build-example.sh example/jsonptr/" and then
56"script/run-json-test-suite.sh" to catch correctness regressions.
57
58----
59
Nigel Tao1b073492020-02-16 22:11:36 +110060This example program differs from most other example Wuffs programs in that it
61is written in C++, not C.
62
63$CXX jsonptr.cc && ./a.out < ../../test/data/github-tags.json; rm -f a.out
64
65for a C++ compiler $CXX, such as clang++ or g++.
66*/
67
Nigel Taofe0cbbd2020-03-05 22:01:30 +110068#include <errno.h>
Nigel Tao9cc2c252020-02-23 17:05:49 +110069#include <string.h>
Nigel Taofe0cbbd2020-03-05 22:01:30 +110070#include <unistd.h>
Nigel Tao1b073492020-02-16 22:11:36 +110071
72// Wuffs ships as a "single file C library" or "header file library" as per
73// https://github.com/nothings/stb/blob/master/docs/stb_howto.txt
74//
75// To use that single file as a "foo.c"-like implementation, instead of a
76// "foo.h"-like header, #define WUFFS_IMPLEMENTATION before #include'ing or
77// compiling it.
78#define WUFFS_IMPLEMENTATION
79
// Defining the WUFFS_CONFIG__MODULE* macros is optional, but it lets users of
81// release/c/etc.c whitelist which parts of Wuffs to build. That file contains
82// the entire Wuffs standard library, implementing a variety of codecs and file
83// formats. Without this macro definition, an optimizing compiler or linker may
84// very well discard Wuffs code for unused codecs, but listing the Wuffs
85// modules we use makes that process explicit. Preprocessing means that such
86// code simply isn't compiled.
87#define WUFFS_CONFIG__MODULES
88#define WUFFS_CONFIG__MODULE__BASE
89#define WUFFS_CONFIG__MODULE__JSON
90
91// If building this program in an environment that doesn't easily accommodate
92// relative includes, you can use the script/inline-c-relative-includes.go
93// program to generate a stand-alone C++ file.
94#include "../../release/c/wuffs-unsupported-snapshot.c"
95
Nigel Taofe0cbbd2020-03-05 22:01:30 +110096#if defined(__linux__)
97#include <linux/prctl.h>
98#include <linux/seccomp.h>
99#include <sys/prctl.h>
100#include <sys/syscall.h>
101#define WUFFS_EXAMPLE_USE_SECCOMP
102#endif
103
// TRY evaluates error_msg (a const char* expression) exactly once. If the
// result is non-null, it is returned from the enclosing function; otherwise
// execution continues after the macro.
//
// The argument is parenthesized, and the temporary has a deliberately unusual
// name, so that an argument expression mentioning a caller's own variable
// (for example "TRY(z)") is not shadowed by the macro's local.
#define TRY(error_msg)                        \
  do {                                        \
    const char* try_error_msg_ = (error_msg); \
    if (try_error_msg_) {                     \
      return try_error_msg_;                  \
    }                                         \
  } while (false)
111
112static const char* eod = "main: end of data";
113
Nigel Tao0cd2f982020-03-03 23:03:02 +1100114static const char* usage =
115 "Usage: jsonptr -flags < input.json\n"
116 "\n"
117 "Note the \"<\". It only reads from stdin, not named files.\n"
118 "\n"
119 "jsonptr is a JSON formatter (pretty-printer) that supports the JSON\n"
120 "Pointer (RFC 6901) query syntax. It reads UTF-8 JSON from stdin and\n"
121 "writes canonicalized, formatted UTF-8 JSON to stdout.\n"
122 "\n"
123 "Canonicalized means that e.g. \"abc\\u000A\\tx\\u0177z\" is re-written\n"
124 "as \"abc\\n\\txŷz\". It does not sort object keys, nor does it reject\n"
125 "duplicate keys.\n"
126 "\n"
127 "Formatted means that arrays' and objects' elements are indented, each\n"
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100128 "on its own line. Configure this with the -c / -compact, -i=N / -indent=N\n"
129 "(for N ranging from 0 to 8) and -t / -tabs flags.\n"
Nigel Tao0cd2f982020-03-03 23:03:02 +1100130 "\n"
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100131 "The -q=etc or -query=etc flag gives an optional JSON Pointer query, to\n"
132 "print a subset of the input. For example, given RFC 6901 section 5's\n"
133 "[sample input](https://tools.ietf.org/rfc/rfc6901.txt), this command:\n"
Nigel Tao0cd2f982020-03-03 23:03:02 +1100134 " jsonptr -query=/foo/1 < rfc-6901-json-pointer.json\n"
135 "will print:\n"
136 " \"baz\"\n"
137 "\n"
138 "An absent query is equivalent to the empty query, which identifies the\n"
139 "entire input (the root value). The \"/\" query is not equivalent to the\n"
140 "root value. Instead, it identifies the child (the key-value pair) of the\n"
141 "root value whose key is the empty string.\n"
142 "\n"
143 "If the query found a valid JSON value, this program will return a zero\n"
144 "exit code even if the rest of the input isn't valid JSON. If the query\n"
145 "did not find a value, or found an invalid one, this program returns a\n"
146 "non-zero exit code, but may still print partial output to stdout.\n"
147 "\n"
148 "The [JSON specification](https://json.org/) permits implementations that\n"
149 "allow duplicate keys, as this one does. This JSON Pointer implementation\n"
150 "is also greedy, following the first match for each fragment without\n"
151 "back-tracking. For example, the \"/foo/bar\" query will fail if the root\n"
152 "object has multiple \"foo\" children but the first one doesn't have a\n"
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100153 "\"bar\" child, even if later ones do.\n"
154 "\n"
155 "The -fail-if-unsandboxed flag causes the program to exit if it does not\n"
156 "self-impose a sandbox. On Linux, it self-imposes a SECCOMP_MODE_STRICT\n"
157 "sandbox, regardless of this flag.";
Nigel Tao0cd2f982020-03-03 23:03:02 +1100158
Nigel Tao2cf76db2020-02-27 22:42:01 +1100159// ----
160
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100161bool sandboxed = false;
162
// MAX_INDENT is the largest indentation width (in bytes per nesting level)
// accepted by the -i=N / -indent=N flag.
//
// The two INDENT_*_STRING literals are passed to write_dst together with a
// byte count of up to MAX_INDENT, so each of them must hold at least
// MAX_INDENT characters (excluding the NUL terminator).
#define MAX_INDENT 8
#define INDENT_SPACES_STRING "        "
#define INDENT_TABS_STRING "\t\t\t\t\t\t\t\t"

static_assert(sizeof(INDENT_SPACES_STRING) == (MAX_INDENT + 1),
              "INDENT_SPACES_STRING must contain exactly MAX_INDENT spaces");
static_assert(sizeof(INDENT_TABS_STRING) == (MAX_INDENT + 1),
              "INDENT_TABS_STRING must contain exactly MAX_INDENT tabs");
166
Nigel Tao1b073492020-02-16 22:11:36 +1100167#ifndef DST_BUFFER_SIZE
168#define DST_BUFFER_SIZE (32 * 1024)
169#endif
170#ifndef SRC_BUFFER_SIZE
171#define SRC_BUFFER_SIZE (32 * 1024)
172#endif
173#ifndef TOKEN_BUFFER_SIZE
174#define TOKEN_BUFFER_SIZE (4 * 1024)
175#endif
176
Nigel Tao2cf76db2020-02-27 22:42:01 +1100177uint8_t dst_array[DST_BUFFER_SIZE];
178uint8_t src_array[SRC_BUFFER_SIZE];
179wuffs_base__token tok_array[TOKEN_BUFFER_SIZE];
Nigel Tao1b073492020-02-16 22:11:36 +1100180
181wuffs_base__io_buffer dst;
182wuffs_base__io_buffer src;
183wuffs_base__token_buffer tok;
184
Nigel Tao2cf76db2020-02-27 22:42:01 +1100185// curr_token_end_src_index is the src.data.ptr index of the end of the current
186// token. An invariant is that (curr_token_end_src_index <= src.meta.ri).
187size_t curr_token_end_src_index;
188
Nigel Tao0cd2f982020-03-03 23:03:02 +1100189uint32_t depth;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100190
// context records where the formatter is relative to the enclosing container.
// handle_token consults it to decide which punctuation and whitespace to
// write before the next token.
enum class context {
  // Not inside any container (or at the root of a query result).
  none,
  // Inside a list: right after its '[', or after one of its elements.
  in_list_after_bracket,
  in_list_after_value,
  // Inside a dict: right after its '{', after a key, or after a value.
  in_dict_after_brace,
  in_dict_after_key,
  in_dict_after_value,
};

// ctx is the formatter's current context.
context ctx;
199
Nigel Tao0cd2f982020-03-03 23:03:02 +1100200bool //
201in_dict_before_key() {
202 return (ctx == context::in_dict_after_brace) ||
203 (ctx == context::in_dict_after_value);
204}
205
206bool suppress_write_dst;
207bool wrote_to_dst;
208
Nigel Tao1b073492020-02-16 22:11:36 +1100209wuffs_json__decoder dec;
Nigel Tao1b073492020-02-16 22:11:36 +1100210
Nigel Tao0cd2f982020-03-03 23:03:02 +1100211// ----
212
213// Query is a JSON Pointer query. After initializing with a NUL-terminated C
214// string, its multiple fragments are consumed as the program walks the JSON
215// data from stdin. For example, letting "$" denote a NUL, suppose that we
216// started with a query string of "/apple/banana/12/durian" and are currently
217// trying to match the second fragment, "banana", so that Query::depth is 2:
218//
219// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
220// / a p p l e / b a n a n a / 1 2 / d u r i a n $
221// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
222// ^ ^
223// frag_i frag_k
224//
225// The two pointers frag_i and frag_k are the start (inclusive) and end
226// (exclusive) of the fragment. They satisfy (frag_i <= frag_k) and may be
// equal if the fragment is empty (note that "" is a valid JSON object key).
228//
229// The frag_j pointer moves between these two, or is nullptr. An invariant is
230// that (((frag_i <= frag_j) && (frag_j <= frag_k)) || (frag_j == nullptr)).
231//
232// Wuffs' JSON tokenizer can portray a single JSON string as multiple Wuffs
233// tokens, as backslash-escaped values within that JSON string may each get
234// their own token.
235//
236// At the start of each object key (a JSON string), frag_j is set to frag_i.
237//
238// While frag_j remains non-nullptr, each token's unescaped contents are then
239// compared to that part of the fragment from frag_j to frag_k. If it is a
240// prefix (including the case of an exact match), then frag_j is advanced by
241// the unescaped length. Otherwise, frag_j is set to nullptr.
242//
243// Comparison accounts for JSON Pointer's escaping notation: "~0" and "~1" in
244// the query (not the JSON value) are unescaped to "~" and "/" respectively.
245//
246// The frag_j pointer therefore advances from frag_i to frag_k, or drops out,
247// as we incrementally match the object key with the query fragment. For
248// example, if we have already matched the "ban" of "banana", then we would
249// accept any of an "ana" token, an "a" token or a "\u0061" token, amongst
250// others. They would advance frag_j by 3, 1 or 1 bytes respectively.
251//
252// frag_j
253// v
254// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
255// / a p p l e / b a n a n a / 1 2 / d u r i a n $
256// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
257// ^ ^
258// frag_i frag_k
259//
260// At the end of each object key (or equivalently, at the start of each object
261// value), if frag_j is non-nullptr and equal to (but not less than) frag_k
262// then we have a fragment match: the query fragment equals the object key. If
263// there is a next fragment (in this example, "12") we move the frag_etc
264// pointers to its start and end and increment Query::depth. Otherwise, we have
265// matched the complete query, and the upcoming JSON value is the result of
266// that query.
267//
268// The discussion above centers on object keys. If the query fragment is
269// numeric then it can also match as an array index: the string fragment "12"
270// will match an array's 13th element (starting counting from zero). See RFC
271// 6901 for its precise definition of an "array index" number.
272//
273// Array index fragment match is represented by the Query::array_index field,
274// whose type (wuffs_base__result_u64) is a result type. An error result means
275// that the fragment is not an array index. A value result holds the number of
276// list elements remaining. When matching a query fragment in an array (instead
277// of in an object), each element ticks this number down towards zero. At zero,
278// the upcoming JSON value is the one that matches the query fragment.
279class Query {
280 private:
281 uint8_t* frag_i;
282 uint8_t* frag_j;
283 uint8_t* frag_k;
284
285 uint32_t depth;
286
287 wuffs_base__result_u64 array_index;
288
289 public:
290 void reset(char* query_c_string) {
291 this->frag_i = (uint8_t*)query_c_string;
292 this->frag_j = (uint8_t*)query_c_string;
293 this->frag_k = (uint8_t*)query_c_string;
294 this->depth = 0;
295 this->array_index.status.repr = "#main: not an array index query fragment";
296 this->array_index.value = 0;
297 }
298
299 void restart_fragment(bool enable) {
300 this->frag_j = enable ? this->frag_i : nullptr;
301 }
302
303 bool is_at(uint32_t depth) { return this->depth == depth; }
304
305 // tick returns whether the fragment is a valid array index whose value is
306 // zero. If valid but non-zero, it decrements it and returns false.
307 bool tick() {
308 if (this->array_index.status.is_ok()) {
309 if (this->array_index.value == 0) {
310 return true;
311 }
312 this->array_index.value--;
313 }
314 return false;
315 }
316
317 // next_fragment moves to the next fragment, returning whether it existed.
318 bool next_fragment() {
319 uint8_t* k = this->frag_k;
320 uint32_t d = this->depth;
321
322 this->reset(nullptr);
323
324 if (!k || (*k != '/')) {
325 return false;
326 }
327 k++;
328
329 bool all_digits = true;
330 uint8_t* i = k;
331 while ((*k != '\x00') && (*k != '/')) {
332 all_digits = all_digits && ('0' <= *k) && (*k <= '9');
333 k++;
334 }
335 this->frag_i = i;
336 this->frag_j = i;
337 this->frag_k = k;
338 this->depth = d + 1;
339 if (all_digits) {
340 // wuffs_base__parse_number_u64 rejects leading zeroes, e.g. "00", "07".
341 this->array_index =
342 wuffs_base__parse_number_u64(wuffs_base__make_slice_u8(i, k - i));
343 }
344 return true;
345 }
346
347 bool matched() { return this->frag_j && (this->frag_j == this->frag_k); }
348
349 void incremental_match_slice(uint8_t* ptr, size_t len) {
350 if (!this->frag_j) {
351 return;
352 }
353 uint8_t* j = this->frag_j;
354 while (true) {
355 if (len == 0) {
356 this->frag_j = j;
357 return;
358 }
359
360 if (*j == '\x00') {
361 break;
362
363 } else if (*j == '~') {
364 j++;
365 if (*j == '0') {
366 if (*ptr != '~') {
367 break;
368 }
369 } else if (*j == '1') {
370 if (*ptr != '/') {
371 break;
372 }
373 } else {
374 break;
375 }
376
377 } else if (*j != *ptr) {
378 break;
379 }
380
381 j++;
382 ptr++;
383 len--;
384 }
385 this->frag_j = nullptr;
386 }
387
388 void incremental_match_code_point(uint32_t code_point) {
389 if (!this->frag_j) {
390 return;
391 }
392 uint8_t u[WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL];
393 size_t n = wuffs_base__utf_8__encode(
394 wuffs_base__make_slice_u8(&u[0],
395 WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL),
396 code_point);
397 if (n > 0) {
398 this->incremental_match_slice(&u[0], n);
399 }
400 }
401
402 // validate returns whether the (ptr, len) arguments form a valid JSON
403 // Pointer. In particular, it must be valid UTF-8, and either be empty or
404 // start with a '/'. Any '~' within must immediately be followed by either
405 // '0' or '1'.
406 static bool validate(char* query_c_string, size_t length) {
407 if (length <= 0) {
408 return true;
409 }
410 if (query_c_string[0] != '/') {
411 return false;
412 }
413 wuffs_base__slice_u8 s =
414 wuffs_base__make_slice_u8((uint8_t*)query_c_string, length);
415 bool previous_was_tilde = false;
416 while (s.len > 0) {
417 wuffs_base__utf_8__next__output o = wuffs_base__utf_8__next(s);
418 if (!o.is_valid()) {
419 return false;
420 }
421 if (previous_was_tilde && (o.code_point != '0') &&
422 (o.code_point != '1')) {
423 return false;
424 }
425 previous_was_tilde = o.code_point == '~';
426 s.ptr += o.byte_length;
427 s.len -= o.byte_length;
428 }
429 return !previous_was_tilde;
430 }
431} query;
432
433// ----
434
// flags holds the parsed command line. Every field starts out zero (false,
// 0 or nullptr) and is filled in by parse_flags.
struct {
  // The command line arguments left over after the flags.
  int remaining_argc;
  char** remaining_argv;

  // -c / -compact.
  bool compact;
  // -fail-if-unsandboxed.
  bool fail_if_unsandboxed;
  // -i=N / -indent=N: the number of indentation bytes per nesting level.
  size_t indent;
  // -q=etc / -query=etc: the JSON Pointer query, or nullptr if absent.
  char* query_c_string;
  // -t / -tabs.
  bool tabs;
} flags = {};
445
446const char* //
447parse_flags(int argc, char** argv) {
448 bool explicit_indent = false;
449
450 int c = (argc > 0) ? 1 : 0; // Skip argv[0], the program name.
451 for (; c < argc; c++) {
452 char* arg = argv[c];
453 if (*arg++ != '-') {
454 break;
455 }
456
457 // A double-dash "--foo" is equivalent to a single-dash "-foo". As special
458 // cases, a bare "-" is not a flag (some programs may interpret it as
459 // stdin) and a bare "--" means to stop parsing flags.
460 if (*arg == '\x00') {
461 break;
462 } else if (*arg == '-') {
463 arg++;
464 if (*arg == '\x00') {
465 c++;
466 break;
467 }
468 }
469
470 if (!strcmp(arg, "c") || !strcmp(arg, "compact")) {
471 flags.compact = true;
472 continue;
473 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100474 if (!strcmp(arg, "fail-if-unsandboxed")) {
475 flags.fail_if_unsandboxed = true;
476 continue;
477 }
Nigel Tao68920952020-03-03 11:25:18 +1100478 if (!strncmp(arg, "i=", 2) || !strncmp(arg, "indent=", 7)) {
479 while (*arg++ != '=') {
480 }
481 if (('0' <= arg[0]) && (arg[0] <= '8') && (arg[1] == '\x00')) {
482 flags.indent = arg[0] - '0';
483 explicit_indent = true;
484 continue;
485 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100486 return usage;
487 }
488 if (!strncmp(arg, "q=", 2) || !strncmp(arg, "query=", 6)) {
489 while (*arg++ != '=') {
490 }
491 if (Query::validate(arg, strlen(arg))) {
492 flags.query_c_string = arg;
493 continue;
494 }
495 return usage;
Nigel Tao68920952020-03-03 11:25:18 +1100496 }
497 if (!strcmp(arg, "t") || !strcmp(arg, "tabs")) {
498 flags.tabs = true;
499 continue;
500 }
501
Nigel Tao0cd2f982020-03-03 23:03:02 +1100502 return usage;
Nigel Tao68920952020-03-03 11:25:18 +1100503 }
504
505 flags.remaining_argc = argc - c;
506 flags.remaining_argv = argv + c;
507 if (!explicit_indent) {
508 flags.indent = flags.tabs ? 1 : 4;
509 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100510 return nullptr;
Nigel Tao68920952020-03-03 11:25:18 +1100511}
512
Nigel Tao2cf76db2020-02-27 22:42:01 +1100513const char* //
514initialize_globals(int argc, char** argv) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100515 dst = wuffs_base__make_io_buffer(
516 wuffs_base__make_slice_u8(dst_array, DST_BUFFER_SIZE),
517 wuffs_base__empty_io_buffer_meta());
Nigel Tao1b073492020-02-16 22:11:36 +1100518
Nigel Tao2cf76db2020-02-27 22:42:01 +1100519 src = wuffs_base__make_io_buffer(
520 wuffs_base__make_slice_u8(src_array, SRC_BUFFER_SIZE),
521 wuffs_base__empty_io_buffer_meta());
522
523 tok = wuffs_base__make_token_buffer(
524 wuffs_base__make_slice_token(tok_array, TOKEN_BUFFER_SIZE),
525 wuffs_base__empty_token_buffer_meta());
526
527 curr_token_end_src_index = 0;
528
Nigel Tao2cf76db2020-02-27 22:42:01 +1100529 depth = 0;
530
531 ctx = context::none;
532
Nigel Tao68920952020-03-03 11:25:18 +1100533 TRY(parse_flags(argc, argv));
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100534 if (flags.fail_if_unsandboxed && !sandboxed) {
535 return "main: unsandboxed";
536 }
Nigel Tao68920952020-03-03 11:25:18 +1100537 if (flags.remaining_argc > 0) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100538 return usage;
Nigel Tao107f0ef2020-03-01 21:35:02 +1100539 }
540
Nigel Tao0cd2f982020-03-03 23:03:02 +1100541 query.reset(flags.query_c_string);
542
543 // If the query is non-empty, suprress writing to stdout until we've
544 // completed the query.
545 suppress_write_dst = query.next_fragment();
546 wrote_to_dst = false;
547
Nigel Tao2cf76db2020-02-27 22:42:01 +1100548 return dec.initialize(sizeof__wuffs_json__decoder(), WUFFS_VERSION, 0)
549 .message();
550}
Nigel Tao1b073492020-02-16 22:11:36 +1100551
552// ----
553
// ignore_return_value deliberately discards its argument. Wrapping a call in
// it suppresses the unused-result errors that -Wall -Werror would otherwise
// raise.
static void  //
ignore_return_value(int ignored) {
  (void)ignored;
}
557
Nigel Tao2914bae2020-02-26 09:40:30 +1100558const char* //
559read_src() {
Nigel Taoa8406922020-02-19 12:22:00 +1100560 if (src.meta.closed) {
Nigel Tao9cc2c252020-02-23 17:05:49 +1100561 return "main: internal error: read requested on a closed source";
Nigel Taoa8406922020-02-19 12:22:00 +1100562 }
Nigel Tao1b073492020-02-16 22:11:36 +1100563 src.compact();
564 if (src.meta.wi >= src.data.len) {
565 return "main: src buffer is full";
566 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100567 while (true) {
568 const int stdin_fd = 0;
569 ssize_t n =
570 read(stdin_fd, src.data.ptr + src.meta.wi, src.data.len - src.meta.wi);
571 if (n >= 0) {
572 src.meta.wi += n;
573 src.meta.closed = n == 0;
574 break;
575 } else if (errno != EINTR) {
576 return strerror(errno);
577 }
Nigel Tao1b073492020-02-16 22:11:36 +1100578 }
579 return nullptr;
580}
581
Nigel Tao2914bae2020-02-26 09:40:30 +1100582const char* //
583flush_dst() {
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100584 while (true) {
585 size_t n = dst.meta.wi - dst.meta.ri;
586 if (n == 0) {
587 break;
Nigel Tao1b073492020-02-16 22:11:36 +1100588 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100589 const int stdout_fd = 1;
590 ssize_t i = write(stdout_fd, dst.data.ptr + dst.meta.ri, n);
591 if (i >= 0) {
592 dst.meta.ri += i;
593 } else if (errno != EINTR) {
594 return strerror(errno);
595 }
Nigel Tao1b073492020-02-16 22:11:36 +1100596 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100597 dst.compact();
Nigel Tao1b073492020-02-16 22:11:36 +1100598 return nullptr;
599}
600
Nigel Tao2914bae2020-02-26 09:40:30 +1100601const char* //
602write_dst(const void* s, size_t n) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100603 if (suppress_write_dst) {
604 return nullptr;
605 }
Nigel Tao1b073492020-02-16 22:11:36 +1100606 const uint8_t* p = static_cast<const uint8_t*>(s);
607 while (n > 0) {
608 size_t i = dst.writer_available();
609 if (i == 0) {
610 const char* z = flush_dst();
611 if (z) {
612 return z;
613 }
614 i = dst.writer_available();
615 if (i == 0) {
616 return "main: dst buffer is full";
617 }
618 }
619
620 if (i > n) {
621 i = n;
622 }
623 memcpy(dst.data.ptr + dst.meta.wi, p, i);
624 dst.meta.wi += i;
625 p += i;
626 n -= i;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100627 wrote_to_dst = true;
Nigel Tao1b073492020-02-16 22:11:36 +1100628 }
629 return nullptr;
630}
631
632// ----
633
// hex_digit returns the upper-case ASCII hexadecimal digit for the low four
// bits of nibble. The high four bits are ignored.
uint8_t  //
hex_digit(uint8_t nibble) {
  static const char digits[] = "0123456789ABCDEF";
  return (uint8_t)(digits[nibble & 0x0F]);
}
642
Nigel Tao2914bae2020-02-26 09:40:30 +1100643const char* //
Nigel Tao3b486982020-02-27 15:05:59 +1100644handle_unicode_code_point(uint32_t ucp) {
645 if (ucp < 0x0020) {
646 switch (ucp) {
647 case '\b':
648 return write_dst("\\b", 2);
649 case '\f':
650 return write_dst("\\f", 2);
651 case '\n':
652 return write_dst("\\n", 2);
653 case '\r':
654 return write_dst("\\r", 2);
655 case '\t':
656 return write_dst("\\t", 2);
657 default: {
658 // Other bytes less than 0x0020 are valid UTF-8 but not valid in a
659 // JSON string. They need to remain escaped.
660 uint8_t esc6[6];
661 esc6[0] = '\\';
662 esc6[1] = 'u';
663 esc6[2] = '0';
664 esc6[3] = '0';
665 esc6[4] = hex_digit(ucp >> 4);
666 esc6[5] = hex_digit(ucp >> 0);
667 return write_dst(&esc6[0], 6);
668 }
669 }
670
Nigel Taob9ad34f2020-03-03 12:44:01 +1100671 } else if (ucp == '\"') {
672 return write_dst("\\\"", 2);
673
674 } else if (ucp == '\\') {
675 return write_dst("\\\\", 2);
676
677 } else {
678 uint8_t u[WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL];
679 size_t n = wuffs_base__utf_8__encode(
680 wuffs_base__make_slice_u8(&u[0],
681 WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL),
682 ucp);
683 if (n > 0) {
684 return write_dst(&u[0], n);
Nigel Tao3b486982020-02-27 15:05:59 +1100685 }
Nigel Tao3b486982020-02-27 15:05:59 +1100686 }
687
Nigel Tao2cf76db2020-02-27 22:42:01 +1100688 return "main: internal error: unexpected Unicode code point";
Nigel Tao3b486982020-02-27 15:05:59 +1100689}
690
691const char* //
Nigel Tao2cf76db2020-02-27 22:42:01 +1100692handle_token(wuffs_base__token t) {
693 do {
694 uint64_t vbc = t.value_base_category();
695 uint64_t vbd = t.value_base_detail();
696 uint64_t len = t.length();
Nigel Tao1b073492020-02-16 22:11:36 +1100697
698 // Handle ']' or '}'.
Nigel Tao9f7a2502020-02-23 09:42:02 +1100699 if ((vbc == WUFFS_BASE__TOKEN__VBC__STRUCTURE) &&
Nigel Tao2cf76db2020-02-27 22:42:01 +1100700 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__POP)) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100701 if (query.is_at(depth)) {
702 return "main: no match for query";
703 }
Nigel Tao1b073492020-02-16 22:11:36 +1100704 if (depth <= 0) {
705 return "main: internal error: inconsistent depth";
706 }
707 depth--;
708
709 // Write preceding whitespace.
710 if ((ctx != context::in_list_after_bracket) &&
Nigel Tao68920952020-03-03 11:25:18 +1100711 (ctx != context::in_dict_after_brace) && !flags.compact) {
Nigel Tao1b073492020-02-16 22:11:36 +1100712 TRY(write_dst("\n", 1));
Nigel Tao0cd2f982020-03-03 23:03:02 +1100713 for (uint32_t i = 0; i < depth; i++) {
Nigel Tao68920952020-03-03 11:25:18 +1100714 TRY(write_dst(flags.tabs ? INDENT_TABS_STRING : INDENT_SPACES_STRING,
715 flags.indent));
Nigel Tao1b073492020-02-16 22:11:36 +1100716 }
717 }
718
Nigel Tao9f7a2502020-02-23 09:42:02 +1100719 TRY(write_dst(
720 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_LIST) ? "]" : "}", 1));
721 ctx = (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST)
722 ? context::in_list_after_value
723 : context::in_dict_after_key;
Nigel Tao1b073492020-02-16 22:11:36 +1100724 goto after_value;
725 }
726
Nigel Taod1c928a2020-02-28 12:43:53 +1100727 // Write preceding whitespace and punctuation, if it wasn't ']', '}' or a
728 // continuation of a multi-token chain.
Nigel Tao0cd2f982020-03-03 23:03:02 +1100729 if (!t.link_prev()) {
730 if (ctx == context::in_dict_after_key) {
731 TRY(write_dst(": ", flags.compact ? 1 : 2));
732 } else if (ctx != context::none) {
733 if ((ctx != context::in_list_after_bracket) &&
734 (ctx != context::in_dict_after_brace)) {
735 TRY(write_dst(",", 1));
Nigel Tao107f0ef2020-03-01 21:35:02 +1100736 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100737 if (!flags.compact) {
738 TRY(write_dst("\n", 1));
739 for (size_t i = 0; i < depth; i++) {
740 TRY(write_dst(
741 flags.tabs ? INDENT_TABS_STRING : INDENT_SPACES_STRING,
742 flags.indent));
743 }
744 }
745 }
746
747 bool query_matched = false;
748 if (query.is_at(depth)) {
749 switch (ctx) {
750 case context::in_list_after_bracket:
751 case context::in_list_after_value:
752 query_matched = query.tick();
753 break;
754 case context::in_dict_after_key:
755 query_matched = query.matched();
756 break;
757 }
758 }
759 if (!query_matched) {
760 // No-op.
761 } else if (!query.next_fragment()) {
762 // There is no next fragment. We have matched the complete query, and
763 // the upcoming JSON value is the result of that query.
764 //
765 // Un-suppress writing to stdout and reset the ctx and depth as if we
766 // were about to decode a top-level value. This makes any subsequent
767 // indentation be relative to this point, and we will return eod after
768 // the upcoming JSON value is complete.
769 suppress_write_dst = false;
770 ctx = context::none;
771 depth = 0;
772 } else if ((vbc != WUFFS_BASE__TOKEN__VBC__STRUCTURE) ||
773 !(vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__PUSH)) {
774 // The query has moved on to the next fragment but the upcoming JSON
775 // value is not a container.
776 return "main: no match for query";
Nigel Tao1b073492020-02-16 22:11:36 +1100777 }
778 }
779
780 // Handle the token itself: either a container ('[' or '{') or a simple
Nigel Tao85fba7f2020-02-29 16:28:06 +1100781 // value: string (a chain of raw or escaped parts), literal or number.
Nigel Tao1b073492020-02-16 22:11:36 +1100782 switch (vbc) {
Nigel Tao85fba7f2020-02-29 16:28:06 +1100783 case WUFFS_BASE__TOKEN__VBC__STRUCTURE:
784 TRY(write_dst(
785 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST) ? "[" : "{", 1));
786 depth++;
787 ctx = (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST)
788 ? context::in_list_after_bracket
789 : context::in_dict_after_brace;
790 return nullptr;
791
Nigel Tao2cf76db2020-02-27 22:42:01 +1100792 case WUFFS_BASE__TOKEN__VBC__STRING:
Nigel Taod1c928a2020-02-28 12:43:53 +1100793 if (!t.link_prev()) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100794 TRY(write_dst("\"", 1));
Nigel Tao0cd2f982020-03-03 23:03:02 +1100795 query.restart_fragment(in_dict_before_key() && query.is_at(depth));
Nigel Tao2cf76db2020-02-27 22:42:01 +1100796 }
Nigel Taocb37a562020-02-28 09:56:24 +1100797
798 if (vbd & WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_0_DST_1_SRC_DROP) {
799 // No-op.
800 } else if (vbd &
801 WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100802 uint8_t* ptr = src.data.ptr + curr_token_end_src_index - len;
803 TRY(write_dst(ptr, len));
804 query.incremental_match_slice(ptr, len);
Nigel Taocb37a562020-02-28 09:56:24 +1100805 } else {
806 return "main: internal error: unexpected string-token conversion";
807 }
808
Nigel Taod1c928a2020-02-28 12:43:53 +1100809 if (t.link_next()) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100810 return nullptr;
811 }
812 TRY(write_dst("\"", 1));
813 goto after_value;
814
815 case WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT:
Nigel Tao0cd2f982020-03-03 23:03:02 +1100816 if (!t.link_prev() || !t.link_next()) {
817 return "main: internal error: unexpected unlinked token";
818 }
819 TRY(handle_unicode_code_point(vbd));
820 query.incremental_match_code_point(vbd);
821 return nullptr;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100822
Nigel Tao85fba7f2020-02-29 16:28:06 +1100823 case WUFFS_BASE__TOKEN__VBC__LITERAL:
Nigel Tao2cf76db2020-02-27 22:42:01 +1100824 case WUFFS_BASE__TOKEN__VBC__NUMBER:
825 TRY(write_dst(src.data.ptr + curr_token_end_src_index - len, len));
826 goto after_value;
Nigel Tao1b073492020-02-16 22:11:36 +1100827 }
828
829 // Return an error if we didn't match the (vbc, vbd) pair.
Nigel Tao2cf76db2020-02-27 22:42:01 +1100830 return "main: internal error: unexpected token";
831 } while (0);
Nigel Tao1b073492020-02-16 22:11:36 +1100832
Nigel Tao2cf76db2020-02-27 22:42:01 +1100833 // Book-keeping after completing a value (whether a container value or a
834 // simple value). Empty parent containers are no longer empty. If the parent
835 // container is a "{...}" object, toggle between keys and values.
836after_value:
837 if (depth == 0) {
838 return eod;
839 }
840 switch (ctx) {
841 case context::in_list_after_bracket:
842 ctx = context::in_list_after_value;
843 break;
844 case context::in_dict_after_brace:
845 ctx = context::in_dict_after_key;
846 break;
847 case context::in_dict_after_key:
848 ctx = context::in_dict_after_value;
849 break;
850 case context::in_dict_after_value:
851 ctx = context::in_dict_after_key;
852 break;
853 }
854 return nullptr;
855}
856
857const char* //
858main1(int argc, char** argv) {
859 TRY(initialize_globals(argc, argv));
860
861 while (true) {
862 wuffs_base__status status = dec.decode_tokens(&tok, &src);
863
864 while (tok.meta.ri < tok.meta.wi) {
865 wuffs_base__token t = tok.data.ptr[tok.meta.ri++];
866 uint64_t n = t.length();
867 if ((src.meta.ri - curr_token_end_src_index) < n) {
868 return "main: internal error: inconsistent src indexes";
869 }
870 curr_token_end_src_index += n;
871
872 if (t.value() == 0) {
873 continue;
874 }
875
876 const char* z = handle_token(t);
877 if (z == nullptr) {
878 continue;
879 } else if (z == eod) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100880 goto end_of_data;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100881 }
882 return z;
Nigel Tao1b073492020-02-16 22:11:36 +1100883 }
Nigel Tao2cf76db2020-02-27 22:42:01 +1100884
885 if (status.repr == nullptr) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100886 return "main: internal error: unexpected end of token stream";
Nigel Tao2cf76db2020-02-27 22:42:01 +1100887 } else if (status.repr == wuffs_base__suspension__short_read) {
888 if (curr_token_end_src_index != src.meta.ri) {
889 return "main: internal error: inconsistent src indexes";
890 }
891 TRY(read_src());
892 curr_token_end_src_index = src.meta.ri;
893 } else if (status.repr == wuffs_base__suspension__short_write) {
894 tok.compact();
895 } else {
896 return status.message();
Nigel Tao1b073492020-02-16 22:11:36 +1100897 }
898 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100899end_of_data:
900
901 // With a non-empty query, don't try to consume trailing whitespace or
902 // confirm that we've processed all the tokens.
903 if (flags.query_c_string && *flags.query_c_string) {
904 return nullptr;
905 }
Nigel Tao6b161af2020-02-24 11:01:48 +1100906
Nigel Tao6b161af2020-02-24 11:01:48 +1100907 // Consume an optional whitespace trailer. This isn't part of the JSON spec,
908 // but it works better with line oriented Unix tools (such as "echo 123 |
909 // jsonptr" where it's "echo", not "echo -n") or hand-edited JSON files which
910 // can accidentally contain trailing whitespace.
911 //
912 // A whitespace trailer is zero or more ' ' and then zero or one '\n'.
913 while (true) {
914 if (src.meta.ri < src.meta.wi) {
915 uint8_t c = src.data.ptr[src.meta.ri];
916 if (c == ' ') {
917 src.meta.ri++;
918 continue;
919 } else if (c == '\n') {
920 src.meta.ri++;
921 break;
922 }
923 // The "exhausted the input" check below will fail.
924 break;
925 } else if (src.meta.closed) {
926 break;
927 }
928 TRY(read_src());
929 }
930
931 // Check that we've exhausted the input.
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100932 if ((src.meta.ri == src.meta.wi) && !src.meta.closed) {
933 TRY(read_src());
934 }
Nigel Tao6b161af2020-02-24 11:01:48 +1100935 if ((src.meta.ri < src.meta.wi) || !src.meta.closed) {
936 return "main: valid JSON followed by further (unexpected) data";
937 }
938
939 // Check that we've used all of the decoded tokens, other than trailing
940 // filler tokens. For example, a bare `"foo"` string is valid JSON, but even
941 // without a trailing '\n', the Wuffs JSON parser emits a filler token for
942 // the final '\"'.
943 for (; tok.meta.ri < tok.meta.wi; tok.meta.ri++) {
944 if (tok.data.ptr[tok.meta.ri].value_base_category() !=
945 WUFFS_BASE__TOKEN__VBC__FILLER) {
946 return "main: internal error: decoded OK but unprocessed tokens remain";
947 }
948 }
949
950 return nullptr;
Nigel Tao1b073492020-02-16 22:11:36 +1100951}
952
Nigel Tao2914bae2020-02-26 09:40:30 +1100953int //
954compute_exit_code(const char* status_msg) {
Nigel Tao9cc2c252020-02-23 17:05:49 +1100955 if (!status_msg) {
956 return 0;
957 }
958 size_t n = strnlen(status_msg, 2047);
959 if (n >= 2047) {
960 status_msg = "main: internal error: error message is too long";
961 n = strnlen(status_msg, 2047);
962 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100963 const int stderr_fd = 2;
964 ignore_return_value(write(stderr_fd, status_msg, n));
965 ignore_return_value(write(stderr_fd, "\n", 1));
Nigel Tao9cc2c252020-02-23 17:05:49 +1100966 // Return an exit code of 1 for regular (forseen) errors, e.g. badly
967 // formatted or unsupported input.
968 //
969 // Return an exit code of 2 for internal (exceptional) errors, e.g. defensive
970 // run-time checks found that an internal invariant did not hold.
971 //
972 // Automated testing, including badly formatted inputs, can therefore
973 // discriminate between expected failure (exit code 1) and unexpected failure
974 // (other non-zero exit codes). Specifically, exit code 2 for internal
975 // invariant violation, exit code 139 (which is 128 + SIGSEGV on x86_64
976 // linux) for a segmentation fault (e.g. null pointer dereference).
977 return strstr(status_msg, "internal error:") ? 2 : 1;
978}
979
Nigel Tao2914bae2020-02-26 09:40:30 +1100980int //
981main(int argc, char** argv) {
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100982#if defined(WUFFS_EXAMPLE_USE_SECCOMP)
983 prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
984 sandboxed = true;
985#endif
986
Nigel Tao0cd2f982020-03-03 23:03:02 +1100987 const char* z = main1(argc, argv);
988 if (wrote_to_dst) {
989 const char* z1 = write_dst("\n", 1);
990 const char* z2 = flush_dst();
991 z = z ? z : (z1 ? z1 : z2);
992 }
993 int exit_code = compute_exit_code(z);
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100994
995#if defined(WUFFS_EXAMPLE_USE_SECCOMP)
996 // Call SYS_exit explicitly, instead of calling SYS_exit_group implicitly by
997 // either calling _exit or returning from main. SECCOMP_MODE_STRICT allows
998 // only SYS_exit.
999 syscall(SYS_exit, exit_code);
1000#endif
Nigel Tao9cc2c252020-02-23 17:05:49 +11001001 return exit_code;
Nigel Tao1b073492020-02-16 22:11:36 +11001002}