blob: ba66fc1797f05976b9ec08b2904215f7e2ec717c [file] [log] [blame]
Nigel Tao1b073492020-02-16 22:11:36 +11001// Copyright 2020 The Wuffs Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// ----------------
16
17/*
Nigel Tao0cd2f982020-03-03 23:03:02 +110018jsonptr is a JSON formatter (pretty-printer) that supports the JSON Pointer
19(RFC 6901) query syntax. It reads UTF-8 JSON from stdin and writes
20canonicalized, formatted UTF-8 JSON to stdout.
21
22See the "const char* usage" string below for details.
23
24----
25
26JSON Pointer (and this program's implementation) is one of many JSON query
27languages and JSON tools, such as jq, jql and JMESPath. This one is relatively
28simple and fewer-featured compared to those others.
29
30One benefit of simplicity is that this program's JSON and JSON Pointer
31implementations do not dynamically allocate or free memory (yet it does not
32require that the entire input fits in memory at once). They are therefore
33trivially protected against certain bug classes: memory leaks, double-frees and
34use-after-frees.
35
36The core JSON implementation is also written in the Wuffs programming language
37(and then transpiled to C/C++), which is memory-safe but also guards against
38integer arithmetic overflows.
39
Nigel Taofe0cbbd2020-03-05 22:01:30 +110040For defense in depth, on Linux, this program also self-imposes a
41SECCOMP_MODE_STRICT sandbox before reading (or otherwise processing) its input
42or writing its output. Under this sandbox, the only permitted system calls are
43read, write, exit and sigreturn.
44
Nigel Tao0cd2f982020-03-03 23:03:02 +110045All together, this program aims to safely handle untrusted JSON files without
46fear of security bugs such as remote code execution.
47
48----
Nigel Tao1b073492020-02-16 22:11:36 +110049
Nigel Taoc5b3a9e2020-02-24 11:54:35 +110050As of 2020-02-24, this program passes all 318 "test_parsing" cases from the
51JSON test suite (https://github.com/nst/JSONTestSuite), an appendix to the
52"Parsing JSON is a Minefield" article (http://seriot.ch/parsing_json.php) that
53was first published on 2016-10-26 and updated on 2018-03-30.
54
Nigel Tao0cd2f982020-03-03 23:03:02 +110055After modifying this program, run "build-example.sh example/jsonptr/" and then
56"script/run-json-test-suite.sh" to catch correctness regressions.
57
58----
59
Nigel Tao1b073492020-02-16 22:11:36 +110060This example program differs from most other example Wuffs programs in that it
61is written in C++, not C.
62
63$CXX jsonptr.cc && ./a.out < ../../test/data/github-tags.json; rm -f a.out
64
65for a C++ compiler $CXX, such as clang++ or g++.
66*/
67
Nigel Taofe0cbbd2020-03-05 22:01:30 +110068#include <errno.h>
Nigel Tao01abc842020-03-06 21:42:33 +110069#include <fcntl.h>
70#include <stdio.h>
Nigel Tao9cc2c252020-02-23 17:05:49 +110071#include <string.h>
Nigel Taofe0cbbd2020-03-05 22:01:30 +110072#include <unistd.h>
Nigel Tao1b073492020-02-16 22:11:36 +110073
74// Wuffs ships as a "single file C library" or "header file library" as per
75// https://github.com/nothings/stb/blob/master/docs/stb_howto.txt
76//
77// To use that single file as a "foo.c"-like implementation, instead of a
78// "foo.h"-like header, #define WUFFS_IMPLEMENTATION before #include'ing or
79// compiling it.
80#define WUFFS_IMPLEMENTATION
81
82// Defining the WUFFS_CONFIG__MODULE* macros are optional, but it lets users of
83// release/c/etc.c whitelist which parts of Wuffs to build. That file contains
84// the entire Wuffs standard library, implementing a variety of codecs and file
85// formats. Without this macro definition, an optimizing compiler or linker may
86// very well discard Wuffs code for unused codecs, but listing the Wuffs
87// modules we use makes that process explicit. Preprocessing means that such
88// code simply isn't compiled.
89#define WUFFS_CONFIG__MODULES
90#define WUFFS_CONFIG__MODULE__BASE
91#define WUFFS_CONFIG__MODULE__JSON
92
93// If building this program in an environment that doesn't easily accommodate
94// relative includes, you can use the script/inline-c-relative-includes.go
95// program to generate a stand-alone C++ file.
96#include "../../release/c/wuffs-unsupported-snapshot.c"
97
Nigel Taofe0cbbd2020-03-05 22:01:30 +110098#if defined(__linux__)
99#include <linux/prctl.h>
100#include <linux/seccomp.h>
101#include <sys/prctl.h>
102#include <sys/syscall.h>
103#define WUFFS_EXAMPLE_USE_SECCOMP
104#endif
105
// TRY evaluates error_msg (an expression yielding a const char*) exactly once
// and, if the result is non-null, returns it from the enclosing function. It
// is therefore only usable inside functions that return const char*, where a
// null return value means success.
//
// The expansion declares a local named z, so error_msg must not refer to a
// caller-side variable that is also called z.
#define TRY(error_msg)         \
  do {                         \
    const char* z = error_msg; \
    if (z) {                   \
      return z;                \
    }                          \
  } while (false)
113
// eod is a sentinel "error message": handle_token returns it once a value has
// been completed while depth is zero, meaning there is nothing further to
// print.
static const char* eod = "main: end of data";
115
// usage is the help text. parse_flags and initialize_globals return it as
// their error message when the command line is not well-formed.
static const char* usage =
    "Usage: jsonptr -flags input.json\n"
    "\n"
    "The input.json filename is optional. If absent, it reads from stdin.\n"
    "\n"
    "jsonptr is a JSON formatter (pretty-printer) that supports the JSON\n"
    "Pointer (RFC 6901) query syntax. It reads UTF-8 JSON from stdin and\n"
    "writes canonicalized, formatted UTF-8 JSON to stdout.\n"
    "\n"
    "Canonicalized means that e.g. \"abc\\u000A\\tx\\u0177z\" is re-written\n"
    "as \"abc\\n\\txŷz\". It does not sort object keys, nor does it reject\n"
    "duplicate keys. Canonicalization does not imply Unicode normalization.\n"
    "\n"
    "Formatted means that arrays' and objects' elements are indented, each\n"
    "on its own line. Configure this with the -c / -compact, -i=N / -indent=N\n"
    "(for N ranging from 0 to 8) and -t / -tabs flags.\n"
    "\n"
    "The -q=etc or -query=etc flag gives an optional JSON Pointer query, to\n"
    "print a subset of the input. For example, given RFC 6901 section 5's\n"
    "sample input (https://tools.ietf.org/rfc/rfc6901.txt), this command:\n"
    " jsonptr -query=/foo/1 rfc-6901-json-pointer.json\n"
    "will print:\n"
    " \"baz\"\n"
    "\n"
    "An absent query is equivalent to the empty query, which identifies the\n"
    "entire input (the root value). The \"/\" query is not equivalent to the\n"
    "root value. Instead, it identifies the child (the key-value pair) of the\n"
    "root value whose key is the empty string.\n"
    "\n"
    "If the query found a valid JSON value, this program will return a zero\n"
    "exit code even if the rest of the input isn't valid JSON. If the query\n"
    "did not find a value, or found an invalid one, this program returns a\n"
    "non-zero exit code, but may still print partial output to stdout.\n"
    "\n"
    "The JSON specification (https://json.org/) permits implementations that\n"
    "allow duplicate keys, as this one does. This JSON Pointer implementation\n"
    "is also greedy, following the first match for each fragment without\n"
    "back-tracking. For example, the \"/foo/bar\" query will fail if the root\n"
    "object has multiple \"foo\" children but the first one doesn't have a\n"
    "\"bar\" child, even if later ones do.\n"
    "\n"
    "The -fail-if-unsandboxed flag causes the program to exit if it does not\n"
    "self-impose a sandbox. On Linux, it self-imposes a SECCOMP_MODE_STRICT\n"
    "sandbox, regardless of this flag.";
Nigel Tao0cd2f982020-03-03 23:03:02 +1100160
Nigel Tao2cf76db2020-02-27 22:42:01 +1100161// ----
162
// sandboxed is consulted by initialize_globals to honor the
// -fail-if-unsandboxed flag: if that flag is set while this is still false,
// initialization fails with "main: unsandboxed". Nothing in this part of the
// file sets it to true.
bool sandboxed = false;

// input_file_descriptor is the file descriptor that read_src reads from.
int input_file_descriptor = 0;  // A 0 default means stdin.
166
// MAX_INDENT is the largest per-level indent width, matching the upper bound
// (8) that parse_flags accepts for the -i=N / -indent=N flag.
#define MAX_INDENT 8

// INDENT_SPACES_STRING must hold at least MAX_INDENT spaces: handle_token
// writes its first flags.indent bytes once per level of indentation, so a
// shorter literal would be read out of bounds.
#define INDENT_SPACES_STRING "        "
#define INDENT_TAB_STRING "\t"

static_assert(sizeof(INDENT_SPACES_STRING) - 1 >= MAX_INDENT,
              "INDENT_SPACES_STRING is too short for MAX_INDENT");
Nigel Tao107f0ef2020-03-01 21:35:02 +1100170
// The *_BUFFER_ARRAY_SIZE macros are the element counts of the three
// statically allocated arrays below. Each one is only defined here if it has
// not already been defined, so a build can override it.
#ifndef DST_BUFFER_ARRAY_SIZE
#define DST_BUFFER_ARRAY_SIZE (32 * 1024)
#endif
#ifndef SRC_BUFFER_ARRAY_SIZE
#define SRC_BUFFER_ARRAY_SIZE (32 * 1024)
#endif
#ifndef TOKEN_BUFFER_ARRAY_SIZE
#define TOKEN_BUFFER_ARRAY_SIZE (4 * 1024)
#endif

// Backing storage for the dst, src and tok buffers. initialize_globals wraps
// each array in the corresponding buffer.
uint8_t dst_array[DST_BUFFER_ARRAY_SIZE];
uint8_t src_array[SRC_BUFFER_ARRAY_SIZE];
wuffs_base__token tok_array[TOKEN_BUFFER_ARRAY_SIZE];

// dst holds output bytes until flush_dst writes them to stdout. src holds
// input bytes filled in by read_src. tok receives the decoder's tokens.
wuffs_base__io_buffer dst;
wuffs_base__io_buffer src;
wuffs_base__token_buffer tok;

// curr_token_end_src_index is the src.data.ptr index of the end of the current
// token. An invariant is that (curr_token_end_src_index <= src.meta.ri).
size_t curr_token_end_src_index;

// depth is the container nesting level. handle_token increments it for each
// '[' or '{' and decrements it for each ']' or '}'. It is also reset to zero
// when a query is completely matched, so that later indentation is relative to
// the matched value.
uint32_t depth;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100194
// context describes where the most recently completed token left us, relative
// to the innermost enclosing container. handle_token uses it to decide which
// punctuation (",", ":") and whitespace precede the next token.
enum class context {
  // Not inside any container: about to handle a top-level value (or the value
  // that a query matched).
  none,
  // Just after a '[': no list element has been seen yet.
  in_list_after_bracket,
  // Just after a list element.
  in_list_after_value,
  // Just after a '{': no key has been seen yet.
  in_dict_after_brace,
  // Just after an object key: a value comes next.
  in_dict_after_key,
  // Just after an object value: a key (or the closing '}') comes next.
  in_dict_after_value,
} ctx;
203
Nigel Tao0cd2f982020-03-03 23:03:02 +1100204bool //
205in_dict_before_key() {
206 return (ctx == context::in_dict_after_brace) ||
207 (ctx == context::in_dict_after_value);
208}
209
210bool suppress_write_dst;
211bool wrote_to_dst;
212
Nigel Tao1b073492020-02-16 22:11:36 +1100213wuffs_json__decoder dec;
Nigel Tao1b073492020-02-16 22:11:36 +1100214
Nigel Tao0cd2f982020-03-03 23:03:02 +1100215// ----
216
217// Query is a JSON Pointer query. After initializing with a NUL-terminated C
218// string, its multiple fragments are consumed as the program walks the JSON
219// data from stdin. For example, letting "$" denote a NUL, suppose that we
220// started with a query string of "/apple/banana/12/durian" and are currently
221// trying to match the second fragment, "banana", so that Query::depth is 2:
222//
//  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//  / a p p l e / b a n a n a / 1 2 / d u r i a n $
//  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//                ^           ^
//                frag_i      frag_k
228//
229// The two pointers frag_i and frag_k are the start (inclusive) and end
230// (exclusive) of the fragment. They satisfy (frag_i <= frag_k) and may be
// equal if the fragment is empty (note that "" is a valid JSON object key).
232//
233// The frag_j pointer moves between these two, or is nullptr. An invariant is
234// that (((frag_i <= frag_j) && (frag_j <= frag_k)) || (frag_j == nullptr)).
235//
236// Wuffs' JSON tokenizer can portray a single JSON string as multiple Wuffs
237// tokens, as backslash-escaped values within that JSON string may each get
238// their own token.
239//
240// At the start of each object key (a JSON string), frag_j is set to frag_i.
241//
242// While frag_j remains non-nullptr, each token's unescaped contents are then
243// compared to that part of the fragment from frag_j to frag_k. If it is a
244// prefix (including the case of an exact match), then frag_j is advanced by
245// the unescaped length. Otherwise, frag_j is set to nullptr.
246//
247// Comparison accounts for JSON Pointer's escaping notation: "~0" and "~1" in
248// the query (not the JSON value) are unescaped to "~" and "/" respectively.
249//
250// The frag_j pointer therefore advances from frag_i to frag_k, or drops out,
251// as we incrementally match the object key with the query fragment. For
252// example, if we have already matched the "ban" of "banana", then we would
253// accept any of an "ana" token, an "a" token or a "\u0061" token, amongst
254// others. They would advance frag_j by 3, 1 or 1 bytes respectively.
255//
//                      frag_j
//                      v
//  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//  / a p p l e / b a n a n a / 1 2 / d u r i a n $
//  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//                ^           ^
//                frag_i      frag_k
263//
264// At the end of each object key (or equivalently, at the start of each object
265// value), if frag_j is non-nullptr and equal to (but not less than) frag_k
266// then we have a fragment match: the query fragment equals the object key. If
267// there is a next fragment (in this example, "12") we move the frag_etc
268// pointers to its start and end and increment Query::depth. Otherwise, we have
269// matched the complete query, and the upcoming JSON value is the result of
270// that query.
271//
272// The discussion above centers on object keys. If the query fragment is
273// numeric then it can also match as an array index: the string fragment "12"
274// will match an array's 13th element (starting counting from zero). See RFC
275// 6901 for its precise definition of an "array index" number.
276//
277// Array index fragment match is represented by the Query::array_index field,
278// whose type (wuffs_base__result_u64) is a result type. An error result means
279// that the fragment is not an array index. A value result holds the number of
280// list elements remaining. When matching a query fragment in an array (instead
281// of in an object), each element ticks this number down towards zero. At zero,
282// the upcoming JSON value is the one that matches the query fragment.
283class Query {
284 private:
285 uint8_t* frag_i;
286 uint8_t* frag_j;
287 uint8_t* frag_k;
288
289 uint32_t depth;
290
291 wuffs_base__result_u64 array_index;
292
293 public:
294 void reset(char* query_c_string) {
295 this->frag_i = (uint8_t*)query_c_string;
296 this->frag_j = (uint8_t*)query_c_string;
297 this->frag_k = (uint8_t*)query_c_string;
298 this->depth = 0;
299 this->array_index.status.repr = "#main: not an array index query fragment";
300 this->array_index.value = 0;
301 }
302
303 void restart_fragment(bool enable) {
304 this->frag_j = enable ? this->frag_i : nullptr;
305 }
306
307 bool is_at(uint32_t depth) { return this->depth == depth; }
308
309 // tick returns whether the fragment is a valid array index whose value is
310 // zero. If valid but non-zero, it decrements it and returns false.
311 bool tick() {
312 if (this->array_index.status.is_ok()) {
313 if (this->array_index.value == 0) {
314 return true;
315 }
316 this->array_index.value--;
317 }
318 return false;
319 }
320
321 // next_fragment moves to the next fragment, returning whether it existed.
322 bool next_fragment() {
323 uint8_t* k = this->frag_k;
324 uint32_t d = this->depth;
325
326 this->reset(nullptr);
327
328 if (!k || (*k != '/')) {
329 return false;
330 }
331 k++;
332
333 bool all_digits = true;
334 uint8_t* i = k;
335 while ((*k != '\x00') && (*k != '/')) {
336 all_digits = all_digits && ('0' <= *k) && (*k <= '9');
337 k++;
338 }
339 this->frag_i = i;
340 this->frag_j = i;
341 this->frag_k = k;
342 this->depth = d + 1;
343 if (all_digits) {
344 // wuffs_base__parse_number_u64 rejects leading zeroes, e.g. "00", "07".
345 this->array_index =
346 wuffs_base__parse_number_u64(wuffs_base__make_slice_u8(i, k - i));
347 }
348 return true;
349 }
350
351 bool matched() { return this->frag_j && (this->frag_j == this->frag_k); }
352
353 void incremental_match_slice(uint8_t* ptr, size_t len) {
354 if (!this->frag_j) {
355 return;
356 }
357 uint8_t* j = this->frag_j;
358 while (true) {
359 if (len == 0) {
360 this->frag_j = j;
361 return;
362 }
363
364 if (*j == '\x00') {
365 break;
366
367 } else if (*j == '~') {
368 j++;
369 if (*j == '0') {
370 if (*ptr != '~') {
371 break;
372 }
373 } else if (*j == '1') {
374 if (*ptr != '/') {
375 break;
376 }
377 } else {
378 break;
379 }
380
381 } else if (*j != *ptr) {
382 break;
383 }
384
385 j++;
386 ptr++;
387 len--;
388 }
389 this->frag_j = nullptr;
390 }
391
392 void incremental_match_code_point(uint32_t code_point) {
393 if (!this->frag_j) {
394 return;
395 }
396 uint8_t u[WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL];
397 size_t n = wuffs_base__utf_8__encode(
398 wuffs_base__make_slice_u8(&u[0],
399 WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL),
400 code_point);
401 if (n > 0) {
402 this->incremental_match_slice(&u[0], n);
403 }
404 }
405
406 // validate returns whether the (ptr, len) arguments form a valid JSON
407 // Pointer. In particular, it must be valid UTF-8, and either be empty or
408 // start with a '/'. Any '~' within must immediately be followed by either
409 // '0' or '1'.
410 static bool validate(char* query_c_string, size_t length) {
411 if (length <= 0) {
412 return true;
413 }
414 if (query_c_string[0] != '/') {
415 return false;
416 }
417 wuffs_base__slice_u8 s =
418 wuffs_base__make_slice_u8((uint8_t*)query_c_string, length);
419 bool previous_was_tilde = false;
420 while (s.len > 0) {
421 wuffs_base__utf_8__next__output o = wuffs_base__utf_8__next(s);
422 if (!o.is_valid()) {
423 return false;
424 }
425 if (previous_was_tilde && (o.code_point != '0') &&
426 (o.code_point != '1')) {
427 return false;
428 }
429 previous_was_tilde = o.code_point == '~';
430 s.ptr += o.byte_length;
431 s.len -= o.byte_length;
432 }
433 return !previous_was_tilde;
434 }
435} query;
436
437// ----
438
// flags holds the parsed command line. It starts out zeroed and parse_flags
// fills it in.
struct {
  // The arguments left over after the flags, as set by parse_flags.
  int remaining_argc;
  char** remaining_argv;

  bool compact;              // -c / -compact
  bool fail_if_unsandboxed;  // -fail-if-unsandboxed
  size_t indent;             // -i=N / -indent=N. parse_flags defaults it to 4.
  char* query_c_string;      // -q=etc / -query=etc. Null if no such flag.
  bool tabs;                 // -t / -tabs
} flags = {0};
449
450const char* //
451parse_flags(int argc, char** argv) {
Nigel Tao6e7d1412020-03-06 09:21:35 +1100452 flags.indent = 4;
Nigel Tao68920952020-03-03 11:25:18 +1100453
454 int c = (argc > 0) ? 1 : 0; // Skip argv[0], the program name.
455 for (; c < argc; c++) {
456 char* arg = argv[c];
457 if (*arg++ != '-') {
458 break;
459 }
460
461 // A double-dash "--foo" is equivalent to a single-dash "-foo". As special
462 // cases, a bare "-" is not a flag (some programs may interpret it as
463 // stdin) and a bare "--" means to stop parsing flags.
464 if (*arg == '\x00') {
465 break;
466 } else if (*arg == '-') {
467 arg++;
468 if (*arg == '\x00') {
469 c++;
470 break;
471 }
472 }
473
474 if (!strcmp(arg, "c") || !strcmp(arg, "compact")) {
475 flags.compact = true;
476 continue;
477 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100478 if (!strcmp(arg, "fail-if-unsandboxed")) {
479 flags.fail_if_unsandboxed = true;
480 continue;
481 }
Nigel Tao68920952020-03-03 11:25:18 +1100482 if (!strncmp(arg, "i=", 2) || !strncmp(arg, "indent=", 7)) {
483 while (*arg++ != '=') {
484 }
485 if (('0' <= arg[0]) && (arg[0] <= '8') && (arg[1] == '\x00')) {
486 flags.indent = arg[0] - '0';
Nigel Tao68920952020-03-03 11:25:18 +1100487 continue;
488 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100489 return usage;
490 }
491 if (!strncmp(arg, "q=", 2) || !strncmp(arg, "query=", 6)) {
492 while (*arg++ != '=') {
493 }
494 if (Query::validate(arg, strlen(arg))) {
495 flags.query_c_string = arg;
496 continue;
497 }
498 return usage;
Nigel Tao68920952020-03-03 11:25:18 +1100499 }
500 if (!strcmp(arg, "t") || !strcmp(arg, "tabs")) {
501 flags.tabs = true;
502 continue;
503 }
504
Nigel Tao0cd2f982020-03-03 23:03:02 +1100505 return usage;
Nigel Tao68920952020-03-03 11:25:18 +1100506 }
507
508 flags.remaining_argc = argc - c;
509 flags.remaining_argv = argv + c;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100510 return nullptr;
Nigel Tao68920952020-03-03 11:25:18 +1100511}
512
Nigel Tao2cf76db2020-02-27 22:42:01 +1100513const char* //
514initialize_globals(int argc, char** argv) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100515 dst = wuffs_base__make_io_buffer(
Nigel Taofdac24a2020-03-06 21:53:08 +1100516 wuffs_base__make_slice_u8(dst_array, DST_BUFFER_ARRAY_SIZE),
Nigel Tao2cf76db2020-02-27 22:42:01 +1100517 wuffs_base__empty_io_buffer_meta());
Nigel Tao1b073492020-02-16 22:11:36 +1100518
Nigel Tao2cf76db2020-02-27 22:42:01 +1100519 src = wuffs_base__make_io_buffer(
Nigel Taofdac24a2020-03-06 21:53:08 +1100520 wuffs_base__make_slice_u8(src_array, SRC_BUFFER_ARRAY_SIZE),
Nigel Tao2cf76db2020-02-27 22:42:01 +1100521 wuffs_base__empty_io_buffer_meta());
522
523 tok = wuffs_base__make_token_buffer(
Nigel Taofdac24a2020-03-06 21:53:08 +1100524 wuffs_base__make_slice_token(tok_array, TOKEN_BUFFER_ARRAY_SIZE),
Nigel Tao2cf76db2020-02-27 22:42:01 +1100525 wuffs_base__empty_token_buffer_meta());
526
527 curr_token_end_src_index = 0;
528
Nigel Tao2cf76db2020-02-27 22:42:01 +1100529 depth = 0;
530
531 ctx = context::none;
532
Nigel Tao68920952020-03-03 11:25:18 +1100533 TRY(parse_flags(argc, argv));
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100534 if (flags.fail_if_unsandboxed && !sandboxed) {
535 return "main: unsandboxed";
536 }
Nigel Tao01abc842020-03-06 21:42:33 +1100537 const int stdin_fd = 0;
538 if (flags.remaining_argc > ((input_file_descriptor != stdin_fd) ? 1 : 0)) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100539 return usage;
Nigel Tao107f0ef2020-03-01 21:35:02 +1100540 }
541
Nigel Tao0cd2f982020-03-03 23:03:02 +1100542 query.reset(flags.query_c_string);
543
544 // If the query is non-empty, suprress writing to stdout until we've
545 // completed the query.
546 suppress_write_dst = query.next_fragment();
547 wrote_to_dst = false;
548
Nigel Tao2cf76db2020-02-27 22:42:01 +1100549 return dec.initialize(sizeof__wuffs_json__decoder(), WUFFS_VERSION, 0)
550 .message();
551}
Nigel Tao1b073492020-02-16 22:11:36 +1100552
553// ----
554
// ignore_return_value deliberately discards its argument. Passing a call's
// result through it suppresses errors from -Wall -Werror.
static void  //
ignore_return_value(int ignored) {
  (void)ignored;
}
558
Nigel Tao2914bae2020-02-26 09:40:30 +1100559const char* //
560read_src() {
Nigel Taoa8406922020-02-19 12:22:00 +1100561 if (src.meta.closed) {
Nigel Tao9cc2c252020-02-23 17:05:49 +1100562 return "main: internal error: read requested on a closed source";
Nigel Taoa8406922020-02-19 12:22:00 +1100563 }
Nigel Tao1b073492020-02-16 22:11:36 +1100564 src.compact();
565 if (src.meta.wi >= src.data.len) {
566 return "main: src buffer is full";
567 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100568 while (true) {
Nigel Tao01abc842020-03-06 21:42:33 +1100569 ssize_t n = read(input_file_descriptor, src.data.ptr + src.meta.wi,
570 src.data.len - src.meta.wi);
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100571 if (n >= 0) {
572 src.meta.wi += n;
573 src.meta.closed = n == 0;
574 break;
575 } else if (errno != EINTR) {
576 return strerror(errno);
577 }
Nigel Tao1b073492020-02-16 22:11:36 +1100578 }
579 return nullptr;
580}
581
Nigel Tao2914bae2020-02-26 09:40:30 +1100582const char* //
583flush_dst() {
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100584 while (true) {
585 size_t n = dst.meta.wi - dst.meta.ri;
586 if (n == 0) {
587 break;
Nigel Tao1b073492020-02-16 22:11:36 +1100588 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100589 const int stdout_fd = 1;
590 ssize_t i = write(stdout_fd, dst.data.ptr + dst.meta.ri, n);
591 if (i >= 0) {
592 dst.meta.ri += i;
593 } else if (errno != EINTR) {
594 return strerror(errno);
595 }
Nigel Tao1b073492020-02-16 22:11:36 +1100596 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100597 dst.compact();
Nigel Tao1b073492020-02-16 22:11:36 +1100598 return nullptr;
599}
600
Nigel Tao2914bae2020-02-26 09:40:30 +1100601const char* //
602write_dst(const void* s, size_t n) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100603 if (suppress_write_dst) {
604 return nullptr;
605 }
Nigel Tao1b073492020-02-16 22:11:36 +1100606 const uint8_t* p = static_cast<const uint8_t*>(s);
607 while (n > 0) {
608 size_t i = dst.writer_available();
609 if (i == 0) {
610 const char* z = flush_dst();
611 if (z) {
612 return z;
613 }
614 i = dst.writer_available();
615 if (i == 0) {
616 return "main: dst buffer is full";
617 }
618 }
619
620 if (i > n) {
621 i = n;
622 }
623 memcpy(dst.data.ptr + dst.meta.wi, p, i);
624 dst.meta.wi += i;
625 p += i;
626 n -= i;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100627 wrote_to_dst = true;
Nigel Tao1b073492020-02-16 22:11:36 +1100628 }
629 return nullptr;
630}
631
632// ----
633
// hex_digit returns the upper-case ASCII hexadecimal digit for the low four
// bits of nibble. The high four bits are ignored.
uint8_t  //
hex_digit(uint8_t nibble) {
  static const char digits[] = "0123456789ABCDEF";
  return (uint8_t)digits[nibble & 0x0F];
}
642
Nigel Tao2914bae2020-02-26 09:40:30 +1100643const char* //
Nigel Tao3b486982020-02-27 15:05:59 +1100644handle_unicode_code_point(uint32_t ucp) {
645 if (ucp < 0x0020) {
646 switch (ucp) {
647 case '\b':
648 return write_dst("\\b", 2);
649 case '\f':
650 return write_dst("\\f", 2);
651 case '\n':
652 return write_dst("\\n", 2);
653 case '\r':
654 return write_dst("\\r", 2);
655 case '\t':
656 return write_dst("\\t", 2);
657 default: {
658 // Other bytes less than 0x0020 are valid UTF-8 but not valid in a
659 // JSON string. They need to remain escaped.
660 uint8_t esc6[6];
661 esc6[0] = '\\';
662 esc6[1] = 'u';
663 esc6[2] = '0';
664 esc6[3] = '0';
665 esc6[4] = hex_digit(ucp >> 4);
666 esc6[5] = hex_digit(ucp >> 0);
667 return write_dst(&esc6[0], 6);
668 }
669 }
670
Nigel Taob9ad34f2020-03-03 12:44:01 +1100671 } else if (ucp == '\"') {
672 return write_dst("\\\"", 2);
673
674 } else if (ucp == '\\') {
675 return write_dst("\\\\", 2);
676
677 } else {
678 uint8_t u[WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL];
679 size_t n = wuffs_base__utf_8__encode(
680 wuffs_base__make_slice_u8(&u[0],
681 WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL),
682 ucp);
683 if (n > 0) {
684 return write_dst(&u[0], n);
Nigel Tao3b486982020-02-27 15:05:59 +1100685 }
Nigel Tao3b486982020-02-27 15:05:59 +1100686 }
687
Nigel Tao2cf76db2020-02-27 22:42:01 +1100688 return "main: internal error: unexpected Unicode code point";
Nigel Tao3b486982020-02-27 15:05:59 +1100689}
690
691const char* //
Nigel Tao2cf76db2020-02-27 22:42:01 +1100692handle_token(wuffs_base__token t) {
693 do {
694 uint64_t vbc = t.value_base_category();
695 uint64_t vbd = t.value_base_detail();
696 uint64_t len = t.length();
Nigel Tao1b073492020-02-16 22:11:36 +1100697
698 // Handle ']' or '}'.
Nigel Tao9f7a2502020-02-23 09:42:02 +1100699 if ((vbc == WUFFS_BASE__TOKEN__VBC__STRUCTURE) &&
Nigel Tao2cf76db2020-02-27 22:42:01 +1100700 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__POP)) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100701 if (query.is_at(depth)) {
702 return "main: no match for query";
703 }
Nigel Tao1b073492020-02-16 22:11:36 +1100704 if (depth <= 0) {
705 return "main: internal error: inconsistent depth";
706 }
707 depth--;
708
709 // Write preceding whitespace.
710 if ((ctx != context::in_list_after_bracket) &&
Nigel Tao68920952020-03-03 11:25:18 +1100711 (ctx != context::in_dict_after_brace) && !flags.compact) {
Nigel Tao1b073492020-02-16 22:11:36 +1100712 TRY(write_dst("\n", 1));
Nigel Tao0cd2f982020-03-03 23:03:02 +1100713 for (uint32_t i = 0; i < depth; i++) {
Nigel Tao6e7d1412020-03-06 09:21:35 +1100714 TRY(write_dst(flags.tabs ? INDENT_TAB_STRING : INDENT_SPACES_STRING,
715 flags.tabs ? 1 : flags.indent));
Nigel Tao1b073492020-02-16 22:11:36 +1100716 }
717 }
718
Nigel Tao9f7a2502020-02-23 09:42:02 +1100719 TRY(write_dst(
720 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_LIST) ? "]" : "}", 1));
721 ctx = (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST)
722 ? context::in_list_after_value
723 : context::in_dict_after_key;
Nigel Tao1b073492020-02-16 22:11:36 +1100724 goto after_value;
725 }
726
Nigel Taod1c928a2020-02-28 12:43:53 +1100727 // Write preceding whitespace and punctuation, if it wasn't ']', '}' or a
728 // continuation of a multi-token chain.
Nigel Tao0cd2f982020-03-03 23:03:02 +1100729 if (!t.link_prev()) {
730 if (ctx == context::in_dict_after_key) {
731 TRY(write_dst(": ", flags.compact ? 1 : 2));
732 } else if (ctx != context::none) {
733 if ((ctx != context::in_list_after_bracket) &&
734 (ctx != context::in_dict_after_brace)) {
735 TRY(write_dst(",", 1));
Nigel Tao107f0ef2020-03-01 21:35:02 +1100736 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100737 if (!flags.compact) {
738 TRY(write_dst("\n", 1));
739 for (size_t i = 0; i < depth; i++) {
Nigel Tao6e7d1412020-03-06 09:21:35 +1100740 TRY(write_dst(flags.tabs ? INDENT_TAB_STRING : INDENT_SPACES_STRING,
741 flags.tabs ? 1 : flags.indent));
Nigel Tao0cd2f982020-03-03 23:03:02 +1100742 }
743 }
744 }
745
746 bool query_matched = false;
747 if (query.is_at(depth)) {
748 switch (ctx) {
749 case context::in_list_after_bracket:
750 case context::in_list_after_value:
751 query_matched = query.tick();
752 break;
753 case context::in_dict_after_key:
754 query_matched = query.matched();
755 break;
756 }
757 }
758 if (!query_matched) {
759 // No-op.
760 } else if (!query.next_fragment()) {
761 // There is no next fragment. We have matched the complete query, and
762 // the upcoming JSON value is the result of that query.
763 //
764 // Un-suppress writing to stdout and reset the ctx and depth as if we
765 // were about to decode a top-level value. This makes any subsequent
766 // indentation be relative to this point, and we will return eod after
767 // the upcoming JSON value is complete.
768 suppress_write_dst = false;
769 ctx = context::none;
770 depth = 0;
771 } else if ((vbc != WUFFS_BASE__TOKEN__VBC__STRUCTURE) ||
772 !(vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__PUSH)) {
773 // The query has moved on to the next fragment but the upcoming JSON
774 // value is not a container.
775 return "main: no match for query";
Nigel Tao1b073492020-02-16 22:11:36 +1100776 }
777 }
778
779 // Handle the token itself: either a container ('[' or '{') or a simple
Nigel Tao85fba7f2020-02-29 16:28:06 +1100780 // value: string (a chain of raw or escaped parts), literal or number.
Nigel Tao1b073492020-02-16 22:11:36 +1100781 switch (vbc) {
Nigel Tao85fba7f2020-02-29 16:28:06 +1100782 case WUFFS_BASE__TOKEN__VBC__STRUCTURE:
783 TRY(write_dst(
784 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST) ? "[" : "{", 1));
785 depth++;
786 ctx = (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST)
787 ? context::in_list_after_bracket
788 : context::in_dict_after_brace;
789 return nullptr;
790
Nigel Tao2cf76db2020-02-27 22:42:01 +1100791 case WUFFS_BASE__TOKEN__VBC__STRING:
Nigel Taod1c928a2020-02-28 12:43:53 +1100792 if (!t.link_prev()) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100793 TRY(write_dst("\"", 1));
Nigel Tao0cd2f982020-03-03 23:03:02 +1100794 query.restart_fragment(in_dict_before_key() && query.is_at(depth));
Nigel Tao2cf76db2020-02-27 22:42:01 +1100795 }
Nigel Taocb37a562020-02-28 09:56:24 +1100796
797 if (vbd & WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_0_DST_1_SRC_DROP) {
798 // No-op.
799 } else if (vbd &
800 WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100801 uint8_t* ptr = src.data.ptr + curr_token_end_src_index - len;
802 TRY(write_dst(ptr, len));
803 query.incremental_match_slice(ptr, len);
Nigel Taocb37a562020-02-28 09:56:24 +1100804 } else {
805 return "main: internal error: unexpected string-token conversion";
806 }
807
Nigel Taod1c928a2020-02-28 12:43:53 +1100808 if (t.link_next()) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100809 return nullptr;
810 }
811 TRY(write_dst("\"", 1));
812 goto after_value;
813
814 case WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT:
Nigel Tao0cd2f982020-03-03 23:03:02 +1100815 if (!t.link_prev() || !t.link_next()) {
816 return "main: internal error: unexpected unlinked token";
817 }
818 TRY(handle_unicode_code_point(vbd));
819 query.incremental_match_code_point(vbd);
820 return nullptr;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100821
Nigel Tao85fba7f2020-02-29 16:28:06 +1100822 case WUFFS_BASE__TOKEN__VBC__LITERAL:
Nigel Tao2cf76db2020-02-27 22:42:01 +1100823 case WUFFS_BASE__TOKEN__VBC__NUMBER:
824 TRY(write_dst(src.data.ptr + curr_token_end_src_index - len, len));
825 goto after_value;
Nigel Tao1b073492020-02-16 22:11:36 +1100826 }
827
828 // Return an error if we didn't match the (vbc, vbd) pair.
Nigel Tao2cf76db2020-02-27 22:42:01 +1100829 return "main: internal error: unexpected token";
830 } while (0);
Nigel Tao1b073492020-02-16 22:11:36 +1100831
Nigel Tao2cf76db2020-02-27 22:42:01 +1100832 // Book-keeping after completing a value (whether a container value or a
833 // simple value). Empty parent containers are no longer empty. If the parent
834 // container is a "{...}" object, toggle between keys and values.
835after_value:
836 if (depth == 0) {
837 return eod;
838 }
839 switch (ctx) {
840 case context::in_list_after_bracket:
841 ctx = context::in_list_after_value;
842 break;
843 case context::in_dict_after_brace:
844 ctx = context::in_dict_after_key;
845 break;
846 case context::in_dict_after_key:
847 ctx = context::in_dict_after_value;
848 break;
849 case context::in_dict_after_value:
850 ctx = context::in_dict_after_key;
851 break;
852 }
853 return nullptr;
854}
855
856const char* //
857main1(int argc, char** argv) {
858 TRY(initialize_globals(argc, argv));
859
860 while (true) {
861 wuffs_base__status status = dec.decode_tokens(&tok, &src);
862
863 while (tok.meta.ri < tok.meta.wi) {
864 wuffs_base__token t = tok.data.ptr[tok.meta.ri++];
865 uint64_t n = t.length();
866 if ((src.meta.ri - curr_token_end_src_index) < n) {
867 return "main: internal error: inconsistent src indexes";
868 }
869 curr_token_end_src_index += n;
870
871 if (t.value() == 0) {
872 continue;
873 }
874
875 const char* z = handle_token(t);
876 if (z == nullptr) {
877 continue;
878 } else if (z == eod) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100879 goto end_of_data;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100880 }
881 return z;
Nigel Tao1b073492020-02-16 22:11:36 +1100882 }
Nigel Tao2cf76db2020-02-27 22:42:01 +1100883
884 if (status.repr == nullptr) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100885 return "main: internal error: unexpected end of token stream";
Nigel Tao2cf76db2020-02-27 22:42:01 +1100886 } else if (status.repr == wuffs_base__suspension__short_read) {
887 if (curr_token_end_src_index != src.meta.ri) {
888 return "main: internal error: inconsistent src indexes";
889 }
890 TRY(read_src());
891 curr_token_end_src_index = src.meta.ri;
892 } else if (status.repr == wuffs_base__suspension__short_write) {
893 tok.compact();
894 } else {
895 return status.message();
Nigel Tao1b073492020-02-16 22:11:36 +1100896 }
897 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100898end_of_data:
899
900 // With a non-empty query, don't try to consume trailing whitespace or
901 // confirm that we've processed all the tokens.
902 if (flags.query_c_string && *flags.query_c_string) {
903 return nullptr;
904 }
Nigel Tao6b161af2020-02-24 11:01:48 +1100905
Nigel Tao6b161af2020-02-24 11:01:48 +1100906 // Consume an optional whitespace trailer. This isn't part of the JSON spec,
907 // but it works better with line oriented Unix tools (such as "echo 123 |
908 // jsonptr" where it's "echo", not "echo -n") or hand-edited JSON files which
909 // can accidentally contain trailing whitespace.
910 //
911 // A whitespace trailer is zero or more ' ' and then zero or one '\n'.
912 while (true) {
913 if (src.meta.ri < src.meta.wi) {
914 uint8_t c = src.data.ptr[src.meta.ri];
915 if (c == ' ') {
916 src.meta.ri++;
917 continue;
918 } else if (c == '\n') {
919 src.meta.ri++;
920 break;
921 }
922 // The "exhausted the input" check below will fail.
923 break;
924 } else if (src.meta.closed) {
925 break;
926 }
927 TRY(read_src());
928 }
929
930 // Check that we've exhausted the input.
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100931 if ((src.meta.ri == src.meta.wi) && !src.meta.closed) {
932 TRY(read_src());
933 }
Nigel Tao6b161af2020-02-24 11:01:48 +1100934 if ((src.meta.ri < src.meta.wi) || !src.meta.closed) {
935 return "main: valid JSON followed by further (unexpected) data";
936 }
937
938 // Check that we've used all of the decoded tokens, other than trailing
939 // filler tokens. For example, a bare `"foo"` string is valid JSON, but even
940 // without a trailing '\n', the Wuffs JSON parser emits a filler token for
941 // the final '\"'.
942 for (; tok.meta.ri < tok.meta.wi; tok.meta.ri++) {
943 if (tok.data.ptr[tok.meta.ri].value_base_category() !=
944 WUFFS_BASE__TOKEN__VBC__FILLER) {
945 return "main: internal error: decoded OK but unprocessed tokens remain";
946 }
947 }
948
949 return nullptr;
Nigel Tao1b073492020-02-16 22:11:36 +1100950}
951
Nigel Tao2914bae2020-02-26 09:40:30 +1100952int //
953compute_exit_code(const char* status_msg) {
Nigel Tao9cc2c252020-02-23 17:05:49 +1100954 if (!status_msg) {
955 return 0;
956 }
Nigel Tao01abc842020-03-06 21:42:33 +1100957 size_t n;
958 if (status_msg == usage) {
959 n = strlen(status_msg);
960 } else {
Nigel Tao9cc2c252020-02-23 17:05:49 +1100961 n = strnlen(status_msg, 2047);
Nigel Tao01abc842020-03-06 21:42:33 +1100962 if (n >= 2047) {
963 status_msg = "main: internal error: error message is too long";
964 n = strnlen(status_msg, 2047);
965 }
Nigel Tao9cc2c252020-02-23 17:05:49 +1100966 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100967 const int stderr_fd = 2;
968 ignore_return_value(write(stderr_fd, status_msg, n));
969 ignore_return_value(write(stderr_fd, "\n", 1));
Nigel Tao9cc2c252020-02-23 17:05:49 +1100970 // Return an exit code of 1 for regular (forseen) errors, e.g. badly
971 // formatted or unsupported input.
972 //
973 // Return an exit code of 2 for internal (exceptional) errors, e.g. defensive
974 // run-time checks found that an internal invariant did not hold.
975 //
976 // Automated testing, including badly formatted inputs, can therefore
977 // discriminate between expected failure (exit code 1) and unexpected failure
978 // (other non-zero exit codes). Specifically, exit code 2 for internal
979 // invariant violation, exit code 139 (which is 128 + SIGSEGV on x86_64
980 // linux) for a segmentation fault (e.g. null pointer dereference).
981 return strstr(status_msg, "internal error:") ? 2 : 1;
982}
983
Nigel Tao2914bae2020-02-26 09:40:30 +1100984int //
985main(int argc, char** argv) {
Nigel Tao01abc842020-03-06 21:42:33 +1100986 // Look for an input filename (the first non-flag argument) in argv. If there
987 // is one, open it (but do not read from it) before we self-impose a sandbox.
988 //
989 // Flags start with "-", unless it comes after a bare "--" arg.
990 {
991 bool dash_dash = false;
992 int a;
993 for (a = 1; a < argc; a++) {
994 char* arg = argv[a];
995 if ((arg[0] == '-') && !dash_dash) {
996 dash_dash = (arg[1] == '-') && (arg[2] == '\x00');
997 continue;
998 }
999 input_file_descriptor = open(arg, O_RDONLY);
1000 if (input_file_descriptor < 0) {
1001 fprintf(stderr, "%s: %s\n", arg, strerror(errno));
1002 return 1;
1003 }
1004 break;
1005 }
1006 }
1007
Nigel Taofe0cbbd2020-03-05 22:01:30 +11001008#if defined(WUFFS_EXAMPLE_USE_SECCOMP)
1009 prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
1010 sandboxed = true;
1011#endif
1012
Nigel Tao0cd2f982020-03-03 23:03:02 +11001013 const char* z = main1(argc, argv);
1014 if (wrote_to_dst) {
1015 const char* z1 = write_dst("\n", 1);
1016 const char* z2 = flush_dst();
1017 z = z ? z : (z1 ? z1 : z2);
1018 }
1019 int exit_code = compute_exit_code(z);
Nigel Taofe0cbbd2020-03-05 22:01:30 +11001020
1021#if defined(WUFFS_EXAMPLE_USE_SECCOMP)
1022 // Call SYS_exit explicitly, instead of calling SYS_exit_group implicitly by
1023 // either calling _exit or returning from main. SECCOMP_MODE_STRICT allows
1024 // only SYS_exit.
1025 syscall(SYS_exit, exit_code);
1026#endif
Nigel Tao9cc2c252020-02-23 17:05:49 +11001027 return exit_code;
Nigel Tao1b073492020-02-16 22:11:36 +11001028}