blob: 3345b57de648276a6520ccc9aa1ae55a8562d4b3 [file] [log] [blame]
Nigel Tao1b073492020-02-16 22:11:36 +11001// Copyright 2020 The Wuffs Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// ----------------
16
17/*
Nigel Tao0cd2f982020-03-03 23:03:02 +110018jsonptr is a JSON formatter (pretty-printer) that supports the JSON Pointer
19(RFC 6901) query syntax. It reads UTF-8 JSON from stdin and writes
20canonicalized, formatted UTF-8 JSON to stdout.
21
22See the "const char* usage" string below for details.
23
24----
25
26JSON Pointer (and this program's implementation) is one of many JSON query
27languages and JSON tools, such as jq, jql and JMESPath. This one is relatively
28simple and fewer-featured compared to those others.
29
30One benefit of simplicity is that this program's JSON and JSON Pointer
31implementations do not dynamically allocate or free memory (yet it does not
32require that the entire input fits in memory at once). They are therefore
33trivially protected against certain bug classes: memory leaks, double-frees and
34use-after-frees.
35
36The core JSON implementation is also written in the Wuffs programming language
37(and then transpiled to C/C++), which is memory-safe but also guards against
38integer arithmetic overflows.
39
Nigel Taofe0cbbd2020-03-05 22:01:30 +110040For defense in depth, on Linux, this program also self-imposes a
41SECCOMP_MODE_STRICT sandbox before reading (or otherwise processing) its input
42or writing its output. Under this sandbox, the only permitted system calls are
43read, write, exit and sigreturn.
44
Nigel Tao0cd2f982020-03-03 23:03:02 +110045All together, this program aims to safely handle untrusted JSON files without
46fear of security bugs such as remote code execution.
47
48----
Nigel Tao1b073492020-02-16 22:11:36 +110049
Nigel Taoc5b3a9e2020-02-24 11:54:35 +110050As of 2020-02-24, this program passes all 318 "test_parsing" cases from the
51JSON test suite (https://github.com/nst/JSONTestSuite), an appendix to the
52"Parsing JSON is a Minefield" article (http://seriot.ch/parsing_json.php) that
53was first published on 2016-10-26 and updated on 2018-03-30.
54
Nigel Tao0cd2f982020-03-03 23:03:02 +110055After modifying this program, run "build-example.sh example/jsonptr/" and then
56"script/run-json-test-suite.sh" to catch correctness regressions.
57
58----
59
Nigel Tao1b073492020-02-16 22:11:36 +110060This example program differs from most other example Wuffs programs in that it
61is written in C++, not C.
62
63$CXX jsonptr.cc && ./a.out < ../../test/data/github-tags.json; rm -f a.out
64
65for a C++ compiler $CXX, such as clang++ or g++.
66*/
67
Nigel Taofe0cbbd2020-03-05 22:01:30 +110068#include <errno.h>
Nigel Tao01abc842020-03-06 21:42:33 +110069#include <fcntl.h>
70#include <stdio.h>
Nigel Tao9cc2c252020-02-23 17:05:49 +110071#include <string.h>
Nigel Taofe0cbbd2020-03-05 22:01:30 +110072#include <unistd.h>
Nigel Tao1b073492020-02-16 22:11:36 +110073
74// Wuffs ships as a "single file C library" or "header file library" as per
75// https://github.com/nothings/stb/blob/master/docs/stb_howto.txt
76//
77// To use that single file as a "foo.c"-like implementation, instead of a
78// "foo.h"-like header, #define WUFFS_IMPLEMENTATION before #include'ing or
79// compiling it.
80#define WUFFS_IMPLEMENTATION
81
// Defining the WUFFS_CONFIG__MODULE* macros is optional, but it lets users of
// release/c/etc.c whitelist which parts of Wuffs to build. That file contains
// the entire Wuffs standard library, implementing a variety of codecs and file
// formats. Without this macro definition, an optimizing compiler or linker may
// very well discard Wuffs code for unused codecs, but listing the Wuffs
// modules we use makes that process explicit. Preprocessing means that such
// code simply isn't compiled.
89#define WUFFS_CONFIG__MODULES
90#define WUFFS_CONFIG__MODULE__BASE
91#define WUFFS_CONFIG__MODULE__JSON
92
93// If building this program in an environment that doesn't easily accommodate
94// relative includes, you can use the script/inline-c-relative-includes.go
95// program to generate a stand-alone C++ file.
96#include "../../release/c/wuffs-unsupported-snapshot.c"
97
Nigel Taofe0cbbd2020-03-05 22:01:30 +110098#if defined(__linux__)
99#include <linux/prctl.h>
100#include <linux/seccomp.h>
101#include <sys/prctl.h>
102#include <sys/syscall.h>
103#define WUFFS_EXAMPLE_USE_SECCOMP
104#endif
105
// TRY evaluates error_msg (an expression yielding a const char*) once. If the
// result is non-null, it is returned from the enclosing function. Otherwise,
// execution continues with the next statement.
#define TRY(error_msg)                   \
  do {                                   \
    const char* try_error = (error_msg); \
    if (try_error != nullptr) {          \
      return try_error;                  \
    }                                    \
  } while (false)
113
// eod is the "main: end of data" message. It is a distinct pointer value, so
// callers can compare against it by address.
static const char* eod = "main: end of data";

// usage is the help text returned (as an error message) by parse_flags and
// initialize_globals when the command line arguments are not acceptable.
static const char* usage =
    "Usage: jsonptr -flags input.json\n"
    "\n"
    "Flags:\n"
    " -c -compact\n"
    " -i=NUM -indent=NUM\n"
    " -o=NUM -max-output-depth=NUM\n"
    " -q=STR -query=STR\n"
    " -t -tabs\n"
    " -fail-if-unsandboxed\n"
    "\n"
    "The input.json filename is optional. If absent, it reads from stdin.\n"
    "\n"
    "----\n"
    "\n"
    "jsonptr is a JSON formatter (pretty-printer) that supports the JSON\n"
    "Pointer (RFC 6901) query syntax. It reads UTF-8 JSON from stdin (or\n"
    "from the input.json file, if given) and writes canonicalized, formatted\n"
    "UTF-8 JSON to stdout.\n"
    "\n"
    "Canonicalized means that e.g. \"abc\\u000A\\tx\\u0177z\" is re-written\n"
    "as \"abc\\n\\txŷz\". It does not sort object keys, nor does it reject\n"
    "duplicate keys. Canonicalization does not imply Unicode normalization.\n"
    "\n"
    "Formatted means that arrays' and objects' elements are indented, each\n"
    "on its own line. Configure this with the -c / -compact, -i=NUM /\n"
    "-indent=NUM (for NUM ranging from 0 to 8) and -t / -tabs flags.\n"
    "\n"
    "----\n"
    "\n"
    "The -q=STR or -query=STR flag gives an optional JSON Pointer query, to\n"
    "print a subset of the input. For example, given RFC 6901 section 5's\n"
    "sample input (https://tools.ietf.org/rfc/rfc6901.txt), this command:\n"
    " jsonptr -query=/foo/1 rfc-6901-json-pointer.json\n"
    "will print:\n"
    " \"baz\"\n"
    "\n"
    "An absent query is equivalent to the empty query, which identifies the\n"
    "entire input (the root value). Unlike a file system, the \"/\" query\n"
    "does not identify the root. Instead, it identifies the child (the value\n"
    "in a key-value pair) of the root whose key is the empty string.\n"
    "Similarly, \"/foo\" and \"/foo/\" identify two different nodes.\n"
    "\n"
    "If the query found a valid JSON value, this program will return a zero\n"
    "exit code even if the rest of the input isn't valid JSON. If the query\n"
    "did not find a value, or found an invalid one, this program returns a\n"
    "non-zero exit code, but may still print partial output to stdout.\n"
    "\n"
    "The JSON specification (https://json.org/) permits implementations that\n"
    "allow duplicate keys, as this one does. This JSON Pointer implementation\n"
    "is also greedy, following the first match for each fragment without\n"
    "back-tracking. For example, the \"/foo/bar\" query will fail if the root\n"
    "object has multiple \"foo\" children but the first one doesn't have a\n"
    "\"bar\" child, even if later ones do.\n"
    "\n"
    "----\n"
    "\n"
    "The -o=NUM or -max-output-depth=NUM flag gives the maximum (inclusive)\n"
    "output depth. JSON containers ([] arrays and {} objects) can hold other\n"
    "containers. When this flag is set, containers at depth NUM are replaced\n"
    "with \"[…]\" or \"{…}\". A bare -o or -max-output-depth is equivalent to\n"
    "NUM=1. The flag's absence is equivalent to an unlimited output depth.\n"
    "\n"
    "The -max-output-depth flag only affects the program's output. It doesn't\n"
    "affect whether or not the input is considered valid JSON. The JSON\n"
    "specification permits implementations to set their own maximum input\n"
    "depth. This JSON implementation sets it to 1024.\n"
    "\n"
    "Depth is measured in terms of nested containers. It is unaffected by the\n"
    "number of spaces or tabs used to indent.\n"
    "\n"
    "When both -max-output-depth and -query are set, the output depth is\n"
    "measured from when the query resolves, not from the input root. The\n"
    "input depth (measured from the root) is still limited to 1024.\n"
    "\n"
    "----\n"
    "\n"
    "The -fail-if-unsandboxed flag causes the program to exit if it does not\n"
    "self-impose a sandbox. On Linux, it self-imposes a SECCOMP_MODE_STRICT\n"
    "sandbox, regardless of whether this flag was set.";
Nigel Tao0cd2f982020-03-03 23:03:02 +1100195
Nigel Tao2cf76db2020-02-27 22:42:01 +1100196// ----
197
// sandboxed is consulted by initialize_globals when the -fail-if-unsandboxed
// flag is set. It starts out false.
bool sandboxed{false};

// input_file_descriptor is the file descriptor that read_src reads from. The
// default of 0 means stdin.
int input_file_descriptor{0};
201
// MAX_INDENT matches the upper end of the 0 to 8 range accepted by the
// -i=NUM / -indent=NUM flag.
#define MAX_INDENT 8

// INDENT_SPACES_STRING and INDENT_TAB_STRING supply the bytes written for one
// level of indentation. When indenting with spaces, flags.indent bytes (up to
// MAX_INDENT) are copied out of INDENT_SPACES_STRING, so it must contain at
// least MAX_INDENT spaces. Anything shorter makes that copy read out of
// bounds.
#define INDENT_SPACES_STRING "        "
#define INDENT_TAB_STRING "\t"

static_assert((sizeof(INDENT_SPACES_STRING) - 1) >= MAX_INDENT,
              "INDENT_SPACES_STRING must hold at least MAX_INDENT spaces");
Nigel Tao107f0ef2020-03-01 21:35:02 +1100205
Nigel Taofdac24a2020-03-06 21:53:08 +1100206#ifndef DST_BUFFER_ARRAY_SIZE
207#define DST_BUFFER_ARRAY_SIZE (32 * 1024)
Nigel Tao1b073492020-02-16 22:11:36 +1100208#endif
Nigel Taofdac24a2020-03-06 21:53:08 +1100209#ifndef SRC_BUFFER_ARRAY_SIZE
210#define SRC_BUFFER_ARRAY_SIZE (32 * 1024)
Nigel Tao1b073492020-02-16 22:11:36 +1100211#endif
Nigel Taofdac24a2020-03-06 21:53:08 +1100212#ifndef TOKEN_BUFFER_ARRAY_SIZE
213#define TOKEN_BUFFER_ARRAY_SIZE (4 * 1024)
Nigel Tao1b073492020-02-16 22:11:36 +1100214#endif
215
Nigel Taofdac24a2020-03-06 21:53:08 +1100216uint8_t dst_array[DST_BUFFER_ARRAY_SIZE];
217uint8_t src_array[SRC_BUFFER_ARRAY_SIZE];
218wuffs_base__token tok_array[TOKEN_BUFFER_ARRAY_SIZE];
Nigel Tao1b073492020-02-16 22:11:36 +1100219
220wuffs_base__io_buffer dst;
221wuffs_base__io_buffer src;
222wuffs_base__token_buffer tok;
223
// curr_token_end_src_index is the index, into src.data.ptr, of the end of the
// current token. It never exceeds src.meta.ri, i.e. the invariant is that
// (curr_token_end_src_index <= src.meta.ri).
size_t curr_token_end_src_index = 0;

// depth is the current container nesting depth. initialize_globals resets it
// to zero.
uint32_t depth = 0;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100229
// context says where we are relative to the enclosing JSON container (a list
// or a dict), if any. ctx holds the current such position.
enum class context {
  none,
  in_list_after_bracket,
  in_list_after_value,
  in_dict_after_brace,
  in_dict_after_key,
  in_dict_after_value,
} ctx;

// in_dict_before_key returns whether ctx is one of the two dict positions
// (just after the opening brace, or just after a value) where the next thing
// would be a key.
bool  //
in_dict_before_key() {
  switch (ctx) {
    case context::in_dict_after_brace:
    case context::in_dict_after_value:
      return true;
    default:
      return false;
  }
}
244
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100245uint32_t suppress_write_dst;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100246bool wrote_to_dst;
247
Nigel Tao1b073492020-02-16 22:11:36 +1100248wuffs_json__decoder dec;
Nigel Tao1b073492020-02-16 22:11:36 +1100249
Nigel Tao0cd2f982020-03-03 23:03:02 +1100250// ----
251
252// Query is a JSON Pointer query. After initializing with a NUL-terminated C
253// string, its multiple fragments are consumed as the program walks the JSON
254// data from stdin. For example, letting "$" denote a NUL, suppose that we
255// started with a query string of "/apple/banana/12/durian" and are currently
256// trying to match the second fragment, "banana", so that Query::depth is 2:
257//
258// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
259// / a p p l e / b a n a n a / 1 2 / d u r i a n $
260// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
261// ^ ^
262// frag_i frag_k
263//
// The two pointers frag_i and frag_k are the start (inclusive) and end
// (exclusive) of the fragment. They satisfy (frag_i <= frag_k) and may be
// equal if the fragment is empty (note that "" is a valid JSON object key).
267//
268// The frag_j pointer moves between these two, or is nullptr. An invariant is
269// that (((frag_i <= frag_j) && (frag_j <= frag_k)) || (frag_j == nullptr)).
270//
271// Wuffs' JSON tokenizer can portray a single JSON string as multiple Wuffs
272// tokens, as backslash-escaped values within that JSON string may each get
273// their own token.
274//
275// At the start of each object key (a JSON string), frag_j is set to frag_i.
276//
277// While frag_j remains non-nullptr, each token's unescaped contents are then
278// compared to that part of the fragment from frag_j to frag_k. If it is a
279// prefix (including the case of an exact match), then frag_j is advanced by
280// the unescaped length. Otherwise, frag_j is set to nullptr.
281//
282// Comparison accounts for JSON Pointer's escaping notation: "~0" and "~1" in
283// the query (not the JSON value) are unescaped to "~" and "/" respectively.
284//
285// The frag_j pointer therefore advances from frag_i to frag_k, or drops out,
286// as we incrementally match the object key with the query fragment. For
287// example, if we have already matched the "ban" of "banana", then we would
288// accept any of an "ana" token, an "a" token or a "\u0061" token, amongst
289// others. They would advance frag_j by 3, 1 or 1 bytes respectively.
290//
291// frag_j
292// v
293// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
294// / a p p l e / b a n a n a / 1 2 / d u r i a n $
295// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
296// ^ ^
297// frag_i frag_k
298//
299// At the end of each object key (or equivalently, at the start of each object
300// value), if frag_j is non-nullptr and equal to (but not less than) frag_k
301// then we have a fragment match: the query fragment equals the object key. If
302// there is a next fragment (in this example, "12") we move the frag_etc
303// pointers to its start and end and increment Query::depth. Otherwise, we have
304// matched the complete query, and the upcoming JSON value is the result of
305// that query.
306//
307// The discussion above centers on object keys. If the query fragment is
308// numeric then it can also match as an array index: the string fragment "12"
309// will match an array's 13th element (starting counting from zero). See RFC
310// 6901 for its precise definition of an "array index" number.
311//
312// Array index fragment match is represented by the Query::array_index field,
313// whose type (wuffs_base__result_u64) is a result type. An error result means
314// that the fragment is not an array index. A value result holds the number of
315// list elements remaining. When matching a query fragment in an array (instead
316// of in an object), each element ticks this number down towards zero. At zero,
317// the upcoming JSON value is the one that matches the query fragment.
318class Query {
319 private:
320 uint8_t* frag_i;
321 uint8_t* frag_j;
322 uint8_t* frag_k;
323
324 uint32_t depth;
325
326 wuffs_base__result_u64 array_index;
327
328 public:
329 void reset(char* query_c_string) {
330 this->frag_i = (uint8_t*)query_c_string;
331 this->frag_j = (uint8_t*)query_c_string;
332 this->frag_k = (uint8_t*)query_c_string;
333 this->depth = 0;
334 this->array_index.status.repr = "#main: not an array index query fragment";
335 this->array_index.value = 0;
336 }
337
338 void restart_fragment(bool enable) {
339 this->frag_j = enable ? this->frag_i : nullptr;
340 }
341
342 bool is_at(uint32_t depth) { return this->depth == depth; }
343
344 // tick returns whether the fragment is a valid array index whose value is
345 // zero. If valid but non-zero, it decrements it and returns false.
346 bool tick() {
347 if (this->array_index.status.is_ok()) {
348 if (this->array_index.value == 0) {
349 return true;
350 }
351 this->array_index.value--;
352 }
353 return false;
354 }
355
356 // next_fragment moves to the next fragment, returning whether it existed.
357 bool next_fragment() {
358 uint8_t* k = this->frag_k;
359 uint32_t d = this->depth;
360
361 this->reset(nullptr);
362
363 if (!k || (*k != '/')) {
364 return false;
365 }
366 k++;
367
368 bool all_digits = true;
369 uint8_t* i = k;
370 while ((*k != '\x00') && (*k != '/')) {
371 all_digits = all_digits && ('0' <= *k) && (*k <= '9');
372 k++;
373 }
374 this->frag_i = i;
375 this->frag_j = i;
376 this->frag_k = k;
377 this->depth = d + 1;
378 if (all_digits) {
379 // wuffs_base__parse_number_u64 rejects leading zeroes, e.g. "00", "07".
380 this->array_index =
381 wuffs_base__parse_number_u64(wuffs_base__make_slice_u8(i, k - i));
382 }
383 return true;
384 }
385
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100386 bool matched_all() { return this->frag_k == nullptr; }
387
388 bool matched_fragment() {
389 return this->frag_j && (this->frag_j == this->frag_k);
390 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100391
392 void incremental_match_slice(uint8_t* ptr, size_t len) {
393 if (!this->frag_j) {
394 return;
395 }
396 uint8_t* j = this->frag_j;
397 while (true) {
398 if (len == 0) {
399 this->frag_j = j;
400 return;
401 }
402
403 if (*j == '\x00') {
404 break;
405
406 } else if (*j == '~') {
407 j++;
408 if (*j == '0') {
409 if (*ptr != '~') {
410 break;
411 }
412 } else if (*j == '1') {
413 if (*ptr != '/') {
414 break;
415 }
416 } else {
417 break;
418 }
419
420 } else if (*j != *ptr) {
421 break;
422 }
423
424 j++;
425 ptr++;
426 len--;
427 }
428 this->frag_j = nullptr;
429 }
430
431 void incremental_match_code_point(uint32_t code_point) {
432 if (!this->frag_j) {
433 return;
434 }
435 uint8_t u[WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL];
436 size_t n = wuffs_base__utf_8__encode(
437 wuffs_base__make_slice_u8(&u[0],
438 WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL),
439 code_point);
440 if (n > 0) {
441 this->incremental_match_slice(&u[0], n);
442 }
443 }
444
445 // validate returns whether the (ptr, len) arguments form a valid JSON
446 // Pointer. In particular, it must be valid UTF-8, and either be empty or
447 // start with a '/'. Any '~' within must immediately be followed by either
448 // '0' or '1'.
449 static bool validate(char* query_c_string, size_t length) {
450 if (length <= 0) {
451 return true;
452 }
453 if (query_c_string[0] != '/') {
454 return false;
455 }
456 wuffs_base__slice_u8 s =
457 wuffs_base__make_slice_u8((uint8_t*)query_c_string, length);
458 bool previous_was_tilde = false;
459 while (s.len > 0) {
460 wuffs_base__utf_8__next__output o = wuffs_base__utf_8__next(s);
461 if (!o.is_valid()) {
462 return false;
463 }
464 if (previous_was_tilde && (o.code_point != '0') &&
465 (o.code_point != '1')) {
466 return false;
467 }
468 previous_was_tilde = o.code_point == '~';
469 s.ptr += o.byte_length;
470 s.len -= o.byte_length;
471 }
472 return !previous_was_tilde;
473 }
474} query;
475
476// ----
477
// flags holds the parsed command line options. parse_flags fills it in. All
// fields start out zeroed.
struct {
  // The arguments left over once flag parsing has stopped.
  int remaining_argc;
  char** remaining_argv;

  bool compact;               // -c, -compact
  bool fail_if_unsandboxed;   // -fail-if-unsandboxed
  size_t indent;              // -i=NUM, -indent=NUM
  uint32_t max_output_depth;  // -o=NUM, -max-output-depth=NUM
  char* query_c_string;       // -q=STR, -query=STR
  bool tabs;                  // -t, -tabs
} flags = {};
489
490const char* //
491parse_flags(int argc, char** argv) {
Nigel Tao6e7d1412020-03-06 09:21:35 +1100492 flags.indent = 4;
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100493 flags.max_output_depth = 0xFFFFFFFF;
Nigel Tao68920952020-03-03 11:25:18 +1100494
495 int c = (argc > 0) ? 1 : 0; // Skip argv[0], the program name.
496 for (; c < argc; c++) {
497 char* arg = argv[c];
498 if (*arg++ != '-') {
499 break;
500 }
501
502 // A double-dash "--foo" is equivalent to a single-dash "-foo". As special
503 // cases, a bare "-" is not a flag (some programs may interpret it as
504 // stdin) and a bare "--" means to stop parsing flags.
505 if (*arg == '\x00') {
506 break;
507 } else if (*arg == '-') {
508 arg++;
509 if (*arg == '\x00') {
510 c++;
511 break;
512 }
513 }
514
515 if (!strcmp(arg, "c") || !strcmp(arg, "compact")) {
516 flags.compact = true;
517 continue;
518 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100519 if (!strcmp(arg, "fail-if-unsandboxed")) {
520 flags.fail_if_unsandboxed = true;
521 continue;
522 }
Nigel Tao68920952020-03-03 11:25:18 +1100523 if (!strncmp(arg, "i=", 2) || !strncmp(arg, "indent=", 7)) {
524 while (*arg++ != '=') {
525 }
526 if (('0' <= arg[0]) && (arg[0] <= '8') && (arg[1] == '\x00')) {
527 flags.indent = arg[0] - '0';
Nigel Tao68920952020-03-03 11:25:18 +1100528 continue;
529 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100530 return usage;
531 }
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100532 if (!strcmp(arg, "o") || !strcmp(arg, "max-output-depth")) {
533 flags.max_output_depth = 1;
534 continue;
535 } else if (!strncmp(arg, "o=", 2) ||
536 !strncmp(arg, "max-output-depth=", 16)) {
537 while (*arg++ != '=') {
538 }
539 wuffs_base__result_u64 u = wuffs_base__parse_number_u64(
540 wuffs_base__make_slice_u8((uint8_t*)arg, strlen(arg)));
541 if (wuffs_base__status__is_ok(&u.status) && (u.value <= 0xFFFFFFFF)) {
542 flags.max_output_depth = (uint32_t)(u.value);
543 continue;
544 }
545 return usage;
546 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100547 if (!strncmp(arg, "q=", 2) || !strncmp(arg, "query=", 6)) {
548 while (*arg++ != '=') {
549 }
550 if (Query::validate(arg, strlen(arg))) {
551 flags.query_c_string = arg;
552 continue;
553 }
554 return usage;
Nigel Tao68920952020-03-03 11:25:18 +1100555 }
556 if (!strcmp(arg, "t") || !strcmp(arg, "tabs")) {
557 flags.tabs = true;
558 continue;
559 }
560
Nigel Tao0cd2f982020-03-03 23:03:02 +1100561 return usage;
Nigel Tao68920952020-03-03 11:25:18 +1100562 }
563
564 flags.remaining_argc = argc - c;
565 flags.remaining_argv = argv + c;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100566 return nullptr;
Nigel Tao68920952020-03-03 11:25:18 +1100567}
568
Nigel Tao2cf76db2020-02-27 22:42:01 +1100569const char* //
570initialize_globals(int argc, char** argv) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100571 dst = wuffs_base__make_io_buffer(
Nigel Taofdac24a2020-03-06 21:53:08 +1100572 wuffs_base__make_slice_u8(dst_array, DST_BUFFER_ARRAY_SIZE),
Nigel Tao2cf76db2020-02-27 22:42:01 +1100573 wuffs_base__empty_io_buffer_meta());
Nigel Tao1b073492020-02-16 22:11:36 +1100574
Nigel Tao2cf76db2020-02-27 22:42:01 +1100575 src = wuffs_base__make_io_buffer(
Nigel Taofdac24a2020-03-06 21:53:08 +1100576 wuffs_base__make_slice_u8(src_array, SRC_BUFFER_ARRAY_SIZE),
Nigel Tao2cf76db2020-02-27 22:42:01 +1100577 wuffs_base__empty_io_buffer_meta());
578
579 tok = wuffs_base__make_token_buffer(
Nigel Taofdac24a2020-03-06 21:53:08 +1100580 wuffs_base__make_slice_token(tok_array, TOKEN_BUFFER_ARRAY_SIZE),
Nigel Tao2cf76db2020-02-27 22:42:01 +1100581 wuffs_base__empty_token_buffer_meta());
582
583 curr_token_end_src_index = 0;
584
Nigel Tao2cf76db2020-02-27 22:42:01 +1100585 depth = 0;
586
587 ctx = context::none;
588
Nigel Tao68920952020-03-03 11:25:18 +1100589 TRY(parse_flags(argc, argv));
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100590 if (flags.fail_if_unsandboxed && !sandboxed) {
591 return "main: unsandboxed";
592 }
Nigel Tao01abc842020-03-06 21:42:33 +1100593 const int stdin_fd = 0;
594 if (flags.remaining_argc > ((input_file_descriptor != stdin_fd) ? 1 : 0)) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100595 return usage;
Nigel Tao107f0ef2020-03-01 21:35:02 +1100596 }
597
Nigel Tao0cd2f982020-03-03 23:03:02 +1100598 query.reset(flags.query_c_string);
599
600 // If the query is non-empty, suprress writing to stdout until we've
601 // completed the query.
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100602 suppress_write_dst = query.next_fragment() ? 1 : 0;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100603 wrote_to_dst = false;
604
Nigel Tao2cf76db2020-02-27 22:42:01 +1100605 return dec.initialize(sizeof__wuffs_json__decoder(), WUFFS_VERSION, 0)
606 .message();
607}
Nigel Tao1b073492020-02-16 22:11:36 +1100608
609// ----
610
// ignore_return_value does nothing with its argument. Passing a call's result
// to it lets callers discard that result without tripping -Wall -Werror.
static void  //
ignore_return_value(int ignored) {
  (void)ignored;
}
614
Nigel Tao2914bae2020-02-26 09:40:30 +1100615const char* //
616read_src() {
Nigel Taoa8406922020-02-19 12:22:00 +1100617 if (src.meta.closed) {
Nigel Tao9cc2c252020-02-23 17:05:49 +1100618 return "main: internal error: read requested on a closed source";
Nigel Taoa8406922020-02-19 12:22:00 +1100619 }
Nigel Tao1b073492020-02-16 22:11:36 +1100620 src.compact();
621 if (src.meta.wi >= src.data.len) {
622 return "main: src buffer is full";
623 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100624 while (true) {
Nigel Tao01abc842020-03-06 21:42:33 +1100625 ssize_t n = read(input_file_descriptor, src.data.ptr + src.meta.wi,
626 src.data.len - src.meta.wi);
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100627 if (n >= 0) {
628 src.meta.wi += n;
629 src.meta.closed = n == 0;
630 break;
631 } else if (errno != EINTR) {
632 return strerror(errno);
633 }
Nigel Tao1b073492020-02-16 22:11:36 +1100634 }
635 return nullptr;
636}
637
Nigel Tao2914bae2020-02-26 09:40:30 +1100638const char* //
639flush_dst() {
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100640 while (true) {
641 size_t n = dst.meta.wi - dst.meta.ri;
642 if (n == 0) {
643 break;
Nigel Tao1b073492020-02-16 22:11:36 +1100644 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100645 const int stdout_fd = 1;
646 ssize_t i = write(stdout_fd, dst.data.ptr + dst.meta.ri, n);
647 if (i >= 0) {
648 dst.meta.ri += i;
649 } else if (errno != EINTR) {
650 return strerror(errno);
651 }
Nigel Tao1b073492020-02-16 22:11:36 +1100652 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100653 dst.compact();
Nigel Tao1b073492020-02-16 22:11:36 +1100654 return nullptr;
655}
656
Nigel Tao2914bae2020-02-26 09:40:30 +1100657const char* //
658write_dst(const void* s, size_t n) {
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100659 if (suppress_write_dst > 0) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100660 return nullptr;
661 }
Nigel Tao1b073492020-02-16 22:11:36 +1100662 const uint8_t* p = static_cast<const uint8_t*>(s);
663 while (n > 0) {
664 size_t i = dst.writer_available();
665 if (i == 0) {
666 const char* z = flush_dst();
667 if (z) {
668 return z;
669 }
670 i = dst.writer_available();
671 if (i == 0) {
672 return "main: dst buffer is full";
673 }
674 }
675
676 if (i > n) {
677 i = n;
678 }
679 memcpy(dst.data.ptr + dst.meta.wi, p, i);
680 dst.meta.wi += i;
681 p += i;
682 n -= i;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100683 wrote_to_dst = true;
Nigel Tao1b073492020-02-16 22:11:36 +1100684 }
685 return nullptr;
686}
687
688// ----
689
// hex_digit returns the upper-case ASCII hexadecimal digit for the low 4 bits
// of nibble. The high 4 bits are ignored.
uint8_t  //
hex_digit(uint8_t nibble) {
  static const char digits[] = "0123456789ABCDEF";
  return static_cast<uint8_t>(digits[nibble & 0x0F]);
}
698
Nigel Tao2914bae2020-02-26 09:40:30 +1100699const char* //
Nigel Tao3b486982020-02-27 15:05:59 +1100700handle_unicode_code_point(uint32_t ucp) {
701 if (ucp < 0x0020) {
702 switch (ucp) {
703 case '\b':
704 return write_dst("\\b", 2);
705 case '\f':
706 return write_dst("\\f", 2);
707 case '\n':
708 return write_dst("\\n", 2);
709 case '\r':
710 return write_dst("\\r", 2);
711 case '\t':
712 return write_dst("\\t", 2);
713 default: {
714 // Other bytes less than 0x0020 are valid UTF-8 but not valid in a
715 // JSON string. They need to remain escaped.
716 uint8_t esc6[6];
717 esc6[0] = '\\';
718 esc6[1] = 'u';
719 esc6[2] = '0';
720 esc6[3] = '0';
721 esc6[4] = hex_digit(ucp >> 4);
722 esc6[5] = hex_digit(ucp >> 0);
723 return write_dst(&esc6[0], 6);
724 }
725 }
726
Nigel Taob9ad34f2020-03-03 12:44:01 +1100727 } else if (ucp == '\"') {
728 return write_dst("\\\"", 2);
729
730 } else if (ucp == '\\') {
731 return write_dst("\\\\", 2);
732
733 } else {
734 uint8_t u[WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL];
735 size_t n = wuffs_base__utf_8__encode(
736 wuffs_base__make_slice_u8(&u[0],
737 WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL),
738 ucp);
739 if (n > 0) {
740 return write_dst(&u[0], n);
Nigel Tao3b486982020-02-27 15:05:59 +1100741 }
Nigel Tao3b486982020-02-27 15:05:59 +1100742 }
743
Nigel Tao2cf76db2020-02-27 22:42:01 +1100744 return "main: internal error: unexpected Unicode code point";
Nigel Tao3b486982020-02-27 15:05:59 +1100745}
746
747const char* //
Nigel Tao2cf76db2020-02-27 22:42:01 +1100748handle_token(wuffs_base__token t) {
749 do {
750 uint64_t vbc = t.value_base_category();
751 uint64_t vbd = t.value_base_detail();
752 uint64_t len = t.length();
Nigel Tao1b073492020-02-16 22:11:36 +1100753
754 // Handle ']' or '}'.
Nigel Tao9f7a2502020-02-23 09:42:02 +1100755 if ((vbc == WUFFS_BASE__TOKEN__VBC__STRUCTURE) &&
Nigel Tao2cf76db2020-02-27 22:42:01 +1100756 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__POP)) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100757 if (query.is_at(depth)) {
758 return "main: no match for query";
759 }
Nigel Tao1b073492020-02-16 22:11:36 +1100760 if (depth <= 0) {
761 return "main: internal error: inconsistent depth";
762 }
763 depth--;
764
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100765 if (query.matched_all() && (depth >= flags.max_output_depth)) {
766 suppress_write_dst--;
767 // '…' is U+2026 HORIZONTAL ELLIPSIS, which is 3 UTF-8 bytes.
768 TRY(write_dst((vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_LIST)
769 ? "\"[…]\""
770 : "\"{…}\"",
771 7));
772 } else {
773 // Write preceding whitespace.
774 if ((ctx != context::in_list_after_bracket) &&
775 (ctx != context::in_dict_after_brace) && !flags.compact) {
776 TRY(write_dst("\n", 1));
777 for (uint32_t i = 0; i < depth; i++) {
778 TRY(write_dst(flags.tabs ? INDENT_TAB_STRING : INDENT_SPACES_STRING,
779 flags.tabs ? 1 : flags.indent));
780 }
Nigel Tao1b073492020-02-16 22:11:36 +1100781 }
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100782
783 TRY(write_dst(
784 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_LIST) ? "]" : "}",
785 1));
Nigel Tao1b073492020-02-16 22:11:36 +1100786 }
787
Nigel Tao9f7a2502020-02-23 09:42:02 +1100788 ctx = (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST)
789 ? context::in_list_after_value
790 : context::in_dict_after_key;
Nigel Tao1b073492020-02-16 22:11:36 +1100791 goto after_value;
792 }
793
Nigel Taod1c928a2020-02-28 12:43:53 +1100794 // Write preceding whitespace and punctuation, if it wasn't ']', '}' or a
795 // continuation of a multi-token chain.
Nigel Tao0cd2f982020-03-03 23:03:02 +1100796 if (!t.link_prev()) {
797 if (ctx == context::in_dict_after_key) {
798 TRY(write_dst(": ", flags.compact ? 1 : 2));
799 } else if (ctx != context::none) {
800 if ((ctx != context::in_list_after_bracket) &&
801 (ctx != context::in_dict_after_brace)) {
802 TRY(write_dst(",", 1));
Nigel Tao107f0ef2020-03-01 21:35:02 +1100803 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100804 if (!flags.compact) {
805 TRY(write_dst("\n", 1));
806 for (size_t i = 0; i < depth; i++) {
Nigel Tao6e7d1412020-03-06 09:21:35 +1100807 TRY(write_dst(flags.tabs ? INDENT_TAB_STRING : INDENT_SPACES_STRING,
808 flags.tabs ? 1 : flags.indent));
Nigel Tao0cd2f982020-03-03 23:03:02 +1100809 }
810 }
811 }
812
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100813 bool query_matched_fragment = false;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100814 if (query.is_at(depth)) {
815 switch (ctx) {
816 case context::in_list_after_bracket:
817 case context::in_list_after_value:
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100818 query_matched_fragment = query.tick();
Nigel Tao0cd2f982020-03-03 23:03:02 +1100819 break;
820 case context::in_dict_after_key:
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100821 query_matched_fragment = query.matched_fragment();
Nigel Tao0cd2f982020-03-03 23:03:02 +1100822 break;
823 }
824 }
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100825 if (!query_matched_fragment) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100826 // No-op.
827 } else if (!query.next_fragment()) {
828 // There is no next fragment. We have matched the complete query, and
829 // the upcoming JSON value is the result of that query.
830 //
831 // Un-suppress writing to stdout and reset the ctx and depth as if we
832 // were about to decode a top-level value. This makes any subsequent
833 // indentation be relative to this point, and we will return eod after
834 // the upcoming JSON value is complete.
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100835 if (suppress_write_dst != 1) {
836 return "main: internal error: inconsistent suppress_write_dst";
837 }
838 suppress_write_dst = 0;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100839 ctx = context::none;
840 depth = 0;
841 } else if ((vbc != WUFFS_BASE__TOKEN__VBC__STRUCTURE) ||
842 !(vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__PUSH)) {
843 // The query has moved on to the next fragment but the upcoming JSON
844 // value is not a container.
845 return "main: no match for query";
Nigel Tao1b073492020-02-16 22:11:36 +1100846 }
847 }
848
849 // Handle the token itself: either a container ('[' or '{') or a simple
Nigel Tao85fba7f2020-02-29 16:28:06 +1100850 // value: string (a chain of raw or escaped parts), literal or number.
Nigel Tao1b073492020-02-16 22:11:36 +1100851 switch (vbc) {
Nigel Tao85fba7f2020-02-29 16:28:06 +1100852 case WUFFS_BASE__TOKEN__VBC__STRUCTURE:
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100853 if (query.matched_all() && (depth >= flags.max_output_depth)) {
854 suppress_write_dst++;
855 } else {
856 TRY(write_dst(
857 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST) ? "[" : "{",
858 1));
859 }
Nigel Tao85fba7f2020-02-29 16:28:06 +1100860 depth++;
861 ctx = (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST)
862 ? context::in_list_after_bracket
863 : context::in_dict_after_brace;
864 return nullptr;
865
Nigel Tao2cf76db2020-02-27 22:42:01 +1100866 case WUFFS_BASE__TOKEN__VBC__STRING:
Nigel Taod1c928a2020-02-28 12:43:53 +1100867 if (!t.link_prev()) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100868 TRY(write_dst("\"", 1));
Nigel Tao0cd2f982020-03-03 23:03:02 +1100869 query.restart_fragment(in_dict_before_key() && query.is_at(depth));
Nigel Tao2cf76db2020-02-27 22:42:01 +1100870 }
Nigel Taocb37a562020-02-28 09:56:24 +1100871
872 if (vbd & WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_0_DST_1_SRC_DROP) {
873 // No-op.
874 } else if (vbd &
875 WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100876 uint8_t* ptr = src.data.ptr + curr_token_end_src_index - len;
877 TRY(write_dst(ptr, len));
878 query.incremental_match_slice(ptr, len);
Nigel Taocb37a562020-02-28 09:56:24 +1100879 } else {
880 return "main: internal error: unexpected string-token conversion";
881 }
882
Nigel Taod1c928a2020-02-28 12:43:53 +1100883 if (t.link_next()) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100884 return nullptr;
885 }
886 TRY(write_dst("\"", 1));
887 goto after_value;
888
889 case WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT:
Nigel Tao0cd2f982020-03-03 23:03:02 +1100890 if (!t.link_prev() || !t.link_next()) {
891 return "main: internal error: unexpected unlinked token";
892 }
893 TRY(handle_unicode_code_point(vbd));
894 query.incremental_match_code_point(vbd);
895 return nullptr;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100896
Nigel Tao85fba7f2020-02-29 16:28:06 +1100897 case WUFFS_BASE__TOKEN__VBC__LITERAL:
Nigel Tao2cf76db2020-02-27 22:42:01 +1100898 case WUFFS_BASE__TOKEN__VBC__NUMBER:
899 TRY(write_dst(src.data.ptr + curr_token_end_src_index - len, len));
900 goto after_value;
Nigel Tao1b073492020-02-16 22:11:36 +1100901 }
902
903 // Return an error if we didn't match the (vbc, vbd) pair.
Nigel Tao2cf76db2020-02-27 22:42:01 +1100904 return "main: internal error: unexpected token";
905 } while (0);
Nigel Tao1b073492020-02-16 22:11:36 +1100906
Nigel Tao2cf76db2020-02-27 22:42:01 +1100907 // Book-keeping after completing a value (whether a container value or a
908 // simple value). Empty parent containers are no longer empty. If the parent
909 // container is a "{...}" object, toggle between keys and values.
910after_value:
911 if (depth == 0) {
912 return eod;
913 }
914 switch (ctx) {
915 case context::in_list_after_bracket:
916 ctx = context::in_list_after_value;
917 break;
918 case context::in_dict_after_brace:
919 ctx = context::in_dict_after_key;
920 break;
921 case context::in_dict_after_key:
922 ctx = context::in_dict_after_value;
923 break;
924 case context::in_dict_after_value:
925 ctx = context::in_dict_after_key;
926 break;
927 }
928 return nullptr;
929}
930
931const char* //
932main1(int argc, char** argv) {
933 TRY(initialize_globals(argc, argv));
934
935 while (true) {
936 wuffs_base__status status = dec.decode_tokens(&tok, &src);
937
938 while (tok.meta.ri < tok.meta.wi) {
939 wuffs_base__token t = tok.data.ptr[tok.meta.ri++];
940 uint64_t n = t.length();
941 if ((src.meta.ri - curr_token_end_src_index) < n) {
942 return "main: internal error: inconsistent src indexes";
943 }
944 curr_token_end_src_index += n;
945
946 if (t.value() == 0) {
947 continue;
948 }
949
950 const char* z = handle_token(t);
951 if (z == nullptr) {
952 continue;
953 } else if (z == eod) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100954 goto end_of_data;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100955 }
956 return z;
Nigel Tao1b073492020-02-16 22:11:36 +1100957 }
Nigel Tao2cf76db2020-02-27 22:42:01 +1100958
959 if (status.repr == nullptr) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100960 return "main: internal error: unexpected end of token stream";
Nigel Tao2cf76db2020-02-27 22:42:01 +1100961 } else if (status.repr == wuffs_base__suspension__short_read) {
962 if (curr_token_end_src_index != src.meta.ri) {
963 return "main: internal error: inconsistent src indexes";
964 }
965 TRY(read_src());
966 curr_token_end_src_index = src.meta.ri;
967 } else if (status.repr == wuffs_base__suspension__short_write) {
968 tok.compact();
969 } else {
970 return status.message();
Nigel Tao1b073492020-02-16 22:11:36 +1100971 }
972 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100973end_of_data:
974
975 // With a non-empty query, don't try to consume trailing whitespace or
976 // confirm that we've processed all the tokens.
977 if (flags.query_c_string && *flags.query_c_string) {
978 return nullptr;
979 }
Nigel Tao6b161af2020-02-24 11:01:48 +1100980
Nigel Tao6b161af2020-02-24 11:01:48 +1100981 // Consume an optional whitespace trailer. This isn't part of the JSON spec,
982 // but it works better with line oriented Unix tools (such as "echo 123 |
983 // jsonptr" where it's "echo", not "echo -n") or hand-edited JSON files which
984 // can accidentally contain trailing whitespace.
985 //
986 // A whitespace trailer is zero or more ' ' and then zero or one '\n'.
987 while (true) {
988 if (src.meta.ri < src.meta.wi) {
989 uint8_t c = src.data.ptr[src.meta.ri];
990 if (c == ' ') {
991 src.meta.ri++;
992 continue;
993 } else if (c == '\n') {
994 src.meta.ri++;
995 break;
996 }
997 // The "exhausted the input" check below will fail.
998 break;
999 } else if (src.meta.closed) {
1000 break;
1001 }
1002 TRY(read_src());
1003 }
1004
1005 // Check that we've exhausted the input.
Nigel Taofe0cbbd2020-03-05 22:01:30 +11001006 if ((src.meta.ri == src.meta.wi) && !src.meta.closed) {
1007 TRY(read_src());
1008 }
Nigel Tao6b161af2020-02-24 11:01:48 +11001009 if ((src.meta.ri < src.meta.wi) || !src.meta.closed) {
1010 return "main: valid JSON followed by further (unexpected) data";
1011 }
1012
1013 // Check that we've used all of the decoded tokens, other than trailing
1014 // filler tokens. For example, a bare `"foo"` string is valid JSON, but even
1015 // without a trailing '\n', the Wuffs JSON parser emits a filler token for
1016 // the final '\"'.
1017 for (; tok.meta.ri < tok.meta.wi; tok.meta.ri++) {
1018 if (tok.data.ptr[tok.meta.ri].value_base_category() !=
1019 WUFFS_BASE__TOKEN__VBC__FILLER) {
1020 return "main: internal error: decoded OK but unprocessed tokens remain";
1021 }
1022 }
1023
1024 return nullptr;
Nigel Tao1b073492020-02-16 22:11:36 +11001025}
1026
Nigel Tao2914bae2020-02-26 09:40:30 +11001027int //
1028compute_exit_code(const char* status_msg) {
Nigel Tao9cc2c252020-02-23 17:05:49 +11001029 if (!status_msg) {
1030 return 0;
1031 }
Nigel Tao01abc842020-03-06 21:42:33 +11001032 size_t n;
1033 if (status_msg == usage) {
1034 n = strlen(status_msg);
1035 } else {
Nigel Tao9cc2c252020-02-23 17:05:49 +11001036 n = strnlen(status_msg, 2047);
Nigel Tao01abc842020-03-06 21:42:33 +11001037 if (n >= 2047) {
1038 status_msg = "main: internal error: error message is too long";
1039 n = strnlen(status_msg, 2047);
1040 }
Nigel Tao9cc2c252020-02-23 17:05:49 +11001041 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +11001042 const int stderr_fd = 2;
1043 ignore_return_value(write(stderr_fd, status_msg, n));
1044 ignore_return_value(write(stderr_fd, "\n", 1));
Nigel Tao9cc2c252020-02-23 17:05:49 +11001045 // Return an exit code of 1 for regular (forseen) errors, e.g. badly
1046 // formatted or unsupported input.
1047 //
1048 // Return an exit code of 2 for internal (exceptional) errors, e.g. defensive
1049 // run-time checks found that an internal invariant did not hold.
1050 //
1051 // Automated testing, including badly formatted inputs, can therefore
1052 // discriminate between expected failure (exit code 1) and unexpected failure
1053 // (other non-zero exit codes). Specifically, exit code 2 for internal
1054 // invariant violation, exit code 139 (which is 128 + SIGSEGV on x86_64
1055 // linux) for a segmentation fault (e.g. null pointer dereference).
1056 return strstr(status_msg, "internal error:") ? 2 : 1;
1057}
1058
Nigel Tao2914bae2020-02-26 09:40:30 +11001059int //
1060main(int argc, char** argv) {
Nigel Tao01abc842020-03-06 21:42:33 +11001061 // Look for an input filename (the first non-flag argument) in argv. If there
1062 // is one, open it (but do not read from it) before we self-impose a sandbox.
1063 //
1064 // Flags start with "-", unless it comes after a bare "--" arg.
1065 {
1066 bool dash_dash = false;
1067 int a;
1068 for (a = 1; a < argc; a++) {
1069 char* arg = argv[a];
1070 if ((arg[0] == '-') && !dash_dash) {
1071 dash_dash = (arg[1] == '-') && (arg[2] == '\x00');
1072 continue;
1073 }
1074 input_file_descriptor = open(arg, O_RDONLY);
1075 if (input_file_descriptor < 0) {
1076 fprintf(stderr, "%s: %s\n", arg, strerror(errno));
1077 return 1;
1078 }
1079 break;
1080 }
1081 }
1082
Nigel Taofe0cbbd2020-03-05 22:01:30 +11001083#if defined(WUFFS_EXAMPLE_USE_SECCOMP)
1084 prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
1085 sandboxed = true;
1086#endif
1087
Nigel Tao0cd2f982020-03-03 23:03:02 +11001088 const char* z = main1(argc, argv);
1089 if (wrote_to_dst) {
1090 const char* z1 = write_dst("\n", 1);
1091 const char* z2 = flush_dst();
1092 z = z ? z : (z1 ? z1 : z2);
1093 }
1094 int exit_code = compute_exit_code(z);
Nigel Taofe0cbbd2020-03-05 22:01:30 +11001095
1096#if defined(WUFFS_EXAMPLE_USE_SECCOMP)
1097 // Call SYS_exit explicitly, instead of calling SYS_exit_group implicitly by
1098 // either calling _exit or returning from main. SECCOMP_MODE_STRICT allows
1099 // only SYS_exit.
1100 syscall(SYS_exit, exit_code);
1101#endif
Nigel Tao9cc2c252020-02-23 17:05:49 +11001102 return exit_code;
Nigel Tao1b073492020-02-16 22:11:36 +11001103}