blob: a4148a3b0d325303995ae3e755522271d6ec827b [file] [log] [blame]
Nigel Tao1b073492020-02-16 22:11:36 +11001// Copyright 2020 The Wuffs Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// ----------------
16
17/*
Nigel Tao0cd2f982020-03-03 23:03:02 +110018jsonptr is a JSON formatter (pretty-printer) that supports the JSON Pointer
19(RFC 6901) query syntax. It reads UTF-8 JSON from stdin and writes
20canonicalized, formatted UTF-8 JSON to stdout.
21
22See the "const char* usage" string below for details.
23
24----
25
26JSON Pointer (and this program's implementation) is one of many JSON query
27languages and JSON tools, such as jq, jql and JMESPath. This one is relatively
28simple and fewer-featured compared to those others.
29
30One benefit of simplicity is that this program's JSON and JSON Pointer
31implementations do not dynamically allocate or free memory (yet it does not
32require that the entire input fits in memory at once). They are therefore
33trivially protected against certain bug classes: memory leaks, double-frees and
34use-after-frees.
35
36The core JSON implementation is also written in the Wuffs programming language
37(and then transpiled to C/C++), which is memory-safe but also guards against
38integer arithmetic overflows.
39
Nigel Taofe0cbbd2020-03-05 22:01:30 +110040For defense in depth, on Linux, this program also self-imposes a
41SECCOMP_MODE_STRICT sandbox before reading (or otherwise processing) its input
42or writing its output. Under this sandbox, the only permitted system calls are
43read, write, exit and sigreturn.
44
Nigel Tao0cd2f982020-03-03 23:03:02 +110045All together, this program aims to safely handle untrusted JSON files without
46fear of security bugs such as remote code execution.
47
48----
Nigel Tao1b073492020-02-16 22:11:36 +110049
Nigel Taoc5b3a9e2020-02-24 11:54:35 +110050As of 2020-02-24, this program passes all 318 "test_parsing" cases from the
51JSON test suite (https://github.com/nst/JSONTestSuite), an appendix to the
52"Parsing JSON is a Minefield" article (http://seriot.ch/parsing_json.php) that
53was first published on 2016-10-26 and updated on 2018-03-30.
54
Nigel Tao0cd2f982020-03-03 23:03:02 +110055After modifying this program, run "build-example.sh example/jsonptr/" and then
56"script/run-json-test-suite.sh" to catch correctness regressions.
57
58----
59
Nigel Tao1b073492020-02-16 22:11:36 +110060This example program differs from most other example Wuffs programs in that it
61is written in C++, not C.
62
63$CXX jsonptr.cc && ./a.out < ../../test/data/github-tags.json; rm -f a.out
64
65for a C++ compiler $CXX, such as clang++ or g++.
66*/
67
Nigel Taofe0cbbd2020-03-05 22:01:30 +110068#include <errno.h>
Nigel Tao01abc842020-03-06 21:42:33 +110069#include <fcntl.h>
70#include <stdio.h>
Nigel Tao9cc2c252020-02-23 17:05:49 +110071#include <string.h>
Nigel Taofe0cbbd2020-03-05 22:01:30 +110072#include <unistd.h>
Nigel Tao1b073492020-02-16 22:11:36 +110073
74// Wuffs ships as a "single file C library" or "header file library" as per
75// https://github.com/nothings/stb/blob/master/docs/stb_howto.txt
76//
77// To use that single file as a "foo.c"-like implementation, instead of a
78// "foo.h"-like header, #define WUFFS_IMPLEMENTATION before #include'ing or
79// compiling it.
80#define WUFFS_IMPLEMENTATION
81
// Defining the WUFFS_CONFIG__MODULE* macros is optional, but it lets users of
83// release/c/etc.c whitelist which parts of Wuffs to build. That file contains
84// the entire Wuffs standard library, implementing a variety of codecs and file
85// formats. Without this macro definition, an optimizing compiler or linker may
86// very well discard Wuffs code for unused codecs, but listing the Wuffs
87// modules we use makes that process explicit. Preprocessing means that such
88// code simply isn't compiled.
89#define WUFFS_CONFIG__MODULES
90#define WUFFS_CONFIG__MODULE__BASE
91#define WUFFS_CONFIG__MODULE__JSON
92
93// If building this program in an environment that doesn't easily accommodate
94// relative includes, you can use the script/inline-c-relative-includes.go
95// program to generate a stand-alone C++ file.
96#include "../../release/c/wuffs-unsupported-snapshot.c"
97
Nigel Taofe0cbbd2020-03-05 22:01:30 +110098#if defined(__linux__)
99#include <linux/prctl.h>
100#include <linux/seccomp.h>
101#include <sys/prctl.h>
102#include <sys/syscall.h>
103#define WUFFS_EXAMPLE_USE_SECCOMP
104#endif
105
// TRY evaluates error_msg (an expression yielding a const char*) exactly once.
// If the result is non-null, TRY returns it from the enclosing function;
// otherwise execution continues with the next statement. It is only usable
// inside functions that can return a const char*.
#define TRY(error_msg)                    \
  do {                                    \
    const char* try_message = error_msg;  \
    if (try_message != nullptr) {         \
      return try_message;                 \
    }                                     \
  } while (false)
113
// eod is an error message string; this section only defines it.
static const char* eod = "main: end of data";

// usage is the help text. parse_flags and initialize_globals return it as
// their error message when the command line is malformed.
static const char* usage =
    // Synopsis and flag list.
    "Usage: jsonptr -flags input.json\n"
    "\n"
    "Flags:\n"
    " -c -compact-output\n"
    " -i=NUM -indent=NUM\n"
    " -o=NUM -max-output-depth=NUM\n"
    " -q=STR -query=STR\n"
    " -s -strict-json-pointer-syntax\n"
    " -t -tabs\n"
    " -fail-if-unsandboxed\n"
    "\n"
    "The input.json filename is optional. If absent, it reads from stdin.\n"
    "\n"
    "----\n"
    "\n"
    // Overview: canonicalization and formatting.
    "jsonptr is a JSON formatter (pretty-printer) that supports the JSON\n"
    "Pointer (RFC 6901) query syntax. It reads UTF-8 JSON from stdin and\n"
    "writes canonicalized, formatted UTF-8 JSON to stdout.\n"
    "\n"
    "Canonicalized means that e.g. \"abc\\u000A\\tx\\u0177z\" is re-written\n"
    "as \"abc\\n\\txŷz\". It does not sort object keys, nor does it reject\n"
    "duplicate keys. Canonicalization does not imply Unicode normalization.\n"
    "\n"
    "Formatted means that arrays' and objects' elements are indented, each\n"
    "on its own line. Configure this with the -c / -compact-output, -i=NUM /\n"
    "-indent=NUM (for NUM ranging from 0 to 8) and -t / -tabs flags.\n"
    "\n"
    "----\n"
    "\n"
    // The -query and -strict-json-pointer-syntax flags.
    "The -q=STR or -query=STR flag gives an optional JSON Pointer query, to\n"
    "print a subset of the input. For example, given RFC 6901 section 5's\n"
    "sample input (https://tools.ietf.org/rfc/rfc6901.txt), this command:\n"
    " jsonptr -query=/foo/1 rfc-6901-json-pointer.json\n"
    "will print:\n"
    " \"baz\"\n"
    "\n"
    "An absent query is equivalent to the empty query, which identifies the\n"
    "entire input (the root value). Unlike a file system, the \"/\" query\n"
    "does not identify the root. Instead, it identifies the child (the value\n"
    "in a key-value pair) of the root whose key is the empty string.\n"
    "Similarly, \"/foo\" and \"/foo/\" identify two different nodes.\n"
    "\n"
    "If the query found a valid JSON value, this program will return a zero\n"
    "exit code even if the rest of the input isn't valid JSON. If the query\n"
    "did not find a value, or found an invalid one, this program returns a\n"
    "non-zero exit code, but may still print partial output to stdout.\n"
    "\n"
    "The JSON specification (https://json.org/) permits implementations that\n"
    "allow duplicate keys, as this one does. This JSON Pointer implementation\n"
    "is also greedy, following the first match for each fragment without\n"
    "back-tracking. For example, the \"/foo/bar\" query will fail if the root\n"
    "object has multiple \"foo\" children but the first one doesn't have a\n"
    "\"bar\" child, even if later ones do.\n"
    "\n"
    "The -s or -strict-json-pointer-syntax flag restricts the -query=STR\n"
    "string to exactly RFC 6901, with only two escape sequences: \"~0\" and\n"
    "\"~1\" for \"~\" and \"/\". Without this flag, this program also lets\n"
    "\"~n\" and \"~r\" escape the New Line and Carriage Return ASCII control\n"
    "characters, which can work better with line oriented Unix tools that\n"
    "assume exactly one value (i.e. one JSON Pointer string) per line.\n"
    "\n"
    "----\n"
    "\n"
    // The -max-output-depth flag.
    "The -o=NUM or -max-output-depth=NUM flag gives the maximum (inclusive)\n"
    "output depth. JSON containers ([] arrays and {} objects) can hold other\n"
    "containers. When this flag is set, containers at depth NUM are replaced\n"
    "with \"[…]\" or \"{…}\". A bare -o or -max-output-depth is equivalent to\n"
    "-o=1. The flag's absence is equivalent to an unlimited output depth.\n"
    "\n"
    "The -max-output-depth flag only affects the program's output. It doesn't\n"
    "affect whether or not the input is considered valid JSON. The JSON\n"
    "specification permits implementations to set their own maximum input\n"
    "depth. This JSON implementation sets it to 1024.\n"
    "\n"
    "Depth is measured in terms of nested containers. It is unaffected by the\n"
    "number of spaces or tabs used to indent.\n"
    "\n"
    "When both -max-output-depth and -query are set, the output depth is\n"
    "measured from when the query resolves, not from the input root. The\n"
    "input depth (measured from the root) is still limited to 1024.\n"
    "\n"
    "----\n"
    "\n"
    // The -fail-if-unsandboxed flag.
    "The -fail-if-unsandboxed flag causes the program to exit if it does not\n"
    "self-impose a sandbox. On Linux, it self-imposes a SECCOMP_MODE_STRICT\n"
    "sandbox, regardless of whether this flag was set.";
Nigel Tao0cd2f982020-03-03 23:03:02 +1100203
Nigel Tao2cf76db2020-02-27 22:42:01 +1100204// ----
205
// sandboxed is checked by initialize_globals when the -fail-if-unsandboxed
// flag is given. It starts out false; whatever sets it to true is outside
// this section of the file.
bool sandboxed = false;

// input_file_descriptor is the file descriptor that read_src reads from. The
// default of 0 means stdin.
int input_file_descriptor = 0;
209
// MAX_INDENT is the largest number of spaces per indentation level. It
// matches the 0 to 8 range that the -indent=NUM flag accepts.
#define MAX_INDENT 8

// INDENT_SPACES_STRING supplies the bytes for one level of space indentation.
// The writer copies the first flags.indent bytes of this literal, so it must
// contain at least MAX_INDENT spaces. A shorter literal would be read past its
// end for any indent greater than its length.
#define INDENT_SPACES_STRING "        "

// INDENT_TAB_STRING supplies the single byte written per level when the -tabs
// flag is set.
#define INDENT_TAB_STRING "\t"

static_assert(sizeof(INDENT_SPACES_STRING) - 1 >= MAX_INDENT,
              "INDENT_SPACES_STRING must hold at least MAX_INDENT spaces");
Nigel Tao107f0ef2020-03-01 21:35:02 +1100213
Nigel Taofdac24a2020-03-06 21:53:08 +1100214#ifndef DST_BUFFER_ARRAY_SIZE
215#define DST_BUFFER_ARRAY_SIZE (32 * 1024)
Nigel Tao1b073492020-02-16 22:11:36 +1100216#endif
Nigel Taofdac24a2020-03-06 21:53:08 +1100217#ifndef SRC_BUFFER_ARRAY_SIZE
218#define SRC_BUFFER_ARRAY_SIZE (32 * 1024)
Nigel Tao1b073492020-02-16 22:11:36 +1100219#endif
Nigel Taofdac24a2020-03-06 21:53:08 +1100220#ifndef TOKEN_BUFFER_ARRAY_SIZE
221#define TOKEN_BUFFER_ARRAY_SIZE (4 * 1024)
Nigel Tao1b073492020-02-16 22:11:36 +1100222#endif
223
Nigel Taofdac24a2020-03-06 21:53:08 +1100224uint8_t dst_array[DST_BUFFER_ARRAY_SIZE];
225uint8_t src_array[SRC_BUFFER_ARRAY_SIZE];
226wuffs_base__token tok_array[TOKEN_BUFFER_ARRAY_SIZE];
Nigel Tao1b073492020-02-16 22:11:36 +1100227
228wuffs_base__io_buffer dst;
229wuffs_base__io_buffer src;
230wuffs_base__token_buffer tok;
231
Nigel Tao2cf76db2020-02-27 22:42:01 +1100232// curr_token_end_src_index is the src.data.ptr index of the end of the current
233// token. An invariant is that (curr_token_end_src_index <= src.meta.ri).
234size_t curr_token_end_src_index;
235
// depth is the container nesting level of the current position in the input.
// initialize_globals sets it to zero and the token handler decrements it when
// it sees a closing ']' or '}'.
uint32_t depth;

// context records what kind of token was most recently handled, relative to
// the innermost enclosing list or dict (or none, at the top level).
enum class context {
  none,
  in_list_after_bracket,
  in_list_after_value,
  in_dict_after_brace,
  in_dict_after_key,
  in_dict_after_value,
} ctx;

// in_dict_before_key returns whether the next value inside the current dict
// would be a key: that is, whether ctx is just after the opening brace or
// just after a completed value.
bool  //
in_dict_before_key() {
  switch (ctx) {
    case context::in_dict_after_brace:
    case context::in_dict_after_value:
      return true;
    default:
      return false;
  }
}
252
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100253uint32_t suppress_write_dst;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100254bool wrote_to_dst;
255
Nigel Tao1b073492020-02-16 22:11:36 +1100256wuffs_json__decoder dec;
Nigel Tao1b073492020-02-16 22:11:36 +1100257
Nigel Tao0cd2f982020-03-03 23:03:02 +1100258// ----
259
260// Query is a JSON Pointer query. After initializing with a NUL-terminated C
261// string, its multiple fragments are consumed as the program walks the JSON
262// data from stdin. For example, letting "$" denote a NUL, suppose that we
263// started with a query string of "/apple/banana/12/durian" and are currently
264// trying to match the second fragment, "banana", so that Query::depth is 2:
265//
266// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
267// / a p p l e / b a n a n a / 1 2 / d u r i a n $
268// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
269// ^ ^
270// frag_i frag_k
271//
272// The two pointers frag_i and frag_k are the start (inclusive) and end
273// (exclusive) of the fragment. They satisfy (frag_i <= frag_k) and may be
// equal if the fragment is empty (note that "" is a valid JSON object key).
275//
276// The frag_j pointer moves between these two, or is nullptr. An invariant is
277// that (((frag_i <= frag_j) && (frag_j <= frag_k)) || (frag_j == nullptr)).
278//
279// Wuffs' JSON tokenizer can portray a single JSON string as multiple Wuffs
280// tokens, as backslash-escaped values within that JSON string may each get
281// their own token.
282//
283// At the start of each object key (a JSON string), frag_j is set to frag_i.
284//
285// While frag_j remains non-nullptr, each token's unescaped contents are then
286// compared to that part of the fragment from frag_j to frag_k. If it is a
287// prefix (including the case of an exact match), then frag_j is advanced by
288// the unescaped length. Otherwise, frag_j is set to nullptr.
289//
290// Comparison accounts for JSON Pointer's escaping notation: "~0" and "~1" in
291// the query (not the JSON value) are unescaped to "~" and "/" respectively.
292//
293// The frag_j pointer therefore advances from frag_i to frag_k, or drops out,
294// as we incrementally match the object key with the query fragment. For
295// example, if we have already matched the "ban" of "banana", then we would
296// accept any of an "ana" token, an "a" token or a "\u0061" token, amongst
297// others. They would advance frag_j by 3, 1 or 1 bytes respectively.
298//
299// frag_j
300// v
301// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
302// / a p p l e / b a n a n a / 1 2 / d u r i a n $
303// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
304// ^ ^
305// frag_i frag_k
306//
307// At the end of each object key (or equivalently, at the start of each object
308// value), if frag_j is non-nullptr and equal to (but not less than) frag_k
309// then we have a fragment match: the query fragment equals the object key. If
310// there is a next fragment (in this example, "12") we move the frag_etc
311// pointers to its start and end and increment Query::depth. Otherwise, we have
312// matched the complete query, and the upcoming JSON value is the result of
313// that query.
314//
315// The discussion above centers on object keys. If the query fragment is
316// numeric then it can also match as an array index: the string fragment "12"
317// will match an array's 13th element (starting counting from zero). See RFC
318// 6901 for its precise definition of an "array index" number.
319//
320// Array index fragment match is represented by the Query::array_index field,
321// whose type (wuffs_base__result_u64) is a result type. An error result means
322// that the fragment is not an array index. A value result holds the number of
323// list elements remaining. When matching a query fragment in an array (instead
324// of in an object), each element ticks this number down towards zero. At zero,
325// the upcoming JSON value is the one that matches the query fragment.
326class Query {
327 private:
328 uint8_t* frag_i;
329 uint8_t* frag_j;
330 uint8_t* frag_k;
331
332 uint32_t depth;
333
334 wuffs_base__result_u64 array_index;
335
336 public:
337 void reset(char* query_c_string) {
338 this->frag_i = (uint8_t*)query_c_string;
339 this->frag_j = (uint8_t*)query_c_string;
340 this->frag_k = (uint8_t*)query_c_string;
341 this->depth = 0;
342 this->array_index.status.repr = "#main: not an array index query fragment";
343 this->array_index.value = 0;
344 }
345
346 void restart_fragment(bool enable) {
347 this->frag_j = enable ? this->frag_i : nullptr;
348 }
349
350 bool is_at(uint32_t depth) { return this->depth == depth; }
351
352 // tick returns whether the fragment is a valid array index whose value is
353 // zero. If valid but non-zero, it decrements it and returns false.
354 bool tick() {
355 if (this->array_index.status.is_ok()) {
356 if (this->array_index.value == 0) {
357 return true;
358 }
359 this->array_index.value--;
360 }
361 return false;
362 }
363
364 // next_fragment moves to the next fragment, returning whether it existed.
365 bool next_fragment() {
366 uint8_t* k = this->frag_k;
367 uint32_t d = this->depth;
368
369 this->reset(nullptr);
370
371 if (!k || (*k != '/')) {
372 return false;
373 }
374 k++;
375
376 bool all_digits = true;
377 uint8_t* i = k;
378 while ((*k != '\x00') && (*k != '/')) {
379 all_digits = all_digits && ('0' <= *k) && (*k <= '9');
380 k++;
381 }
382 this->frag_i = i;
383 this->frag_j = i;
384 this->frag_k = k;
385 this->depth = d + 1;
386 if (all_digits) {
387 // wuffs_base__parse_number_u64 rejects leading zeroes, e.g. "00", "07".
388 this->array_index =
389 wuffs_base__parse_number_u64(wuffs_base__make_slice_u8(i, k - i));
390 }
391 return true;
392 }
393
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100394 bool matched_all() { return this->frag_k == nullptr; }
395
396 bool matched_fragment() {
397 return this->frag_j && (this->frag_j == this->frag_k);
398 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100399
400 void incremental_match_slice(uint8_t* ptr, size_t len) {
401 if (!this->frag_j) {
402 return;
403 }
404 uint8_t* j = this->frag_j;
405 while (true) {
406 if (len == 0) {
407 this->frag_j = j;
408 return;
409 }
410
411 if (*j == '\x00') {
412 break;
413
414 } else if (*j == '~') {
415 j++;
416 if (*j == '0') {
417 if (*ptr != '~') {
418 break;
419 }
420 } else if (*j == '1') {
421 if (*ptr != '/') {
422 break;
423 }
Nigel Taod6fdfb12020-03-11 12:24:14 +1100424 } else if (*j == 'n') {
425 if (*ptr != '\n') {
426 break;
427 }
428 } else if (*j == 'r') {
429 if (*ptr != '\r') {
430 break;
431 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100432 } else {
433 break;
434 }
435
436 } else if (*j != *ptr) {
437 break;
438 }
439
440 j++;
441 ptr++;
442 len--;
443 }
444 this->frag_j = nullptr;
445 }
446
447 void incremental_match_code_point(uint32_t code_point) {
448 if (!this->frag_j) {
449 return;
450 }
451 uint8_t u[WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL];
452 size_t n = wuffs_base__utf_8__encode(
453 wuffs_base__make_slice_u8(&u[0],
454 WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL),
455 code_point);
456 if (n > 0) {
457 this->incremental_match_slice(&u[0], n);
458 }
459 }
460
461 // validate returns whether the (ptr, len) arguments form a valid JSON
462 // Pointer. In particular, it must be valid UTF-8, and either be empty or
463 // start with a '/'. Any '~' within must immediately be followed by either
Nigel Taod6fdfb12020-03-11 12:24:14 +1100464 // '0' or '1'. If strict_json_pointer_syntax is false, a '~' may also be
465 // followed by either 'n' or 'r'.
466 static bool validate(char* query_c_string,
467 size_t length,
468 bool strict_json_pointer_syntax) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100469 if (length <= 0) {
470 return true;
471 }
472 if (query_c_string[0] != '/') {
473 return false;
474 }
475 wuffs_base__slice_u8 s =
476 wuffs_base__make_slice_u8((uint8_t*)query_c_string, length);
477 bool previous_was_tilde = false;
478 while (s.len > 0) {
479 wuffs_base__utf_8__next__output o = wuffs_base__utf_8__next(s);
480 if (!o.is_valid()) {
481 return false;
482 }
Nigel Taod6fdfb12020-03-11 12:24:14 +1100483
484 if (previous_was_tilde) {
485 switch (o.code_point) {
486 case '0':
487 case '1':
488 break;
489 case 'n':
490 case 'r':
491 if (strict_json_pointer_syntax) {
492 return false;
493 }
494 break;
495 default:
496 return false;
497 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100498 }
499 previous_was_tilde = o.code_point == '~';
Nigel Taod6fdfb12020-03-11 12:24:14 +1100500
Nigel Tao0cd2f982020-03-03 23:03:02 +1100501 s.ptr += o.byte_length;
502 s.len -= o.byte_length;
503 }
504 return !previous_was_tilde;
505 }
506} query;
507
508// ----
509
// flags holds the parsed command line. parse_flags fills it in.
struct {
  // The arguments left over after the flags: a count and a pointer into argv.
  int remaining_argc;
  char** remaining_argv;

  bool compact_output;               // -c, -compact-output
  bool fail_if_unsandboxed;          // -fail-if-unsandboxed
  size_t indent;                     // -i=NUM, -indent=NUM
  uint32_t max_output_depth;         // -o=NUM, -max-output-depth=NUM
  char* query_c_string;              // -q=STR, -query=STR
  bool strict_json_pointer_syntax;   // -s, -strict-json-pointer-syntax
  bool tabs;                         // -t, -tabs
} flags = {0};
522
523const char* //
524parse_flags(int argc, char** argv) {
Nigel Tao6e7d1412020-03-06 09:21:35 +1100525 flags.indent = 4;
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100526 flags.max_output_depth = 0xFFFFFFFF;
Nigel Tao68920952020-03-03 11:25:18 +1100527
528 int c = (argc > 0) ? 1 : 0; // Skip argv[0], the program name.
529 for (; c < argc; c++) {
530 char* arg = argv[c];
531 if (*arg++ != '-') {
532 break;
533 }
534
535 // A double-dash "--foo" is equivalent to a single-dash "-foo". As special
536 // cases, a bare "-" is not a flag (some programs may interpret it as
537 // stdin) and a bare "--" means to stop parsing flags.
538 if (*arg == '\x00') {
539 break;
540 } else if (*arg == '-') {
541 arg++;
542 if (*arg == '\x00') {
543 c++;
544 break;
545 }
546 }
547
Nigel Tao3690e832020-03-12 16:52:26 +1100548 if (!strcmp(arg, "c") || !strcmp(arg, "compact-output")) {
549 flags.compact_output = true;
Nigel Tao68920952020-03-03 11:25:18 +1100550 continue;
551 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100552 if (!strcmp(arg, "fail-if-unsandboxed")) {
553 flags.fail_if_unsandboxed = true;
554 continue;
555 }
Nigel Tao68920952020-03-03 11:25:18 +1100556 if (!strncmp(arg, "i=", 2) || !strncmp(arg, "indent=", 7)) {
557 while (*arg++ != '=') {
558 }
559 if (('0' <= arg[0]) && (arg[0] <= '8') && (arg[1] == '\x00')) {
560 flags.indent = arg[0] - '0';
Nigel Tao68920952020-03-03 11:25:18 +1100561 continue;
562 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100563 return usage;
564 }
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100565 if (!strcmp(arg, "o") || !strcmp(arg, "max-output-depth")) {
566 flags.max_output_depth = 1;
567 continue;
568 } else if (!strncmp(arg, "o=", 2) ||
569 !strncmp(arg, "max-output-depth=", 16)) {
570 while (*arg++ != '=') {
571 }
572 wuffs_base__result_u64 u = wuffs_base__parse_number_u64(
573 wuffs_base__make_slice_u8((uint8_t*)arg, strlen(arg)));
574 if (wuffs_base__status__is_ok(&u.status) && (u.value <= 0xFFFFFFFF)) {
575 flags.max_output_depth = (uint32_t)(u.value);
576 continue;
577 }
578 return usage;
579 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100580 if (!strncmp(arg, "q=", 2) || !strncmp(arg, "query=", 6)) {
581 while (*arg++ != '=') {
582 }
Nigel Taod6fdfb12020-03-11 12:24:14 +1100583 flags.query_c_string = arg;
584 continue;
585 }
586 if (!strcmp(arg, "s") || !strcmp(arg, "strict-json-pointer-syntax")) {
587 flags.strict_json_pointer_syntax = true;
588 continue;
Nigel Tao68920952020-03-03 11:25:18 +1100589 }
590 if (!strcmp(arg, "t") || !strcmp(arg, "tabs")) {
591 flags.tabs = true;
592 continue;
593 }
594
Nigel Tao0cd2f982020-03-03 23:03:02 +1100595 return usage;
Nigel Tao68920952020-03-03 11:25:18 +1100596 }
597
Nigel Taod6fdfb12020-03-11 12:24:14 +1100598 if (flags.query_c_string &&
599 !Query::validate(flags.query_c_string, strlen(flags.query_c_string),
600 flags.strict_json_pointer_syntax)) {
601 return "main: bad JSON Pointer (RFC 6901) syntax for the -query=STR flag";
602 }
603
Nigel Tao68920952020-03-03 11:25:18 +1100604 flags.remaining_argc = argc - c;
605 flags.remaining_argv = argv + c;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100606 return nullptr;
Nigel Tao68920952020-03-03 11:25:18 +1100607}
608
Nigel Tao2cf76db2020-02-27 22:42:01 +1100609const char* //
610initialize_globals(int argc, char** argv) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100611 dst = wuffs_base__make_io_buffer(
Nigel Taofdac24a2020-03-06 21:53:08 +1100612 wuffs_base__make_slice_u8(dst_array, DST_BUFFER_ARRAY_SIZE),
Nigel Tao2cf76db2020-02-27 22:42:01 +1100613 wuffs_base__empty_io_buffer_meta());
Nigel Tao1b073492020-02-16 22:11:36 +1100614
Nigel Tao2cf76db2020-02-27 22:42:01 +1100615 src = wuffs_base__make_io_buffer(
Nigel Taofdac24a2020-03-06 21:53:08 +1100616 wuffs_base__make_slice_u8(src_array, SRC_BUFFER_ARRAY_SIZE),
Nigel Tao2cf76db2020-02-27 22:42:01 +1100617 wuffs_base__empty_io_buffer_meta());
618
619 tok = wuffs_base__make_token_buffer(
Nigel Taofdac24a2020-03-06 21:53:08 +1100620 wuffs_base__make_slice_token(tok_array, TOKEN_BUFFER_ARRAY_SIZE),
Nigel Tao2cf76db2020-02-27 22:42:01 +1100621 wuffs_base__empty_token_buffer_meta());
622
623 curr_token_end_src_index = 0;
624
Nigel Tao2cf76db2020-02-27 22:42:01 +1100625 depth = 0;
626
627 ctx = context::none;
628
Nigel Tao68920952020-03-03 11:25:18 +1100629 TRY(parse_flags(argc, argv));
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100630 if (flags.fail_if_unsandboxed && !sandboxed) {
631 return "main: unsandboxed";
632 }
Nigel Tao01abc842020-03-06 21:42:33 +1100633 const int stdin_fd = 0;
634 if (flags.remaining_argc > ((input_file_descriptor != stdin_fd) ? 1 : 0)) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100635 return usage;
Nigel Tao107f0ef2020-03-01 21:35:02 +1100636 }
637
Nigel Tao0cd2f982020-03-03 23:03:02 +1100638 query.reset(flags.query_c_string);
639
640 // If the query is non-empty, suprress writing to stdout until we've
641 // completed the query.
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100642 suppress_write_dst = query.next_fragment() ? 1 : 0;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100643 wrote_to_dst = false;
644
Nigel Tao2cf76db2020-02-27 22:42:01 +1100645 return dec.initialize(sizeof__wuffs_json__decoder(), WUFFS_VERSION, 0)
646 .message();
647}
Nigel Tao1b073492020-02-16 22:11:36 +1100648
649// ----
650
// ignore_return_value deliberately discards its argument. Wrapping a call in
// it suppresses errors from -Wall -Werror about an unused return value.
static void  //
ignore_return_value(int ignored) {
  (void)ignored;
}
654
Nigel Tao2914bae2020-02-26 09:40:30 +1100655const char* //
656read_src() {
Nigel Taoa8406922020-02-19 12:22:00 +1100657 if (src.meta.closed) {
Nigel Tao9cc2c252020-02-23 17:05:49 +1100658 return "main: internal error: read requested on a closed source";
Nigel Taoa8406922020-02-19 12:22:00 +1100659 }
Nigel Tao1b073492020-02-16 22:11:36 +1100660 src.compact();
661 if (src.meta.wi >= src.data.len) {
662 return "main: src buffer is full";
663 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100664 while (true) {
Nigel Tao01abc842020-03-06 21:42:33 +1100665 ssize_t n = read(input_file_descriptor, src.data.ptr + src.meta.wi,
666 src.data.len - src.meta.wi);
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100667 if (n >= 0) {
668 src.meta.wi += n;
669 src.meta.closed = n == 0;
670 break;
671 } else if (errno != EINTR) {
672 return strerror(errno);
673 }
Nigel Tao1b073492020-02-16 22:11:36 +1100674 }
675 return nullptr;
676}
677
Nigel Tao2914bae2020-02-26 09:40:30 +1100678const char* //
679flush_dst() {
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100680 while (true) {
681 size_t n = dst.meta.wi - dst.meta.ri;
682 if (n == 0) {
683 break;
Nigel Tao1b073492020-02-16 22:11:36 +1100684 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100685 const int stdout_fd = 1;
686 ssize_t i = write(stdout_fd, dst.data.ptr + dst.meta.ri, n);
687 if (i >= 0) {
688 dst.meta.ri += i;
689 } else if (errno != EINTR) {
690 return strerror(errno);
691 }
Nigel Tao1b073492020-02-16 22:11:36 +1100692 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100693 dst.compact();
Nigel Tao1b073492020-02-16 22:11:36 +1100694 return nullptr;
695}
696
Nigel Tao2914bae2020-02-26 09:40:30 +1100697const char* //
698write_dst(const void* s, size_t n) {
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100699 if (suppress_write_dst > 0) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100700 return nullptr;
701 }
Nigel Tao1b073492020-02-16 22:11:36 +1100702 const uint8_t* p = static_cast<const uint8_t*>(s);
703 while (n > 0) {
704 size_t i = dst.writer_available();
705 if (i == 0) {
706 const char* z = flush_dst();
707 if (z) {
708 return z;
709 }
710 i = dst.writer_available();
711 if (i == 0) {
712 return "main: dst buffer is full";
713 }
714 }
715
716 if (i > n) {
717 i = n;
718 }
719 memcpy(dst.data.ptr + dst.meta.wi, p, i);
720 dst.meta.wi += i;
721 p += i;
722 n -= i;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100723 wrote_to_dst = true;
Nigel Tao1b073492020-02-16 22:11:36 +1100724 }
725 return nullptr;
726}
727
728// ----
729
// hex_digit returns the upper-case ASCII hexadecimal digit ('0' to '9' or 'A'
// to 'F') for the low 4 bits of nibble. The high 4 bits are ignored.
uint8_t  //
hex_digit(uint8_t nibble) {
  static const char digits[] = "0123456789ABCDEF";
  return (uint8_t)(digits[nibble & 0x0F]);
}
738
Nigel Tao2914bae2020-02-26 09:40:30 +1100739const char* //
Nigel Tao3b486982020-02-27 15:05:59 +1100740handle_unicode_code_point(uint32_t ucp) {
741 if (ucp < 0x0020) {
742 switch (ucp) {
743 case '\b':
744 return write_dst("\\b", 2);
745 case '\f':
746 return write_dst("\\f", 2);
747 case '\n':
748 return write_dst("\\n", 2);
749 case '\r':
750 return write_dst("\\r", 2);
751 case '\t':
752 return write_dst("\\t", 2);
753 default: {
754 // Other bytes less than 0x0020 are valid UTF-8 but not valid in a
755 // JSON string. They need to remain escaped.
756 uint8_t esc6[6];
757 esc6[0] = '\\';
758 esc6[1] = 'u';
759 esc6[2] = '0';
760 esc6[3] = '0';
761 esc6[4] = hex_digit(ucp >> 4);
762 esc6[5] = hex_digit(ucp >> 0);
763 return write_dst(&esc6[0], 6);
764 }
765 }
766
Nigel Taob9ad34f2020-03-03 12:44:01 +1100767 } else if (ucp == '\"') {
768 return write_dst("\\\"", 2);
769
770 } else if (ucp == '\\') {
771 return write_dst("\\\\", 2);
772
773 } else {
774 uint8_t u[WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL];
775 size_t n = wuffs_base__utf_8__encode(
776 wuffs_base__make_slice_u8(&u[0],
777 WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL),
778 ucp);
779 if (n > 0) {
780 return write_dst(&u[0], n);
Nigel Tao3b486982020-02-27 15:05:59 +1100781 }
Nigel Tao3b486982020-02-27 15:05:59 +1100782 }
783
Nigel Tao2cf76db2020-02-27 22:42:01 +1100784 return "main: internal error: unexpected Unicode code point";
Nigel Tao3b486982020-02-27 15:05:59 +1100785}
786
787const char* //
Nigel Tao2cf76db2020-02-27 22:42:01 +1100788handle_token(wuffs_base__token t) {
789 do {
790 uint64_t vbc = t.value_base_category();
791 uint64_t vbd = t.value_base_detail();
792 uint64_t len = t.length();
Nigel Tao1b073492020-02-16 22:11:36 +1100793
794 // Handle ']' or '}'.
Nigel Tao9f7a2502020-02-23 09:42:02 +1100795 if ((vbc == WUFFS_BASE__TOKEN__VBC__STRUCTURE) &&
Nigel Tao2cf76db2020-02-27 22:42:01 +1100796 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__POP)) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100797 if (query.is_at(depth)) {
798 return "main: no match for query";
799 }
Nigel Tao1b073492020-02-16 22:11:36 +1100800 if (depth <= 0) {
801 return "main: internal error: inconsistent depth";
802 }
803 depth--;
804
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100805 if (query.matched_all() && (depth >= flags.max_output_depth)) {
806 suppress_write_dst--;
807 // '…' is U+2026 HORIZONTAL ELLIPSIS, which is 3 UTF-8 bytes.
808 TRY(write_dst((vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_LIST)
809 ? "\"[…]\""
810 : "\"{…}\"",
811 7));
812 } else {
813 // Write preceding whitespace.
814 if ((ctx != context::in_list_after_bracket) &&
Nigel Tao3690e832020-03-12 16:52:26 +1100815 (ctx != context::in_dict_after_brace) && !flags.compact_output) {
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100816 TRY(write_dst("\n", 1));
817 for (uint32_t i = 0; i < depth; i++) {
818 TRY(write_dst(flags.tabs ? INDENT_TAB_STRING : INDENT_SPACES_STRING,
819 flags.tabs ? 1 : flags.indent));
820 }
Nigel Tao1b073492020-02-16 22:11:36 +1100821 }
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100822
823 TRY(write_dst(
824 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_LIST) ? "]" : "}",
825 1));
Nigel Tao1b073492020-02-16 22:11:36 +1100826 }
827
Nigel Tao9f7a2502020-02-23 09:42:02 +1100828 ctx = (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST)
829 ? context::in_list_after_value
830 : context::in_dict_after_key;
Nigel Tao1b073492020-02-16 22:11:36 +1100831 goto after_value;
832 }
833
Nigel Taod1c928a2020-02-28 12:43:53 +1100834 // Write preceding whitespace and punctuation, if it wasn't ']', '}' or a
835 // continuation of a multi-token chain.
Nigel Tao0cd2f982020-03-03 23:03:02 +1100836 if (!t.link_prev()) {
837 if (ctx == context::in_dict_after_key) {
Nigel Tao3690e832020-03-12 16:52:26 +1100838 TRY(write_dst(": ", flags.compact_output ? 1 : 2));
Nigel Tao0cd2f982020-03-03 23:03:02 +1100839 } else if (ctx != context::none) {
840 if ((ctx != context::in_list_after_bracket) &&
841 (ctx != context::in_dict_after_brace)) {
842 TRY(write_dst(",", 1));
Nigel Tao107f0ef2020-03-01 21:35:02 +1100843 }
Nigel Tao3690e832020-03-12 16:52:26 +1100844 if (!flags.compact_output) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100845 TRY(write_dst("\n", 1));
846 for (size_t i = 0; i < depth; i++) {
Nigel Tao6e7d1412020-03-06 09:21:35 +1100847 TRY(write_dst(flags.tabs ? INDENT_TAB_STRING : INDENT_SPACES_STRING,
848 flags.tabs ? 1 : flags.indent));
Nigel Tao0cd2f982020-03-03 23:03:02 +1100849 }
850 }
851 }
852
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100853 bool query_matched_fragment = false;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100854 if (query.is_at(depth)) {
855 switch (ctx) {
856 case context::in_list_after_bracket:
857 case context::in_list_after_value:
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100858 query_matched_fragment = query.tick();
Nigel Tao0cd2f982020-03-03 23:03:02 +1100859 break;
860 case context::in_dict_after_key:
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100861 query_matched_fragment = query.matched_fragment();
Nigel Tao0cd2f982020-03-03 23:03:02 +1100862 break;
863 }
864 }
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100865 if (!query_matched_fragment) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100866 // No-op.
867 } else if (!query.next_fragment()) {
868 // There is no next fragment. We have matched the complete query, and
869 // the upcoming JSON value is the result of that query.
870 //
871 // Un-suppress writing to stdout and reset the ctx and depth as if we
872 // were about to decode a top-level value. This makes any subsequent
873 // indentation be relative to this point, and we will return eod after
874 // the upcoming JSON value is complete.
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100875 if (suppress_write_dst != 1) {
876 return "main: internal error: inconsistent suppress_write_dst";
877 }
878 suppress_write_dst = 0;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100879 ctx = context::none;
880 depth = 0;
881 } else if ((vbc != WUFFS_BASE__TOKEN__VBC__STRUCTURE) ||
882 !(vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__PUSH)) {
883 // The query has moved on to the next fragment but the upcoming JSON
884 // value is not a container.
885 return "main: no match for query";
Nigel Tao1b073492020-02-16 22:11:36 +1100886 }
887 }
888
889 // Handle the token itself: either a container ('[' or '{') or a simple
Nigel Tao85fba7f2020-02-29 16:28:06 +1100890 // value: string (a chain of raw or escaped parts), literal or number.
Nigel Tao1b073492020-02-16 22:11:36 +1100891 switch (vbc) {
Nigel Tao85fba7f2020-02-29 16:28:06 +1100892 case WUFFS_BASE__TOKEN__VBC__STRUCTURE:
Nigel Tao52c4d6a2020-03-08 21:12:38 +1100893 if (query.matched_all() && (depth >= flags.max_output_depth)) {
894 suppress_write_dst++;
895 } else {
896 TRY(write_dst(
897 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST) ? "[" : "{",
898 1));
899 }
Nigel Tao85fba7f2020-02-29 16:28:06 +1100900 depth++;
901 ctx = (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST)
902 ? context::in_list_after_bracket
903 : context::in_dict_after_brace;
904 return nullptr;
905
Nigel Tao2cf76db2020-02-27 22:42:01 +1100906 case WUFFS_BASE__TOKEN__VBC__STRING:
Nigel Taod1c928a2020-02-28 12:43:53 +1100907 if (!t.link_prev()) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100908 TRY(write_dst("\"", 1));
Nigel Tao0cd2f982020-03-03 23:03:02 +1100909 query.restart_fragment(in_dict_before_key() && query.is_at(depth));
Nigel Tao2cf76db2020-02-27 22:42:01 +1100910 }
Nigel Taocb37a562020-02-28 09:56:24 +1100911
912 if (vbd & WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_0_DST_1_SRC_DROP) {
913 // No-op.
914 } else if (vbd &
915 WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100916 uint8_t* ptr = src.data.ptr + curr_token_end_src_index - len;
917 TRY(write_dst(ptr, len));
918 query.incremental_match_slice(ptr, len);
Nigel Taocb37a562020-02-28 09:56:24 +1100919 } else {
920 return "main: internal error: unexpected string-token conversion";
921 }
922
Nigel Taod1c928a2020-02-28 12:43:53 +1100923 if (t.link_next()) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100924 return nullptr;
925 }
926 TRY(write_dst("\"", 1));
927 goto after_value;
928
929 case WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT:
Nigel Tao0cd2f982020-03-03 23:03:02 +1100930 if (!t.link_prev() || !t.link_next()) {
931 return "main: internal error: unexpected unlinked token";
932 }
933 TRY(handle_unicode_code_point(vbd));
934 query.incremental_match_code_point(vbd);
935 return nullptr;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100936
Nigel Tao85fba7f2020-02-29 16:28:06 +1100937 case WUFFS_BASE__TOKEN__VBC__LITERAL:
Nigel Tao2cf76db2020-02-27 22:42:01 +1100938 case WUFFS_BASE__TOKEN__VBC__NUMBER:
939 TRY(write_dst(src.data.ptr + curr_token_end_src_index - len, len));
940 goto after_value;
Nigel Tao1b073492020-02-16 22:11:36 +1100941 }
942
943 // Return an error if we didn't match the (vbc, vbd) pair.
Nigel Tao2cf76db2020-02-27 22:42:01 +1100944 return "main: internal error: unexpected token";
945 } while (0);
Nigel Tao1b073492020-02-16 22:11:36 +1100946
Nigel Tao2cf76db2020-02-27 22:42:01 +1100947 // Book-keeping after completing a value (whether a container value or a
948 // simple value). Empty parent containers are no longer empty. If the parent
949 // container is a "{...}" object, toggle between keys and values.
950after_value:
951 if (depth == 0) {
952 return eod;
953 }
954 switch (ctx) {
955 case context::in_list_after_bracket:
956 ctx = context::in_list_after_value;
957 break;
958 case context::in_dict_after_brace:
959 ctx = context::in_dict_after_key;
960 break;
961 case context::in_dict_after_key:
962 ctx = context::in_dict_after_value;
963 break;
964 case context::in_dict_after_value:
965 ctx = context::in_dict_after_key;
966 break;
967 }
968 return nullptr;
969}
970
971const char* //
972main1(int argc, char** argv) {
973 TRY(initialize_globals(argc, argv));
974
975 while (true) {
976 wuffs_base__status status = dec.decode_tokens(&tok, &src);
977
978 while (tok.meta.ri < tok.meta.wi) {
979 wuffs_base__token t = tok.data.ptr[tok.meta.ri++];
980 uint64_t n = t.length();
981 if ((src.meta.ri - curr_token_end_src_index) < n) {
982 return "main: internal error: inconsistent src indexes";
983 }
984 curr_token_end_src_index += n;
985
986 if (t.value() == 0) {
987 continue;
988 }
989
990 const char* z = handle_token(t);
991 if (z == nullptr) {
992 continue;
993 } else if (z == eod) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100994 goto end_of_data;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100995 }
996 return z;
Nigel Tao1b073492020-02-16 22:11:36 +1100997 }
Nigel Tao2cf76db2020-02-27 22:42:01 +1100998
999 if (status.repr == nullptr) {
Nigel Tao0cd2f982020-03-03 23:03:02 +11001000 return "main: internal error: unexpected end of token stream";
Nigel Tao2cf76db2020-02-27 22:42:01 +11001001 } else if (status.repr == wuffs_base__suspension__short_read) {
1002 if (curr_token_end_src_index != src.meta.ri) {
1003 return "main: internal error: inconsistent src indexes";
1004 }
1005 TRY(read_src());
1006 curr_token_end_src_index = src.meta.ri;
1007 } else if (status.repr == wuffs_base__suspension__short_write) {
1008 tok.compact();
1009 } else {
1010 return status.message();
Nigel Tao1b073492020-02-16 22:11:36 +11001011 }
1012 }
Nigel Tao0cd2f982020-03-03 23:03:02 +11001013end_of_data:
1014
1015 // With a non-empty query, don't try to consume trailing whitespace or
1016 // confirm that we've processed all the tokens.
1017 if (flags.query_c_string && *flags.query_c_string) {
1018 return nullptr;
1019 }
Nigel Tao6b161af2020-02-24 11:01:48 +11001020
Nigel Tao6b161af2020-02-24 11:01:48 +11001021 // Consume an optional whitespace trailer. This isn't part of the JSON spec,
1022 // but it works better with line oriented Unix tools (such as "echo 123 |
1023 // jsonptr" where it's "echo", not "echo -n") or hand-edited JSON files which
1024 // can accidentally contain trailing whitespace.
1025 //
1026 // A whitespace trailer is zero or more ' ' and then zero or one '\n'.
1027 while (true) {
1028 if (src.meta.ri < src.meta.wi) {
1029 uint8_t c = src.data.ptr[src.meta.ri];
1030 if (c == ' ') {
1031 src.meta.ri++;
1032 continue;
1033 } else if (c == '\n') {
1034 src.meta.ri++;
1035 break;
1036 }
1037 // The "exhausted the input" check below will fail.
1038 break;
1039 } else if (src.meta.closed) {
1040 break;
1041 }
1042 TRY(read_src());
1043 }
1044
1045 // Check that we've exhausted the input.
Nigel Taofe0cbbd2020-03-05 22:01:30 +11001046 if ((src.meta.ri == src.meta.wi) && !src.meta.closed) {
1047 TRY(read_src());
1048 }
Nigel Tao6b161af2020-02-24 11:01:48 +11001049 if ((src.meta.ri < src.meta.wi) || !src.meta.closed) {
1050 return "main: valid JSON followed by further (unexpected) data";
1051 }
1052
1053 // Check that we've used all of the decoded tokens, other than trailing
1054 // filler tokens. For example, a bare `"foo"` string is valid JSON, but even
1055 // without a trailing '\n', the Wuffs JSON parser emits a filler token for
1056 // the final '\"'.
1057 for (; tok.meta.ri < tok.meta.wi; tok.meta.ri++) {
1058 if (tok.data.ptr[tok.meta.ri].value_base_category() !=
1059 WUFFS_BASE__TOKEN__VBC__FILLER) {
1060 return "main: internal error: decoded OK but unprocessed tokens remain";
1061 }
1062 }
1063
1064 return nullptr;
Nigel Tao1b073492020-02-16 22:11:36 +11001065}
1066
Nigel Tao2914bae2020-02-26 09:40:30 +11001067int //
1068compute_exit_code(const char* status_msg) {
Nigel Tao9cc2c252020-02-23 17:05:49 +11001069 if (!status_msg) {
1070 return 0;
1071 }
Nigel Tao01abc842020-03-06 21:42:33 +11001072 size_t n;
1073 if (status_msg == usage) {
1074 n = strlen(status_msg);
1075 } else {
Nigel Tao9cc2c252020-02-23 17:05:49 +11001076 n = strnlen(status_msg, 2047);
Nigel Tao01abc842020-03-06 21:42:33 +11001077 if (n >= 2047) {
1078 status_msg = "main: internal error: error message is too long";
1079 n = strnlen(status_msg, 2047);
1080 }
Nigel Tao9cc2c252020-02-23 17:05:49 +11001081 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +11001082 const int stderr_fd = 2;
1083 ignore_return_value(write(stderr_fd, status_msg, n));
1084 ignore_return_value(write(stderr_fd, "\n", 1));
Nigel Tao9cc2c252020-02-23 17:05:49 +11001085 // Return an exit code of 1 for regular (forseen) errors, e.g. badly
1086 // formatted or unsupported input.
1087 //
1088 // Return an exit code of 2 for internal (exceptional) errors, e.g. defensive
1089 // run-time checks found that an internal invariant did not hold.
1090 //
1091 // Automated testing, including badly formatted inputs, can therefore
1092 // discriminate between expected failure (exit code 1) and unexpected failure
1093 // (other non-zero exit codes). Specifically, exit code 2 for internal
1094 // invariant violation, exit code 139 (which is 128 + SIGSEGV on x86_64
1095 // linux) for a segmentation fault (e.g. null pointer dereference).
1096 return strstr(status_msg, "internal error:") ? 2 : 1;
1097}
1098
Nigel Tao2914bae2020-02-26 09:40:30 +11001099int //
1100main(int argc, char** argv) {
Nigel Tao01abc842020-03-06 21:42:33 +11001101 // Look for an input filename (the first non-flag argument) in argv. If there
1102 // is one, open it (but do not read from it) before we self-impose a sandbox.
1103 //
1104 // Flags start with "-", unless it comes after a bare "--" arg.
1105 {
1106 bool dash_dash = false;
1107 int a;
1108 for (a = 1; a < argc; a++) {
1109 char* arg = argv[a];
1110 if ((arg[0] == '-') && !dash_dash) {
1111 dash_dash = (arg[1] == '-') && (arg[2] == '\x00');
1112 continue;
1113 }
1114 input_file_descriptor = open(arg, O_RDONLY);
1115 if (input_file_descriptor < 0) {
1116 fprintf(stderr, "%s: %s\n", arg, strerror(errno));
1117 return 1;
1118 }
1119 break;
1120 }
1121 }
1122
Nigel Taofe0cbbd2020-03-05 22:01:30 +11001123#if defined(WUFFS_EXAMPLE_USE_SECCOMP)
1124 prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
1125 sandboxed = true;
1126#endif
1127
Nigel Tao0cd2f982020-03-03 23:03:02 +11001128 const char* z = main1(argc, argv);
1129 if (wrote_to_dst) {
1130 const char* z1 = write_dst("\n", 1);
1131 const char* z2 = flush_dst();
1132 z = z ? z : (z1 ? z1 : z2);
1133 }
1134 int exit_code = compute_exit_code(z);
Nigel Taofe0cbbd2020-03-05 22:01:30 +11001135
1136#if defined(WUFFS_EXAMPLE_USE_SECCOMP)
1137 // Call SYS_exit explicitly, instead of calling SYS_exit_group implicitly by
1138 // either calling _exit or returning from main. SECCOMP_MODE_STRICT allows
1139 // only SYS_exit.
1140 syscall(SYS_exit, exit_code);
1141#endif
Nigel Tao9cc2c252020-02-23 17:05:49 +11001142 return exit_code;
Nigel Tao1b073492020-02-16 22:11:36 +11001143}