blob: 70bc44ec678cd3f5893c15cf9992a43fe3fd9ecb [file] [log] [blame]
Nigel Tao1b073492020-02-16 22:11:36 +11001// Copyright 2020 The Wuffs Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// ----------------
16
17/*
Nigel Tao0cd2f982020-03-03 23:03:02 +110018jsonptr is a JSON formatter (pretty-printer) that supports the JSON Pointer
19(RFC 6901) query syntax. It reads UTF-8 JSON from stdin and writes
20canonicalized, formatted UTF-8 JSON to stdout.
21
22See the "const char* usage" string below for details.
23
24----
25
26JSON Pointer (and this program's implementation) is one of many JSON query
27languages and JSON tools, such as jq, jql and JMESPath. This one is relatively
28simple and fewer-featured compared to those others.
29
30One benefit of simplicity is that this program's JSON and JSON Pointer
31implementations do not dynamically allocate or free memory (yet it does not
32require that the entire input fits in memory at once). They are therefore
33trivially protected against certain bug classes: memory leaks, double-frees and
34use-after-frees.
35
36The core JSON implementation is also written in the Wuffs programming language
37(and then transpiled to C/C++), which is memory-safe but also guards against
38integer arithmetic overflows.
39
Nigel Taofe0cbbd2020-03-05 22:01:30 +110040For defense in depth, on Linux, this program also self-imposes a
41SECCOMP_MODE_STRICT sandbox before reading (or otherwise processing) its input
42or writing its output. Under this sandbox, the only permitted system calls are
43read, write, exit and sigreturn.
44
Nigel Tao0cd2f982020-03-03 23:03:02 +110045All together, this program aims to safely handle untrusted JSON files without
46fear of security bugs such as remote code execution.
47
48----
Nigel Tao1b073492020-02-16 22:11:36 +110049
Nigel Taoc5b3a9e2020-02-24 11:54:35 +110050As of 2020-02-24, this program passes all 318 "test_parsing" cases from the
51JSON test suite (https://github.com/nst/JSONTestSuite), an appendix to the
52"Parsing JSON is a Minefield" article (http://seriot.ch/parsing_json.php) that
53was first published on 2016-10-26 and updated on 2018-03-30.
54
Nigel Tao0cd2f982020-03-03 23:03:02 +110055After modifying this program, run "build-example.sh example/jsonptr/" and then
56"script/run-json-test-suite.sh" to catch correctness regressions.
57
58----
59
Nigel Tao1b073492020-02-16 22:11:36 +110060This example program differs from most other example Wuffs programs in that it
61is written in C++, not C.
62
63$CXX jsonptr.cc && ./a.out < ../../test/data/github-tags.json; rm -f a.out
64
65for a C++ compiler $CXX, such as clang++ or g++.
66*/
67
Nigel Taofe0cbbd2020-03-05 22:01:30 +110068#include <errno.h>
Nigel Tao9cc2c252020-02-23 17:05:49 +110069#include <string.h>
Nigel Taofe0cbbd2020-03-05 22:01:30 +110070#include <unistd.h>
Nigel Tao1b073492020-02-16 22:11:36 +110071
72// Wuffs ships as a "single file C library" or "header file library" as per
73// https://github.com/nothings/stb/blob/master/docs/stb_howto.txt
74//
75// To use that single file as a "foo.c"-like implementation, instead of a
76// "foo.h"-like header, #define WUFFS_IMPLEMENTATION before #include'ing or
77// compiling it.
78#define WUFFS_IMPLEMENTATION
79
// Defining the WUFFS_CONFIG__MODULE* macros is optional, but doing so lets
// users of release/c/etc.c whitelist which parts of Wuffs to build. That file
// contains the entire Wuffs standard library, implementing a variety of codecs
// and file formats. Without this macro definition, an optimizing compiler or
// linker may very well discard Wuffs code for unused codecs, but listing the
// Wuffs modules we use makes that process explicit. Preprocessing means that
// such code simply isn't compiled.
87#define WUFFS_CONFIG__MODULES
88#define WUFFS_CONFIG__MODULE__BASE
89#define WUFFS_CONFIG__MODULE__JSON
90
91// If building this program in an environment that doesn't easily accommodate
92// relative includes, you can use the script/inline-c-relative-includes.go
93// program to generate a stand-alone C++ file.
94#include "../../release/c/wuffs-unsupported-snapshot.c"
95
Nigel Taofe0cbbd2020-03-05 22:01:30 +110096#if defined(__linux__)
97#include <linux/prctl.h>
98#include <linux/seccomp.h>
99#include <sys/prctl.h>
100#include <sys/syscall.h>
101#define WUFFS_EXAMPLE_USE_SECCOMP
102#endif
103
// TRY evaluates error_msg (an expression yielding a const char*) exactly once.
// If the result is non-null, it is returned from the enclosing function, which
// must therefore itself return a const char*. A null result falls through.
#define TRY(error_msg)           \
  do {                           \
    const char* z = (error_msg); \
    if (z != nullptr) {          \
      return z;                  \
    }                            \
  } while (0)
111
// eod is a sentinel "error message" meaning that the top-level JSON value is
// complete. Callers compare against it by pointer identity (z == eod), so the
// pointer itself is const: re-pointing it would silently break that check.
static const char* const eod = "main: end of data";

// usage is the help text, also returned as the error message whenever the
// command line flags cannot be parsed.
static const char* const usage =
    "Usage: jsonptr -flags < input.json\n"
    "\n"
    "Note the \"<\". It only reads from stdin, not named files.\n"
    "\n"
    "jsonptr is a JSON formatter (pretty-printer) that supports the JSON\n"
    "Pointer (RFC 6901) query syntax. It reads UTF-8 JSON from stdin and\n"
    "writes canonicalized, formatted UTF-8 JSON to stdout.\n"
    "\n"
    "Canonicalized means that e.g. \"abc\\u000A\\tx\\u0177z\" is re-written\n"
    "as \"abc\\n\\txŷz\". It does not sort object keys, nor does it reject\n"
    "duplicate keys.\n"
    "\n"
    "Formatted means that arrays' and objects' elements are indented, each\n"
    "on its own line. Configure this with the -c / -compact, -i=N / -indent=N\n"
    "(for N ranging from 0 to 8) and -t / -tabs flags.\n"
    "\n"
    "The -q=etc or -query=etc flag gives an optional JSON Pointer query, to\n"
    "print a subset of the input. For example, given RFC 6901 section 5's\n"
    "[sample input](https://tools.ietf.org/rfc/rfc6901.txt), this command:\n"
    " jsonptr -query=/foo/1 < rfc-6901-json-pointer.json\n"
    "will print:\n"
    " \"baz\"\n"
    "\n"
    "An absent query is equivalent to the empty query, which identifies the\n"
    "entire input (the root value). The \"/\" query is not equivalent to the\n"
    "root value. Instead, it identifies the child (the key-value pair) of the\n"
    "root value whose key is the empty string.\n"
    "\n"
    "If the query found a valid JSON value, this program will return a zero\n"
    "exit code even if the rest of the input isn't valid JSON. If the query\n"
    "did not find a value, or found an invalid one, this program returns a\n"
    "non-zero exit code, but may still print partial output to stdout.\n"
    "\n"
    "The [JSON specification](https://json.org/) permits implementations that\n"
    "allow duplicate keys, as this one does. This JSON Pointer implementation\n"
    "is also greedy, following the first match for each fragment without\n"
    "back-tracking. For example, the \"/foo/bar\" query will fail if the root\n"
    "object has multiple \"foo\" children but the first one doesn't have a\n"
    "\"bar\" child, even if later ones do.\n"
    "\n"
    "The -fail-if-unsandboxed flag causes the program to exit if it does not\n"
    "self-impose a sandbox. On Linux, it self-imposes a SECCOMP_MODE_STRICT\n"
    "sandbox, regardless of this flag.";
Nigel Tao0cd2f982020-03-03 23:03:02 +1100158
Nigel Tao2cf76db2020-02-27 22:42:01 +1100159// ----
160
// sandboxed starts out false. It is consulted when the -fail-if-unsandboxed
// flag is given; presumably it is set once a sandbox has been imposed (that
// code is not in this part of the file -- confirm).
bool sandboxed = false;

// MAX_INDENT is the largest N accepted by the -i=N / -indent=N flag.
//
// The two INDENT_ETC_STRING macros are the source bytes for one level of
// indentation. With spaces, the first flags.indent bytes of
// INDENT_SPACES_STRING are copied per level, so that string must hold at least
// MAX_INDENT spaces or the copy would read past the end of the literal.
#define MAX_INDENT 8
#define INDENT_SPACES_STRING "        "
#define INDENT_TAB_STRING "\t"

static_assert((sizeof(INDENT_SPACES_STRING) - 1) >= MAX_INDENT,
              "INDENT_SPACES_STRING must hold at least MAX_INDENT spaces");
Nigel Tao107f0ef2020-03-01 21:35:02 +1100166
// Default capacities of the statically allocated work buffers. Each one can be
// overridden at compile time (e.g. -DSRC_BUFFER_SIZE=65536). The DST and SRC
// sizes count bytes; the TOKEN size counts wuffs_base__token elements.
#if !defined(DST_BUFFER_SIZE)
#define DST_BUFFER_SIZE (32 * 1024)
#endif

#if !defined(SRC_BUFFER_SIZE)
#define SRC_BUFFER_SIZE (32 * 1024)
#endif

#if !defined(TOKEN_BUFFER_SIZE)
#define TOKEN_BUFFER_SIZE (4 * 1024)
#endif
176
Nigel Tao2cf76db2020-02-27 22:42:01 +1100177uint8_t dst_array[DST_BUFFER_SIZE];
178uint8_t src_array[SRC_BUFFER_SIZE];
179wuffs_base__token tok_array[TOKEN_BUFFER_SIZE];
Nigel Tao1b073492020-02-16 22:11:36 +1100180
181wuffs_base__io_buffer dst;
182wuffs_base__io_buffer src;
183wuffs_base__token_buffer tok;
184
Nigel Tao2cf76db2020-02-27 22:42:01 +1100185// curr_token_end_src_index is the src.data.ptr index of the end of the current
186// token. An invariant is that (curr_token_end_src_index <= src.meta.ri).
187size_t curr_token_end_src_index;
188
Nigel Tao0cd2f982020-03-03 23:03:02 +1100189uint32_t depth;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100190
// context says where the formatter currently is relative to the innermost
// enclosing container: nowhere (top level), inside a list "[...]" or inside a
// dict "{...}", and what was most recently seen there.
enum class context {
  none,
  in_list_after_bracket,
  in_list_after_value,
  in_dict_after_brace,
  in_dict_after_key,
  in_dict_after_value,
};

// ctx is the current context.
context ctx;
199
Nigel Tao0cd2f982020-03-03 23:03:02 +1100200bool //
201in_dict_before_key() {
202 return (ctx == context::in_dict_after_brace) ||
203 (ctx == context::in_dict_after_value);
204}
205
206bool suppress_write_dst;
207bool wrote_to_dst;
208
Nigel Tao1b073492020-02-16 22:11:36 +1100209wuffs_json__decoder dec;
Nigel Tao1b073492020-02-16 22:11:36 +1100210
Nigel Tao0cd2f982020-03-03 23:03:02 +1100211// ----
212
213// Query is a JSON Pointer query. After initializing with a NUL-terminated C
214// string, its multiple fragments are consumed as the program walks the JSON
215// data from stdin. For example, letting "$" denote a NUL, suppose that we
216// started with a query string of "/apple/banana/12/durian" and are currently
217// trying to match the second fragment, "banana", so that Query::depth is 2:
218//
//  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//  / a p p l e / b a n a n a / 1 2 / d u r i a n $
//  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//                ^           ^
//                frag_i      frag_k
//
// The two pointers frag_i and frag_k are the start (inclusive) and end
// (exclusive) of the fragment. They satisfy (frag_i <= frag_k) and may be
// equal if the fragment is empty (note that "" is a valid JSON object key).
228//
229// The frag_j pointer moves between these two, or is nullptr. An invariant is
230// that (((frag_i <= frag_j) && (frag_j <= frag_k)) || (frag_j == nullptr)).
231//
232// Wuffs' JSON tokenizer can portray a single JSON string as multiple Wuffs
233// tokens, as backslash-escaped values within that JSON string may each get
234// their own token.
235//
236// At the start of each object key (a JSON string), frag_j is set to frag_i.
237//
238// While frag_j remains non-nullptr, each token's unescaped contents are then
239// compared to that part of the fragment from frag_j to frag_k. If it is a
240// prefix (including the case of an exact match), then frag_j is advanced by
241// the unescaped length. Otherwise, frag_j is set to nullptr.
242//
243// Comparison accounts for JSON Pointer's escaping notation: "~0" and "~1" in
244// the query (not the JSON value) are unescaped to "~" and "/" respectively.
245//
246// The frag_j pointer therefore advances from frag_i to frag_k, or drops out,
247// as we incrementally match the object key with the query fragment. For
248// example, if we have already matched the "ban" of "banana", then we would
249// accept any of an "ana" token, an "a" token or a "\u0061" token, amongst
250// others. They would advance frag_j by 3, 1 or 1 bytes respectively.
251//
//                      frag_j
//                      v
//  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//  / a p p l e / b a n a n a / 1 2 / d u r i a n $
//  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//                ^           ^
//                frag_i      frag_k
259//
260// At the end of each object key (or equivalently, at the start of each object
261// value), if frag_j is non-nullptr and equal to (but not less than) frag_k
262// then we have a fragment match: the query fragment equals the object key. If
263// there is a next fragment (in this example, "12") we move the frag_etc
264// pointers to its start and end and increment Query::depth. Otherwise, we have
265// matched the complete query, and the upcoming JSON value is the result of
266// that query.
267//
268// The discussion above centers on object keys. If the query fragment is
269// numeric then it can also match as an array index: the string fragment "12"
270// will match an array's 13th element (starting counting from zero). See RFC
271// 6901 for its precise definition of an "array index" number.
272//
273// Array index fragment match is represented by the Query::array_index field,
274// whose type (wuffs_base__result_u64) is a result type. An error result means
275// that the fragment is not an array index. A value result holds the number of
276// list elements remaining. When matching a query fragment in an array (instead
277// of in an object), each element ticks this number down towards zero. At zero,
278// the upcoming JSON value is the one that matches the query fragment.
279class Query {
280 private:
281 uint8_t* frag_i;
282 uint8_t* frag_j;
283 uint8_t* frag_k;
284
285 uint32_t depth;
286
287 wuffs_base__result_u64 array_index;
288
289 public:
290 void reset(char* query_c_string) {
291 this->frag_i = (uint8_t*)query_c_string;
292 this->frag_j = (uint8_t*)query_c_string;
293 this->frag_k = (uint8_t*)query_c_string;
294 this->depth = 0;
295 this->array_index.status.repr = "#main: not an array index query fragment";
296 this->array_index.value = 0;
297 }
298
299 void restart_fragment(bool enable) {
300 this->frag_j = enable ? this->frag_i : nullptr;
301 }
302
303 bool is_at(uint32_t depth) { return this->depth == depth; }
304
305 // tick returns whether the fragment is a valid array index whose value is
306 // zero. If valid but non-zero, it decrements it and returns false.
307 bool tick() {
308 if (this->array_index.status.is_ok()) {
309 if (this->array_index.value == 0) {
310 return true;
311 }
312 this->array_index.value--;
313 }
314 return false;
315 }
316
317 // next_fragment moves to the next fragment, returning whether it existed.
318 bool next_fragment() {
319 uint8_t* k = this->frag_k;
320 uint32_t d = this->depth;
321
322 this->reset(nullptr);
323
324 if (!k || (*k != '/')) {
325 return false;
326 }
327 k++;
328
329 bool all_digits = true;
330 uint8_t* i = k;
331 while ((*k != '\x00') && (*k != '/')) {
332 all_digits = all_digits && ('0' <= *k) && (*k <= '9');
333 k++;
334 }
335 this->frag_i = i;
336 this->frag_j = i;
337 this->frag_k = k;
338 this->depth = d + 1;
339 if (all_digits) {
340 // wuffs_base__parse_number_u64 rejects leading zeroes, e.g. "00", "07".
341 this->array_index =
342 wuffs_base__parse_number_u64(wuffs_base__make_slice_u8(i, k - i));
343 }
344 return true;
345 }
346
347 bool matched() { return this->frag_j && (this->frag_j == this->frag_k); }
348
349 void incremental_match_slice(uint8_t* ptr, size_t len) {
350 if (!this->frag_j) {
351 return;
352 }
353 uint8_t* j = this->frag_j;
354 while (true) {
355 if (len == 0) {
356 this->frag_j = j;
357 return;
358 }
359
360 if (*j == '\x00') {
361 break;
362
363 } else if (*j == '~') {
364 j++;
365 if (*j == '0') {
366 if (*ptr != '~') {
367 break;
368 }
369 } else if (*j == '1') {
370 if (*ptr != '/') {
371 break;
372 }
373 } else {
374 break;
375 }
376
377 } else if (*j != *ptr) {
378 break;
379 }
380
381 j++;
382 ptr++;
383 len--;
384 }
385 this->frag_j = nullptr;
386 }
387
388 void incremental_match_code_point(uint32_t code_point) {
389 if (!this->frag_j) {
390 return;
391 }
392 uint8_t u[WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL];
393 size_t n = wuffs_base__utf_8__encode(
394 wuffs_base__make_slice_u8(&u[0],
395 WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL),
396 code_point);
397 if (n > 0) {
398 this->incremental_match_slice(&u[0], n);
399 }
400 }
401
402 // validate returns whether the (ptr, len) arguments form a valid JSON
403 // Pointer. In particular, it must be valid UTF-8, and either be empty or
404 // start with a '/'. Any '~' within must immediately be followed by either
405 // '0' or '1'.
406 static bool validate(char* query_c_string, size_t length) {
407 if (length <= 0) {
408 return true;
409 }
410 if (query_c_string[0] != '/') {
411 return false;
412 }
413 wuffs_base__slice_u8 s =
414 wuffs_base__make_slice_u8((uint8_t*)query_c_string, length);
415 bool previous_was_tilde = false;
416 while (s.len > 0) {
417 wuffs_base__utf_8__next__output o = wuffs_base__utf_8__next(s);
418 if (!o.is_valid()) {
419 return false;
420 }
421 if (previous_was_tilde && (o.code_point != '0') &&
422 (o.code_point != '1')) {
423 return false;
424 }
425 previous_was_tilde = o.code_point == '~';
426 s.ptr += o.byte_length;
427 s.len -= o.byte_length;
428 }
429 return !previous_was_tilde;
430 }
431} query;
432
433// ----
434
// flags holds the parsed command line. It is filled in by parse_flags.
struct {
  // The command line arguments left over after the flags.
  int remaining_argc;
  char** remaining_argv;

  // -c / -compact: do not emit newlines or indentation.
  bool compact;
  // -fail-if-unsandboxed: refuse to run without a self-imposed sandbox.
  bool fail_if_unsandboxed;
  // -i=N / -indent=N: the number of spaces per indentation level.
  size_t indent;
  // -q=etc / -query=etc: the JSON Pointer query, or NULL if absent.
  char* query_c_string;
  // -t / -tabs: indent with one tab per level instead of spaces.
  bool tabs;
} flags = {0};
445
446const char* //
447parse_flags(int argc, char** argv) {
Nigel Tao6e7d1412020-03-06 09:21:35 +1100448 flags.indent = 4;
Nigel Tao68920952020-03-03 11:25:18 +1100449
450 int c = (argc > 0) ? 1 : 0; // Skip argv[0], the program name.
451 for (; c < argc; c++) {
452 char* arg = argv[c];
453 if (*arg++ != '-') {
454 break;
455 }
456
457 // A double-dash "--foo" is equivalent to a single-dash "-foo". As special
458 // cases, a bare "-" is not a flag (some programs may interpret it as
459 // stdin) and a bare "--" means to stop parsing flags.
460 if (*arg == '\x00') {
461 break;
462 } else if (*arg == '-') {
463 arg++;
464 if (*arg == '\x00') {
465 c++;
466 break;
467 }
468 }
469
470 if (!strcmp(arg, "c") || !strcmp(arg, "compact")) {
471 flags.compact = true;
472 continue;
473 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100474 if (!strcmp(arg, "fail-if-unsandboxed")) {
475 flags.fail_if_unsandboxed = true;
476 continue;
477 }
Nigel Tao68920952020-03-03 11:25:18 +1100478 if (!strncmp(arg, "i=", 2) || !strncmp(arg, "indent=", 7)) {
479 while (*arg++ != '=') {
480 }
481 if (('0' <= arg[0]) && (arg[0] <= '8') && (arg[1] == '\x00')) {
482 flags.indent = arg[0] - '0';
Nigel Tao68920952020-03-03 11:25:18 +1100483 continue;
484 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100485 return usage;
486 }
487 if (!strncmp(arg, "q=", 2) || !strncmp(arg, "query=", 6)) {
488 while (*arg++ != '=') {
489 }
490 if (Query::validate(arg, strlen(arg))) {
491 flags.query_c_string = arg;
492 continue;
493 }
494 return usage;
Nigel Tao68920952020-03-03 11:25:18 +1100495 }
496 if (!strcmp(arg, "t") || !strcmp(arg, "tabs")) {
497 flags.tabs = true;
498 continue;
499 }
500
Nigel Tao0cd2f982020-03-03 23:03:02 +1100501 return usage;
Nigel Tao68920952020-03-03 11:25:18 +1100502 }
503
504 flags.remaining_argc = argc - c;
505 flags.remaining_argv = argv + c;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100506 return nullptr;
Nigel Tao68920952020-03-03 11:25:18 +1100507}
508
Nigel Tao2cf76db2020-02-27 22:42:01 +1100509const char* //
510initialize_globals(int argc, char** argv) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100511 dst = wuffs_base__make_io_buffer(
512 wuffs_base__make_slice_u8(dst_array, DST_BUFFER_SIZE),
513 wuffs_base__empty_io_buffer_meta());
Nigel Tao1b073492020-02-16 22:11:36 +1100514
Nigel Tao2cf76db2020-02-27 22:42:01 +1100515 src = wuffs_base__make_io_buffer(
516 wuffs_base__make_slice_u8(src_array, SRC_BUFFER_SIZE),
517 wuffs_base__empty_io_buffer_meta());
518
519 tok = wuffs_base__make_token_buffer(
520 wuffs_base__make_slice_token(tok_array, TOKEN_BUFFER_SIZE),
521 wuffs_base__empty_token_buffer_meta());
522
523 curr_token_end_src_index = 0;
524
Nigel Tao2cf76db2020-02-27 22:42:01 +1100525 depth = 0;
526
527 ctx = context::none;
528
Nigel Tao68920952020-03-03 11:25:18 +1100529 TRY(parse_flags(argc, argv));
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100530 if (flags.fail_if_unsandboxed && !sandboxed) {
531 return "main: unsandboxed";
532 }
Nigel Tao68920952020-03-03 11:25:18 +1100533 if (flags.remaining_argc > 0) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100534 return usage;
Nigel Tao107f0ef2020-03-01 21:35:02 +1100535 }
536
Nigel Tao0cd2f982020-03-03 23:03:02 +1100537 query.reset(flags.query_c_string);
538
539 // If the query is non-empty, suprress writing to stdout until we've
540 // completed the query.
541 suppress_write_dst = query.next_fragment();
542 wrote_to_dst = false;
543
Nigel Tao2cf76db2020-02-27 22:42:01 +1100544 return dec.initialize(sizeof__wuffs_json__decoder(), WUFFS_VERSION, 0)
545 .message();
546}
Nigel Tao1b073492020-02-16 22:11:36 +1100547
548// ----
549
// ignore_return_value deliberately discards its argument. Passing a call's
// result through it suppresses "unused result" errors from -Wall -Werror.
static void  //
ignore_return_value(int ignored) {
  (void)ignored;
}
553
Nigel Tao2914bae2020-02-26 09:40:30 +1100554const char* //
555read_src() {
Nigel Taoa8406922020-02-19 12:22:00 +1100556 if (src.meta.closed) {
Nigel Tao9cc2c252020-02-23 17:05:49 +1100557 return "main: internal error: read requested on a closed source";
Nigel Taoa8406922020-02-19 12:22:00 +1100558 }
Nigel Tao1b073492020-02-16 22:11:36 +1100559 src.compact();
560 if (src.meta.wi >= src.data.len) {
561 return "main: src buffer is full";
562 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100563 while (true) {
564 const int stdin_fd = 0;
565 ssize_t n =
566 read(stdin_fd, src.data.ptr + src.meta.wi, src.data.len - src.meta.wi);
567 if (n >= 0) {
568 src.meta.wi += n;
569 src.meta.closed = n == 0;
570 break;
571 } else if (errno != EINTR) {
572 return strerror(errno);
573 }
Nigel Tao1b073492020-02-16 22:11:36 +1100574 }
575 return nullptr;
576}
577
Nigel Tao2914bae2020-02-26 09:40:30 +1100578const char* //
579flush_dst() {
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100580 while (true) {
581 size_t n = dst.meta.wi - dst.meta.ri;
582 if (n == 0) {
583 break;
Nigel Tao1b073492020-02-16 22:11:36 +1100584 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100585 const int stdout_fd = 1;
586 ssize_t i = write(stdout_fd, dst.data.ptr + dst.meta.ri, n);
587 if (i >= 0) {
588 dst.meta.ri += i;
589 } else if (errno != EINTR) {
590 return strerror(errno);
591 }
Nigel Tao1b073492020-02-16 22:11:36 +1100592 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100593 dst.compact();
Nigel Tao1b073492020-02-16 22:11:36 +1100594 return nullptr;
595}
596
Nigel Tao2914bae2020-02-26 09:40:30 +1100597const char* //
598write_dst(const void* s, size_t n) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100599 if (suppress_write_dst) {
600 return nullptr;
601 }
Nigel Tao1b073492020-02-16 22:11:36 +1100602 const uint8_t* p = static_cast<const uint8_t*>(s);
603 while (n > 0) {
604 size_t i = dst.writer_available();
605 if (i == 0) {
606 const char* z = flush_dst();
607 if (z) {
608 return z;
609 }
610 i = dst.writer_available();
611 if (i == 0) {
612 return "main: dst buffer is full";
613 }
614 }
615
616 if (i > n) {
617 i = n;
618 }
619 memcpy(dst.data.ptr + dst.meta.wi, p, i);
620 dst.meta.wi += i;
621 p += i;
622 n -= i;
Nigel Tao0cd2f982020-03-03 23:03:02 +1100623 wrote_to_dst = true;
Nigel Tao1b073492020-02-16 22:11:36 +1100624 }
625 return nullptr;
626}
627
628// ----
629
// hex_digit returns the upper-case ASCII hexadecimal digit for the low four
// bits of nibble. The high four bits are ignored.
uint8_t  //
hex_digit(uint8_t nibble) {
  static const char digits[] = "0123456789ABCDEF";
  return (uint8_t)(digits[nibble & 0x0F]);
}
638
Nigel Tao2914bae2020-02-26 09:40:30 +1100639const char* //
Nigel Tao3b486982020-02-27 15:05:59 +1100640handle_unicode_code_point(uint32_t ucp) {
641 if (ucp < 0x0020) {
642 switch (ucp) {
643 case '\b':
644 return write_dst("\\b", 2);
645 case '\f':
646 return write_dst("\\f", 2);
647 case '\n':
648 return write_dst("\\n", 2);
649 case '\r':
650 return write_dst("\\r", 2);
651 case '\t':
652 return write_dst("\\t", 2);
653 default: {
654 // Other bytes less than 0x0020 are valid UTF-8 but not valid in a
655 // JSON string. They need to remain escaped.
656 uint8_t esc6[6];
657 esc6[0] = '\\';
658 esc6[1] = 'u';
659 esc6[2] = '0';
660 esc6[3] = '0';
661 esc6[4] = hex_digit(ucp >> 4);
662 esc6[5] = hex_digit(ucp >> 0);
663 return write_dst(&esc6[0], 6);
664 }
665 }
666
Nigel Taob9ad34f2020-03-03 12:44:01 +1100667 } else if (ucp == '\"') {
668 return write_dst("\\\"", 2);
669
670 } else if (ucp == '\\') {
671 return write_dst("\\\\", 2);
672
673 } else {
674 uint8_t u[WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL];
675 size_t n = wuffs_base__utf_8__encode(
676 wuffs_base__make_slice_u8(&u[0],
677 WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL),
678 ucp);
679 if (n > 0) {
680 return write_dst(&u[0], n);
Nigel Tao3b486982020-02-27 15:05:59 +1100681 }
Nigel Tao3b486982020-02-27 15:05:59 +1100682 }
683
Nigel Tao2cf76db2020-02-27 22:42:01 +1100684 return "main: internal error: unexpected Unicode code point";
Nigel Tao3b486982020-02-27 15:05:59 +1100685}
686
687const char* //
Nigel Tao2cf76db2020-02-27 22:42:01 +1100688handle_token(wuffs_base__token t) {
689 do {
690 uint64_t vbc = t.value_base_category();
691 uint64_t vbd = t.value_base_detail();
692 uint64_t len = t.length();
Nigel Tao1b073492020-02-16 22:11:36 +1100693
694 // Handle ']' or '}'.
Nigel Tao9f7a2502020-02-23 09:42:02 +1100695 if ((vbc == WUFFS_BASE__TOKEN__VBC__STRUCTURE) &&
Nigel Tao2cf76db2020-02-27 22:42:01 +1100696 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__POP)) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100697 if (query.is_at(depth)) {
698 return "main: no match for query";
699 }
Nigel Tao1b073492020-02-16 22:11:36 +1100700 if (depth <= 0) {
701 return "main: internal error: inconsistent depth";
702 }
703 depth--;
704
705 // Write preceding whitespace.
706 if ((ctx != context::in_list_after_bracket) &&
Nigel Tao68920952020-03-03 11:25:18 +1100707 (ctx != context::in_dict_after_brace) && !flags.compact) {
Nigel Tao1b073492020-02-16 22:11:36 +1100708 TRY(write_dst("\n", 1));
Nigel Tao0cd2f982020-03-03 23:03:02 +1100709 for (uint32_t i = 0; i < depth; i++) {
Nigel Tao6e7d1412020-03-06 09:21:35 +1100710 TRY(write_dst(flags.tabs ? INDENT_TAB_STRING : INDENT_SPACES_STRING,
711 flags.tabs ? 1 : flags.indent));
Nigel Tao1b073492020-02-16 22:11:36 +1100712 }
713 }
714
Nigel Tao9f7a2502020-02-23 09:42:02 +1100715 TRY(write_dst(
716 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_LIST) ? "]" : "}", 1));
717 ctx = (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST)
718 ? context::in_list_after_value
719 : context::in_dict_after_key;
Nigel Tao1b073492020-02-16 22:11:36 +1100720 goto after_value;
721 }
722
Nigel Taod1c928a2020-02-28 12:43:53 +1100723 // Write preceding whitespace and punctuation, if it wasn't ']', '}' or a
724 // continuation of a multi-token chain.
Nigel Tao0cd2f982020-03-03 23:03:02 +1100725 if (!t.link_prev()) {
726 if (ctx == context::in_dict_after_key) {
727 TRY(write_dst(": ", flags.compact ? 1 : 2));
728 } else if (ctx != context::none) {
729 if ((ctx != context::in_list_after_bracket) &&
730 (ctx != context::in_dict_after_brace)) {
731 TRY(write_dst(",", 1));
Nigel Tao107f0ef2020-03-01 21:35:02 +1100732 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100733 if (!flags.compact) {
734 TRY(write_dst("\n", 1));
735 for (size_t i = 0; i < depth; i++) {
Nigel Tao6e7d1412020-03-06 09:21:35 +1100736 TRY(write_dst(flags.tabs ? INDENT_TAB_STRING : INDENT_SPACES_STRING,
737 flags.tabs ? 1 : flags.indent));
Nigel Tao0cd2f982020-03-03 23:03:02 +1100738 }
739 }
740 }
741
742 bool query_matched = false;
743 if (query.is_at(depth)) {
744 switch (ctx) {
745 case context::in_list_after_bracket:
746 case context::in_list_after_value:
747 query_matched = query.tick();
748 break;
749 case context::in_dict_after_key:
750 query_matched = query.matched();
751 break;
752 }
753 }
754 if (!query_matched) {
755 // No-op.
756 } else if (!query.next_fragment()) {
757 // There is no next fragment. We have matched the complete query, and
758 // the upcoming JSON value is the result of that query.
759 //
760 // Un-suppress writing to stdout and reset the ctx and depth as if we
761 // were about to decode a top-level value. This makes any subsequent
762 // indentation be relative to this point, and we will return eod after
763 // the upcoming JSON value is complete.
764 suppress_write_dst = false;
765 ctx = context::none;
766 depth = 0;
767 } else if ((vbc != WUFFS_BASE__TOKEN__VBC__STRUCTURE) ||
768 !(vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__PUSH)) {
769 // The query has moved on to the next fragment but the upcoming JSON
770 // value is not a container.
771 return "main: no match for query";
Nigel Tao1b073492020-02-16 22:11:36 +1100772 }
773 }
774
775 // Handle the token itself: either a container ('[' or '{') or a simple
Nigel Tao85fba7f2020-02-29 16:28:06 +1100776 // value: string (a chain of raw or escaped parts), literal or number.
Nigel Tao1b073492020-02-16 22:11:36 +1100777 switch (vbc) {
Nigel Tao85fba7f2020-02-29 16:28:06 +1100778 case WUFFS_BASE__TOKEN__VBC__STRUCTURE:
779 TRY(write_dst(
780 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST) ? "[" : "{", 1));
781 depth++;
782 ctx = (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST)
783 ? context::in_list_after_bracket
784 : context::in_dict_after_brace;
785 return nullptr;
786
Nigel Tao2cf76db2020-02-27 22:42:01 +1100787 case WUFFS_BASE__TOKEN__VBC__STRING:
Nigel Taod1c928a2020-02-28 12:43:53 +1100788 if (!t.link_prev()) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100789 TRY(write_dst("\"", 1));
Nigel Tao0cd2f982020-03-03 23:03:02 +1100790 query.restart_fragment(in_dict_before_key() && query.is_at(depth));
Nigel Tao2cf76db2020-02-27 22:42:01 +1100791 }
Nigel Taocb37a562020-02-28 09:56:24 +1100792
793 if (vbd & WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_0_DST_1_SRC_DROP) {
794 // No-op.
795 } else if (vbd &
796 WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100797 uint8_t* ptr = src.data.ptr + curr_token_end_src_index - len;
798 TRY(write_dst(ptr, len));
799 query.incremental_match_slice(ptr, len);
Nigel Taocb37a562020-02-28 09:56:24 +1100800 } else {
801 return "main: internal error: unexpected string-token conversion";
802 }
803
Nigel Taod1c928a2020-02-28 12:43:53 +1100804 if (t.link_next()) {
Nigel Tao2cf76db2020-02-27 22:42:01 +1100805 return nullptr;
806 }
807 TRY(write_dst("\"", 1));
808 goto after_value;
809
810 case WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT:
Nigel Tao0cd2f982020-03-03 23:03:02 +1100811 if (!t.link_prev() || !t.link_next()) {
812 return "main: internal error: unexpected unlinked token";
813 }
814 TRY(handle_unicode_code_point(vbd));
815 query.incremental_match_code_point(vbd);
816 return nullptr;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100817
Nigel Tao85fba7f2020-02-29 16:28:06 +1100818 case WUFFS_BASE__TOKEN__VBC__LITERAL:
Nigel Tao2cf76db2020-02-27 22:42:01 +1100819 case WUFFS_BASE__TOKEN__VBC__NUMBER:
820 TRY(write_dst(src.data.ptr + curr_token_end_src_index - len, len));
821 goto after_value;
Nigel Tao1b073492020-02-16 22:11:36 +1100822 }
823
824 // Return an error if we didn't match the (vbc, vbd) pair.
Nigel Tao2cf76db2020-02-27 22:42:01 +1100825 return "main: internal error: unexpected token";
826 } while (0);
Nigel Tao1b073492020-02-16 22:11:36 +1100827
Nigel Tao2cf76db2020-02-27 22:42:01 +1100828 // Book-keeping after completing a value (whether a container value or a
829 // simple value). Empty parent containers are no longer empty. If the parent
830 // container is a "{...}" object, toggle between keys and values.
831after_value:
832 if (depth == 0) {
833 return eod;
834 }
835 switch (ctx) {
836 case context::in_list_after_bracket:
837 ctx = context::in_list_after_value;
838 break;
839 case context::in_dict_after_brace:
840 ctx = context::in_dict_after_key;
841 break;
842 case context::in_dict_after_key:
843 ctx = context::in_dict_after_value;
844 break;
845 case context::in_dict_after_value:
846 ctx = context::in_dict_after_key;
847 break;
848 }
849 return nullptr;
850}
851
852const char* //
853main1(int argc, char** argv) {
854 TRY(initialize_globals(argc, argv));
855
856 while (true) {
857 wuffs_base__status status = dec.decode_tokens(&tok, &src);
858
859 while (tok.meta.ri < tok.meta.wi) {
860 wuffs_base__token t = tok.data.ptr[tok.meta.ri++];
861 uint64_t n = t.length();
862 if ((src.meta.ri - curr_token_end_src_index) < n) {
863 return "main: internal error: inconsistent src indexes";
864 }
865 curr_token_end_src_index += n;
866
867 if (t.value() == 0) {
868 continue;
869 }
870
871 const char* z = handle_token(t);
872 if (z == nullptr) {
873 continue;
874 } else if (z == eod) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100875 goto end_of_data;
Nigel Tao2cf76db2020-02-27 22:42:01 +1100876 }
877 return z;
Nigel Tao1b073492020-02-16 22:11:36 +1100878 }
Nigel Tao2cf76db2020-02-27 22:42:01 +1100879
880 if (status.repr == nullptr) {
Nigel Tao0cd2f982020-03-03 23:03:02 +1100881 return "main: internal error: unexpected end of token stream";
Nigel Tao2cf76db2020-02-27 22:42:01 +1100882 } else if (status.repr == wuffs_base__suspension__short_read) {
883 if (curr_token_end_src_index != src.meta.ri) {
884 return "main: internal error: inconsistent src indexes";
885 }
886 TRY(read_src());
887 curr_token_end_src_index = src.meta.ri;
888 } else if (status.repr == wuffs_base__suspension__short_write) {
889 tok.compact();
890 } else {
891 return status.message();
Nigel Tao1b073492020-02-16 22:11:36 +1100892 }
893 }
Nigel Tao0cd2f982020-03-03 23:03:02 +1100894end_of_data:
895
896 // With a non-empty query, don't try to consume trailing whitespace or
897 // confirm that we've processed all the tokens.
898 if (flags.query_c_string && *flags.query_c_string) {
899 return nullptr;
900 }
Nigel Tao6b161af2020-02-24 11:01:48 +1100901
Nigel Tao6b161af2020-02-24 11:01:48 +1100902 // Consume an optional whitespace trailer. This isn't part of the JSON spec,
903 // but it works better with line oriented Unix tools (such as "echo 123 |
904 // jsonptr" where it's "echo", not "echo -n") or hand-edited JSON files which
905 // can accidentally contain trailing whitespace.
906 //
907 // A whitespace trailer is zero or more ' ' and then zero or one '\n'.
908 while (true) {
909 if (src.meta.ri < src.meta.wi) {
910 uint8_t c = src.data.ptr[src.meta.ri];
911 if (c == ' ') {
912 src.meta.ri++;
913 continue;
914 } else if (c == '\n') {
915 src.meta.ri++;
916 break;
917 }
918 // The "exhausted the input" check below will fail.
919 break;
920 } else if (src.meta.closed) {
921 break;
922 }
923 TRY(read_src());
924 }
925
926 // Check that we've exhausted the input.
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100927 if ((src.meta.ri == src.meta.wi) && !src.meta.closed) {
928 TRY(read_src());
929 }
Nigel Tao6b161af2020-02-24 11:01:48 +1100930 if ((src.meta.ri < src.meta.wi) || !src.meta.closed) {
931 return "main: valid JSON followed by further (unexpected) data";
932 }
933
934 // Check that we've used all of the decoded tokens, other than trailing
935 // filler tokens. For example, a bare `"foo"` string is valid JSON, but even
936 // without a trailing '\n', the Wuffs JSON parser emits a filler token for
937 // the final '\"'.
938 for (; tok.meta.ri < tok.meta.wi; tok.meta.ri++) {
939 if (tok.data.ptr[tok.meta.ri].value_base_category() !=
940 WUFFS_BASE__TOKEN__VBC__FILLER) {
941 return "main: internal error: decoded OK but unprocessed tokens remain";
942 }
943 }
944
945 return nullptr;
Nigel Tao1b073492020-02-16 22:11:36 +1100946}
947
Nigel Tao2914bae2020-02-26 09:40:30 +1100948int //
949compute_exit_code(const char* status_msg) {
Nigel Tao9cc2c252020-02-23 17:05:49 +1100950 if (!status_msg) {
951 return 0;
952 }
953 size_t n = strnlen(status_msg, 2047);
954 if (n >= 2047) {
955 status_msg = "main: internal error: error message is too long";
956 n = strnlen(status_msg, 2047);
957 }
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100958 const int stderr_fd = 2;
959 ignore_return_value(write(stderr_fd, status_msg, n));
960 ignore_return_value(write(stderr_fd, "\n", 1));
Nigel Tao9cc2c252020-02-23 17:05:49 +1100961 // Return an exit code of 1 for regular (forseen) errors, e.g. badly
962 // formatted or unsupported input.
963 //
964 // Return an exit code of 2 for internal (exceptional) errors, e.g. defensive
965 // run-time checks found that an internal invariant did not hold.
966 //
967 // Automated testing, including badly formatted inputs, can therefore
968 // discriminate between expected failure (exit code 1) and unexpected failure
969 // (other non-zero exit codes). Specifically, exit code 2 for internal
970 // invariant violation, exit code 139 (which is 128 + SIGSEGV on x86_64
971 // linux) for a segmentation fault (e.g. null pointer dereference).
972 return strstr(status_msg, "internal error:") ? 2 : 1;
973}
974
Nigel Tao2914bae2020-02-26 09:40:30 +1100975int //
976main(int argc, char** argv) {
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100977#if defined(WUFFS_EXAMPLE_USE_SECCOMP)
978 prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
979 sandboxed = true;
980#endif
981
Nigel Tao0cd2f982020-03-03 23:03:02 +1100982 const char* z = main1(argc, argv);
983 if (wrote_to_dst) {
984 const char* z1 = write_dst("\n", 1);
985 const char* z2 = flush_dst();
986 z = z ? z : (z1 ? z1 : z2);
987 }
988 int exit_code = compute_exit_code(z);
Nigel Taofe0cbbd2020-03-05 22:01:30 +1100989
990#if defined(WUFFS_EXAMPLE_USE_SECCOMP)
991 // Call SYS_exit explicitly, instead of calling SYS_exit_group implicitly by
992 // either calling _exit or returning from main. SECCOMP_MODE_STRICT allows
993 // only SYS_exit.
994 syscall(SYS_exit, exit_code);
995#endif
Nigel Tao9cc2c252020-02-23 17:05:49 +1100996 return exit_code;
Nigel Tao1b073492020-02-16 22:11:36 +1100997}