blob: 0c3a999aaa17a101b4e5a4b104e9a45b64a38ee3 [file] [log] [blame]
Nigel Taod0b16cb2020-03-14 10:15:54 +11001// Copyright 2020 The Wuffs Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// ----------------
16
17/*
18jsonfindptrs reads UTF-8 JSON from stdin and writes every node's JSON Pointer
19(RFC 6901) to stdout.
20
Nigel Taod60815c2020-03-26 14:32:35 +110021See the "const char* g_usage" string below for details.
Nigel Taod0b16cb2020-03-14 10:15:54 +110022
23----
24
25This program uses Wuffs' JSON decoder at a relatively high level, building
26in-memory representations of JSON 'things' (e.g. numbers, strings, objects).
27After the entire input has been converted, walking the tree prints the output
28(in sorted order). The core conversion mechanism is to call JsonThing::parse,
29which consumes a variable number of tokens (the output of Wuffs' JSON decoder).
30JsonThing::parse can call itself recursively, as JSON values can nest.
31
32This approach is centered around JSON things. Each JSON thing comprises one or
33more JSON tokens.
34
35An alternative, lower-level approach is in the sibling example/jsonptr program.
36Neither approach is better or worse per se, but when studying this program, be
37aware that there are multiple ways to use Wuffs' JSON decoder.
38
39The two programs, jsonfindptrs and jsonptr, also demonstrate different
40trade-offs with regard to JSON object duplicate keys. The JSON spec permits
41different implementations to allow or reject duplicate keys. It is not always
42clear which approach is safer. Rejecting them is certainly unambiguous, and
43security bugs can lurk in ambiguous corners of a file format, if two different
44implementations both silently accept a file but differ on how to interpret it.
45On the other hand, in the worst case, detecting duplicate keys requires O(N)
46memory, where N is the size of the (potentially untrusted) input.
47
48This program (jsonfindptrs) rejects duplicate keys.
49
50----
51
52This example program differs from most other example Wuffs programs in that it
53is written in C++, not C.
54
55$CXX jsonfindptrs.cc && ./a.out < ../../test/data/github-tags.json; rm -f a.out
56
57for a C++ compiler $CXX, such as clang++ or g++.
58*/
59
Nigel Tao721190a2020-04-03 22:25:21 +110060#if defined(__cplusplus) && (__cplusplus < 201103L)
61#error "This C++ program requires -std=c++11 or later"
62#endif
63
Nigel Taod0b16cb2020-03-14 10:15:54 +110064#include <errno.h>
65#include <fcntl.h>
66#include <unistd.h>
Nigel Tao6b7ce302020-07-07 16:19:46 +100067
Nigel Taod0b16cb2020-03-14 10:15:54 +110068#include <iostream>
69#include <map>
70#include <string>
71#include <vector>
72
73// Wuffs ships as a "single file C library" or "header file library" as per
74// https://github.com/nothings/stb/blob/master/docs/stb_howto.txt
75//
76// To use that single file as a "foo.c"-like implementation, instead of a
77// "foo.h"-like header, #define WUFFS_IMPLEMENTATION before #include'ing or
78// compiling it.
79#define WUFFS_IMPLEMENTATION
80
// Defining the WUFFS_CONFIG__MODULE* macros is optional, but doing so lets
// users of release/c/etc.c whitelist which parts of Wuffs to build. That file
// contains the entire Wuffs standard library, implementing a variety of codecs
// and file formats. Without these macro definitions, an optimizing compiler or
// linker may very well discard Wuffs code for unused codecs, but listing the
// Wuffs modules we use makes that process explicit. Preprocessing means that
// such code simply isn't compiled.
88#define WUFFS_CONFIG__MODULES
89#define WUFFS_CONFIG__MODULE__BASE
90#define WUFFS_CONFIG__MODULE__JSON
91
92// If building this program in an environment that doesn't easily accommodate
93// relative includes, you can use the script/inline-c-relative-includes.go
94// program to generate a stand-alone C++ file.
95#include "../../release/c/wuffs-unsupported-snapshot.c"
96
// TRY evaluates error_msg (any expression convertible to std::string) exactly
// once. If the resulting string is non-empty, it is returned from the
// enclosing function. An empty string means success and execution continues
// after the macro.
//
// The argument is parenthesized and the temporary has a deliberately unusual
// name, so that the macro behaves correctly even when the error_msg expression
// mentions a caller variable with a short name such as "z".
#define TRY(error_msg)                         \
  do {                                         \
    std::string try_status_msg_ = (error_msg); \
    if (!try_status_msg_.empty()) {            \
      return try_status_msg_;                  \
    }                                          \
  } while (false)
104
// g_usage is the help text. It is returned as the status message whenever the
// command line arguments are malformed.
static const char* g_usage =
    "Usage: jsonfindptrs -flags input.json\n\n"
    "Flags:\n"
    " -d=NUM -max-output-depth=NUM\n"
    " -input-json-extra-comma\n"
    " -strict-json-pointer-syntax\n\n"
    "The input.json filename is optional. If absent, it reads from stdin.\n\n"
    "----\n\n"
    "jsonfindptrs reads UTF-8 JSON from stdin and writes every node's JSON\n"
    "Pointer (RFC 6901) to stdout.\n\n"
    "For example, given RFC 6901 section 5's sample input\n"
    "(https://tools.ietf.org/rfc/rfc6901.txt), this command:\n"
    " jsonfindptrs rfc-6901-json-pointer.json\n"
    "will print:\n"
    " \n"
    " /\n"
    " / \n"
    " /a~1b\n"
    " /c%d\n"
    " /e^f\n"
    " /foo\n"
    " /foo/0\n"
    " /foo/1\n"
    " /g|h\n"
    " /i\\j\n"
    " /k\"l\n"
    " /m~0n\n\n"
    "The first three lines are (1) a 0-byte \"\", (2) a 1-byte \"/\" and (3)\n"
    "a 2-byte \"/ \". Unlike a file system, the \"/\" JSON Pointer does not\n"
    "identify the root. Instead, \"\" is the root and \"/\" is the child (the\n"
    "value in a key-value pair) of the root whose key is the empty string.\n"
    "Similarly, \"/xyz\" and \"/xyz/\" are two different nodes.\n\n"
    "----\n\n"
    "The JSON specification (https://json.org/) permits implementations that\n"
    "allow duplicate keys, but this one does not. Conversely, it prints keys\n"
    "in sorted order, but the overall output is not necessarily sorted\n"
    "lexicographically. For example, \"/a/9\" would come before \"/a/10\",\n"
    "and \"/b/c\", a child of \"/b\", would come before \"/b+\".\n\n"
    "This JSON implementation also rejects integer values outside ±M, where\n"
    "M is ((1<<53)-1), also known as JavaScript's Number.MAX_SAFE_INTEGER.\n\n"
    "The -input-json-extra-comma flag allows input like \"[1,2,]\", with a\n"
    "comma after the final element of a JSON list or dictionary.\n\n"
    "----\n\n"
    "The -strict-json-pointer-syntax flag restricts the output lines to\n"
    "exactly RFC 6901, with only two escape sequences: \"~0\" and \"~1\" for\n"
    "\"~\" and \"/\". Without this flag, this program also lets \"~n\" and\n"
    "\"~r\" escape the New Line and Carriage Return ASCII control characters,\n"
    "which can work better with line oriented Unix tools that assume exactly\n"
    "one value (i.e. one JSON Pointer string) per line. With this flag, it\n"
    "fails if the input JSON's keys contain \"\\u000A\" or \"\\u000D\".\n\n"
    "----\n\n"
    "The JSON specification permits implementations to set their own maximum\n"
    "input depth. This JSON implementation sets it to 1024.\n\n"
    "The -d=NUM or -max-output-depth=NUM flag gives the maximum (inclusive)\n"
    "output depth. JSON containers ([] arrays and {} objects) can hold other\n"
    "containers. A bare -d or -max-output-depth is equivalent to -d=1,\n"
    "analogous to the Unix ls command. The flag's absence is equivalent to an\n"
    "unlimited output depth, analogous to the Unix find command (and hence\n"
    "the name of this program: jsonfindptrs).";
179
180// ----
181
// g_flags holds the command line configuration. Every field starts out
// zero-valued; parse_flags overwrites them.
struct {
  // The arguments that remain after the leading flags have been consumed.
  int remaining_argc = 0;
  char** remaining_argv = nullptr;

  // Set by the -input-json-extra-comma flag.
  bool input_json_extra_comma = false;
  // Set by the -d / -max-output-depth flag.
  uint32_t max_output_depth = 0;
  // Set by the -strict-json-pointer-syntax flag.
  bool strict_json_pointer_syntax = false;
} g_flags;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100190
191std::string //
192parse_flags(int argc, char** argv) {
Nigel Taod60815c2020-03-26 14:32:35 +1100193 g_flags.max_output_depth = 0xFFFFFFFF;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100194
195 int c = (argc > 0) ? 1 : 0; // Skip argv[0], the program name.
196 for (; c < argc; c++) {
197 char* arg = argv[c];
198 if (*arg++ != '-') {
199 break;
200 }
201
202 // A double-dash "--foo" is equivalent to a single-dash "-foo". As special
203 // cases, a bare "-" is not a flag (some programs may interpret it as
204 // stdin) and a bare "--" means to stop parsing flags.
205 if (*arg == '\x00') {
206 break;
207 } else if (*arg == '-') {
208 arg++;
209 if (*arg == '\x00') {
210 c++;
211 break;
212 }
213 }
214
Nigel Tao94440cf2020-04-02 22:28:24 +1100215 if (!strcmp(arg, "d") || !strcmp(arg, "max-output-depth")) {
Nigel Taod60815c2020-03-26 14:32:35 +1100216 g_flags.max_output_depth = 1;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100217 continue;
Nigel Tao94440cf2020-04-02 22:28:24 +1100218 } else if (!strncmp(arg, "d=", 2) ||
Nigel Taod0b16cb2020-03-14 10:15:54 +1100219 !strncmp(arg, "max-output-depth=", 16)) {
220 while (*arg++ != '=') {
221 }
222 wuffs_base__result_u64 u = wuffs_base__parse_number_u64(
Nigel Tao6b7ce302020-07-07 16:19:46 +1000223 wuffs_base__make_slice_u8((uint8_t*)arg, strlen(arg)),
224 WUFFS_BASE__PARSE_NUMBER_XXX__DEFAULT_OPTIONS);
Nigel Taod0b16cb2020-03-14 10:15:54 +1100225 if (wuffs_base__status__is_ok(&u.status) && (u.value <= 0xFFFFFFFF)) {
Nigel Taod60815c2020-03-26 14:32:35 +1100226 g_flags.max_output_depth = (uint32_t)(u.value);
Nigel Taod0b16cb2020-03-14 10:15:54 +1100227 continue;
228 }
Nigel Taod60815c2020-03-26 14:32:35 +1100229 return g_usage;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100230 }
Nigel Taoc766bb72020-07-09 12:59:32 +1000231 if (!strcmp(arg, "input-json-extra-comma")) {
232 g_flags.input_json_extra_comma = true;
233 continue;
234 }
Nigel Taoecadf722020-07-13 08:22:34 +1000235 if (!strcmp(arg, "strict-json-pointer-syntax")) {
Nigel Taod60815c2020-03-26 14:32:35 +1100236 g_flags.strict_json_pointer_syntax = true;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100237 continue;
238 }
239
Nigel Taod60815c2020-03-26 14:32:35 +1100240 return g_usage;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100241 }
242
Nigel Taod60815c2020-03-26 14:32:35 +1100243 g_flags.remaining_argc = argc - c;
244 g_flags.remaining_argv = argv + c;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100245 return "";
246}
247
Nigel Tao6b7ce302020-07-07 16:19:46 +1000248// ----
Nigel Taod0b16cb2020-03-14 10:15:54 +1100249
// WORK_BUFFER_ARRAY_SIZE is the length, in bytes, of the work buffer that is
// passed to the Wuffs JSON decoder.
#define WORK_BUFFER_ARRAY_SIZE \
  WUFFS_JSON__DECODER_WORKBUF_LEN_MAX_INCL_WORST_CASE

// SRC_BUFFER_ARRAY_SIZE and TOKEN_BUFFER_ARRAY_SIZE are the lengths of the
// source (byte) and token buffers. Both can be overridden at compile time by
// defining the macro before this point.
#if !defined(SRC_BUFFER_ARRAY_SIZE)
#define SRC_BUFFER_ARRAY_SIZE (4 * 1024)
#endif
#if !defined(TOKEN_BUFFER_ARRAY_SIZE)
#define TOKEN_BUFFER_ARRAY_SIZE (1 * 1024)
#endif
259
260class TokenStream {
261 public:
262 struct Result {
263 std::string status_msg;
264 wuffs_base__token token;
265 // src_data is a sub-slice of m_src (a slice is a pointer-length pair).
266 // Calling TokenStream::peek or TokenStream::next may change the backing
267 // array's contents, so handling a TokenStream::Result may require copying
268 // this src_data slice's contents.
269 wuffs_base__slice_u8 src_data;
270
271 Result(std::string s)
272 : status_msg(s),
273 token(wuffs_base__make_token(0)),
274 src_data(wuffs_base__empty_slice_u8()) {}
275
276 Result(std::string s, wuffs_base__token t, wuffs_base__slice_u8 d)
277 : status_msg(s), token(t), src_data(d) {}
278 };
279
280 TokenStream(int input_file_descriptor)
281 : m_status(wuffs_base__make_status(nullptr)),
282 m_src(wuffs_base__make_io_buffer(
283 wuffs_base__make_slice_u8(m_src_array, SRC_BUFFER_ARRAY_SIZE),
284 wuffs_base__empty_io_buffer_meta())),
285 m_tok(wuffs_base__make_token_buffer(
286 wuffs_base__make_slice_token(m_tok_array, TOKEN_BUFFER_ARRAY_SIZE),
287 wuffs_base__empty_token_buffer_meta())),
288 m_input_file_descriptor(input_file_descriptor),
289 m_curr_token_end_src_index(0) {
290 m_status =
291 m_dec.initialize(sizeof__wuffs_json__decoder(), WUFFS_VERSION, 0);
Nigel Tao502c8ef2020-03-21 21:42:30 +1100292
Nigel Taoc766bb72020-07-09 12:59:32 +1000293 if (m_status.is_ok()) {
294 // Uncomment this line to enable the WUFFS_JSON__QUIRK_ALLOW_BACKSLASH_X
295 // option, discussed in a separate comment.
296 // m_dec.set_quirk_enabled(WUFFS_JSON__QUIRK_ALLOW_BACKSLASH_X, true);
297
298 if (g_flags.input_json_extra_comma) {
299 m_dec.set_quirk_enabled(WUFFS_JSON__QUIRK_ALLOW_EXTRA_COMMA, true);
300 }
301 }
Nigel Taod0b16cb2020-03-14 10:15:54 +1100302 }
303
304 Result peek() { return peek_or_next(false); }
305 Result next() { return peek_or_next(true); }
306
307 private:
308 Result peek_or_next(bool next) {
309 while (m_tok.meta.ri >= m_tok.meta.wi) {
310 if (m_status.repr == nullptr) {
311 // No-op.
312 } else if (m_status.repr == wuffs_base__suspension__short_read) {
313 if (m_curr_token_end_src_index != m_src.meta.ri) {
314 return Result(
315 "TokenStream: internal error: inconsistent src indexes");
316 }
317 const char* z = read_src();
318 m_curr_token_end_src_index = m_src.meta.ri;
319 if (z) {
320 return Result(z);
321 }
322 } else if (m_status.repr == wuffs_base__suspension__short_write) {
323 m_tok.compact();
324 } else {
325 return Result(m_status.message());
326 }
327
Nigel Taof3146c22020-03-26 08:47:42 +1100328 m_status =
329 m_dec.decode_tokens(&m_tok, &m_src,
330 wuffs_base__make_slice_u8(
331 m_work_buffer_array, WORK_BUFFER_ARRAY_SIZE));
Nigel Taod0b16cb2020-03-14 10:15:54 +1100332 }
333
334 wuffs_base__token t = m_tok.data.ptr[m_tok.meta.ri];
335 size_t i = m_curr_token_end_src_index;
336 uint64_t n = t.length();
337 if ((m_src.meta.ri < i) || ((m_src.meta.ri - i) < n)) {
338 return Result("TokenStream: internal error: inconsistent src indexes");
339 }
340 if (next) {
341 m_tok.meta.ri++;
342 m_curr_token_end_src_index += n;
343 }
344 return Result("", t, wuffs_base__make_slice_u8(m_src.data.ptr + i, n));
345 }
346
347 const char* //
348 read_src() {
349 if (m_src.meta.closed) {
350 return "main: internal error: read requested on a closed source";
351 }
352 m_src.compact();
353 if (m_src.meta.wi >= m_src.data.len) {
354 return "main: src buffer is full";
355 }
356 while (true) {
357 ssize_t n = read(m_input_file_descriptor, m_src.data.ptr + m_src.meta.wi,
358 m_src.data.len - m_src.meta.wi);
359 if (n >= 0) {
360 m_src.meta.wi += n;
361 m_src.meta.closed = n == 0;
362 break;
363 } else if (errno != EINTR) {
364 return strerror(errno);
365 }
366 }
367 return nullptr;
368 }
369
370 wuffs_base__status m_status;
371 wuffs_base__io_buffer m_src;
372 wuffs_base__token_buffer m_tok;
373 int m_input_file_descriptor;
374 // m_curr_token_end_src_index is the m_src.data.ptr index of the end of the
375 // current token. An invariant is that (m_curr_token_end_src_index <=
376 // m_src.meta.ri).
377 size_t m_curr_token_end_src_index;
378
379 wuffs_base__token m_tok_array[TOKEN_BUFFER_ARRAY_SIZE];
380 uint8_t m_src_array[SRC_BUFFER_ARRAY_SIZE];
Nigel Taof3146c22020-03-26 08:47:42 +1100381#if WORK_BUFFER_ARRAY_SIZE > 0
382 uint8_t m_work_buffer_array[WORK_BUFFER_ARRAY_SIZE];
383#else
384 // Not all C/C++ compilers support 0-length arrays.
385 uint8_t m_work_buffer_array[1];
386#endif
Nigel Taod0b16cb2020-03-14 10:15:54 +1100387 wuffs_json__decoder m_dec;
388};
389
390// ----
391
392class JsonThing {
393 public:
394 struct Result;
395
396 using Vector = std::vector<JsonThing>;
397
398 // We use a std::map in this example program to avoid dependencies outside of
399 // the C++ standard library. If you're copy/pasting this JsonThing code,
400 // consider a more efficient data structure such as an absl::btree_map.
401 //
402 // See CppCon 2014: Chandler Carruth "Efficiency with Algorithms, Performance
403 // with Data Structures" at https://www.youtube.com/watch?v=fHNmRkzxHWs
404 using Map = std::map<std::string, JsonThing>;
405
406 enum class Kind {
407 Null,
408 Bool,
409 Int64,
410 Float64,
411 String,
412 Array,
413 Object,
414 } kind = Kind::Null;
415
416 struct Value {
417 bool b = false;
418 int64_t i = 0;
419 double f = 0;
420 std::string s;
421 Vector a;
422 Map o;
423 } value;
424
425 static JsonThing::Result parse(TokenStream& ts);
426
427 private:
428 static JsonThing::Result parse_array(TokenStream& ts);
429 static JsonThing::Result parse_literal(TokenStream::Result tsr);
430 static JsonThing::Result parse_number(TokenStream::Result tsr);
431 static JsonThing::Result parse_object(TokenStream& ts);
432 static JsonThing::Result parse_string(TokenStream& ts,
433 TokenStream::Result tsr);
434};
435
436struct JsonThing::Result {
437 std::string status_msg;
438 JsonThing thing;
439
440 Result(std::string s) : status_msg(s), thing(JsonThing()) {}
441
442 Result(std::string s, JsonThing t) : status_msg(s), thing(t) {}
443};
444
445JsonThing::Result //
446JsonThing::parse(TokenStream& ts) {
447 while (true) {
448 TokenStream::Result tsr = ts.next();
449 if (!tsr.status_msg.empty()) {
450 return Result(std::move(tsr.status_msg));
451 }
452
Nigel Tao462f8662020-04-01 23:01:51 +1100453 int64_t vbc = tsr.token.value_base_category();
Nigel Taod0b16cb2020-03-14 10:15:54 +1100454 uint64_t vbd = tsr.token.value_base_detail();
455 switch (vbc) {
456 case WUFFS_BASE__TOKEN__VBC__FILLER:
457 continue;
458 case WUFFS_BASE__TOKEN__VBC__STRUCTURE:
459 if (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__PUSH) {
460 if (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST) {
461 return parse_array(ts);
462 } else if (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_DICT) {
463 return parse_object(ts);
464 }
465 }
466 break;
467 case WUFFS_BASE__TOKEN__VBC__STRING:
468 return parse_string(ts, tsr);
469 case WUFFS_BASE__TOKEN__VBC__LITERAL:
470 return parse_literal(tsr);
Nigel Tao345f1aa2020-04-05 22:10:37 +1000471 case WUFFS_BASE__TOKEN__VBC__NUMBER:
Nigel Taod0b16cb2020-03-14 10:15:54 +1100472 return parse_number(tsr);
Nigel Taod0b16cb2020-03-14 10:15:54 +1100473 }
474
475 return Result("main: internal error: unexpected token");
476 }
477}
478
479JsonThing::Result //
480JsonThing::parse_array(TokenStream& ts) {
481 JsonThing jt;
482 jt.kind = Kind::Array;
483 while (true) {
484 TokenStream::Result tsr = ts.peek();
485 if (!tsr.status_msg.empty()) {
486 return Result(std::move(tsr.status_msg));
487 }
Nigel Tao462f8662020-04-01 23:01:51 +1100488 int64_t vbc = tsr.token.value_base_category();
Nigel Taod0b16cb2020-03-14 10:15:54 +1100489 uint64_t vbd = tsr.token.value_base_detail();
490 if (vbc == WUFFS_BASE__TOKEN__VBC__FILLER) {
491 ts.next();
492 continue;
493 } else if ((vbc == WUFFS_BASE__TOKEN__VBC__STRUCTURE) &&
494 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__POP)) {
495 ts.next();
496 break;
497 }
498
499 JsonThing::Result jtr = JsonThing::parse(ts);
500 if (!jtr.status_msg.empty()) {
501 return Result(std::move(jtr.status_msg));
502 }
503 jt.value.a.push_back(std::move(jtr.thing));
504 }
505 return Result("", jt);
506}
507
508JsonThing::Result //
509JsonThing::parse_literal(TokenStream::Result tsr) {
510 uint64_t vbd = tsr.token.value_base_detail();
511 if (vbd & WUFFS_BASE__TOKEN__VBD__LITERAL__NULL) {
512 JsonThing jt;
513 jt.kind = Kind::Null;
514 return Result("", jt);
515 } else if (vbd & WUFFS_BASE__TOKEN__VBD__LITERAL__FALSE) {
516 JsonThing jt;
517 jt.kind = Kind::Bool;
518 jt.value.b = false;
519 return Result("", jt);
520 } else if (vbd & WUFFS_BASE__TOKEN__VBD__LITERAL__TRUE) {
521 JsonThing jt;
522 jt.kind = Kind::Bool;
523 jt.value.b = true;
524 return Result("", jt);
525 }
526 return Result("main: internal error: unexpected token");
527}
528
529JsonThing::Result //
530JsonThing::parse_number(TokenStream::Result tsr) {
531 // Parsing the number from its string representation (converting from "123"
532 // to 123) isn't necessary for the jsonfindptrs program, but if you're
533 // copy/pasting this JsonThing code, here's how to do it.
534 uint64_t vbd = tsr.token.value_base_detail();
535 if (vbd & WUFFS_BASE__TOKEN__VBD__NUMBER__FORMAT_TEXT) {
536 if (vbd & WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_INTEGER_SIGNED) {
537 static constexpr int64_t m = 0x001FFFFFFFFFFFFF; // ((1<<53) - 1).
Nigel Tao6b7ce302020-07-07 16:19:46 +1000538 wuffs_base__result_i64 r = wuffs_base__parse_number_i64(
539 tsr.src_data, WUFFS_BASE__PARSE_NUMBER_XXX__DEFAULT_OPTIONS);
Nigel Taod0b16cb2020-03-14 10:15:54 +1100540 if (!r.status.is_ok()) {
541 return Result(r.status.message());
542 } else if ((r.value < -m) || (+m < r.value)) {
543 return Result(wuffs_base__error__out_of_bounds);
544 }
545 JsonThing jt;
546 jt.kind = Kind::Int64;
547 jt.value.i = r.value;
548 return Result("", jt);
549 } else if (vbd & WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_FLOATING_POINT) {
Nigel Tao6b7ce302020-07-07 16:19:46 +1000550 wuffs_base__result_f64 r = wuffs_base__parse_number_f64(
551 tsr.src_data, WUFFS_BASE__PARSE_NUMBER_XXX__DEFAULT_OPTIONS);
Nigel Taod0b16cb2020-03-14 10:15:54 +1100552 if (!r.status.is_ok()) {
553 return Result(r.status.message());
554 }
555 JsonThing jt;
556 jt.kind = Kind::Float64;
557 jt.value.f = r.value;
558 return Result("", jt);
559 }
560 }
561 return Result("main: internal error: unexpected number");
562}
563
564JsonThing::Result //
565JsonThing::parse_object(TokenStream& ts) {
566 JsonThing jt;
567 jt.kind = Kind::Object;
568
569 std::string key;
570 bool have_key = false;
571
572 while (true) {
573 TokenStream::Result tsr = ts.peek();
574 if (!tsr.status_msg.empty()) {
575 return Result(std::move(tsr.status_msg));
576 }
Nigel Tao462f8662020-04-01 23:01:51 +1100577 int64_t vbc = tsr.token.value_base_category();
Nigel Taod0b16cb2020-03-14 10:15:54 +1100578 uint64_t vbd = tsr.token.value_base_detail();
579 if (vbc == WUFFS_BASE__TOKEN__VBC__FILLER) {
580 ts.next();
581 continue;
582 } else if ((vbc == WUFFS_BASE__TOKEN__VBC__STRUCTURE) &&
583 (vbd & WUFFS_BASE__TOKEN__VBD__STRUCTURE__POP)) {
584 ts.next();
585 break;
586 }
587
588 JsonThing::Result jtr = JsonThing::parse(ts);
589 if (!jtr.status_msg.empty()) {
590 return Result(std::move(jtr.status_msg));
591 }
592
593 if (have_key) {
594 have_key = false;
595 auto iter = jt.value.o.find(key);
596 if (iter == jt.value.o.end()) {
597 jt.value.o.insert(
598 iter, Map::value_type(std::move(key), std::move(jtr.thing)));
599 } else {
600 return Result("main: duplicate key: " + key);
601 }
602 } else if (jtr.thing.kind == Kind::String) {
603 have_key = true;
604 key = std::move(jtr.thing.value.s);
605 } else {
606 return Result("main: internal error: unexpected non-string key");
607 }
608 }
Nigel Taob5d6f8d2020-04-06 20:57:40 +1000609 if (have_key) {
610 return Result("main: internal error: unpaired key");
611 }
Nigel Taod0b16cb2020-03-14 10:15:54 +1100612 return Result("", jt);
613}
614
615JsonThing::Result //
616JsonThing::parse_string(TokenStream& ts, TokenStream::Result tsr) {
617 JsonThing jt;
618 jt.kind = Kind::String;
619 while (true) {
Nigel Tao462f8662020-04-01 23:01:51 +1100620 int64_t vbc = tsr.token.value_base_category();
Nigel Taod0b16cb2020-03-14 10:15:54 +1100621 uint64_t vbd = tsr.token.value_base_detail();
622
623 switch (vbc) {
624 case WUFFS_BASE__TOKEN__VBC__STRING: {
625 if (vbd & WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_0_DST_1_SRC_DROP) {
626 // No-op.
Nigel Tao502c8ef2020-03-21 21:42:30 +1100627
Nigel Taod0b16cb2020-03-14 10:15:54 +1100628 } else if (vbd &
629 WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY) {
630 const char* ptr = // Convert from (uint8_t*).
631 static_cast<const char*>(static_cast<void*>(tsr.src_data.ptr));
632 jt.value.s.append(ptr, tsr.src_data.len);
Nigel Tao502c8ef2020-03-21 21:42:30 +1100633
634 } else if (
635 vbd &
636 WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_4_SRC_BACKSLASH_X) {
637 // We shouldn't get here unless we enable the
638 // WUFFS_JSON__QUIRK_ALLOW_BACKSLASH_X option. The jsonfindptrs
639 // program doesn't enable that by default, but if you're copy/pasting
640 // this JsonThing code and your program does enable that option,
641 // here's how to handle it.
Nigel Taodf3acfb2020-04-08 11:08:01 +1000642 //
643 // As per the quirk documentation, there are two options for how to
644 // interpret a backslash-x: as a byte or as a Unicode code point.
645 // This implementation chooses as a byte.
Nigel Tao502c8ef2020-03-21 21:42:30 +1100646 wuffs_base__slice_u8 encoded = tsr.src_data;
647 if (encoded.len & 3) {
648 return Result(
649 "main: internal error: \\x token length not a multiple of 4",
650 JsonThing());
651 }
652 while (encoded.len) {
653 uint8_t decoded[64];
Nigel Taoe0ca5a42020-07-09 21:09:19 +1000654 const bool src_closed = true;
655 wuffs_base__transform__output o = wuffs_base__base_16__decode4(
656 wuffs_base__make_slice_u8(&decoded[0], 64), encoded, src_closed,
657 WUFFS_BASE__BASE_16__DEFAULT_OPTIONS);
658 if (o.status.is_error()) {
659 return Result(o.status.message(), JsonThing());
660 } else if ((o.num_dst > 64) || (o.num_src > encoded.len)) {
Nigel Tao502c8ef2020-03-21 21:42:30 +1100661 return Result(
662 "main: internal error: inconsistent hexadecimal decoding",
663 JsonThing());
664 }
665 const char* ptr = // Convert from (uint8_t*).
666 static_cast<const char*>(static_cast<void*>(&decoded[0]));
Nigel Taoe0ca5a42020-07-09 21:09:19 +1000667 jt.value.s.append(ptr, o.num_dst);
668 encoded.ptr += o.num_src;
669 encoded.len -= o.num_src;
Nigel Tao502c8ef2020-03-21 21:42:30 +1100670 }
671
Nigel Taod0b16cb2020-03-14 10:15:54 +1100672 } else {
673 return Result(
674 "main: internal error: unexpected string-token conversion",
675 JsonThing());
676 }
677 break;
678 }
679
680 case WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT: {
681 uint8_t u[WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL];
682 size_t n = wuffs_base__utf_8__encode(
683 wuffs_base__make_slice_u8(&u[0],
684 WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL),
685 vbd);
686 const char* ptr = // Convert from (uint8_t*).
687 static_cast<const char*>(static_cast<void*>(&u[0]));
688 jt.value.s.append(ptr, n);
689 break;
690 }
691
692 default:
693 return Result("main: internal error: unexpected token");
694 }
695
Nigel Tao496e88b2020-04-09 22:10:08 +1000696 if (!tsr.token.continued()) {
Nigel Taod0b16cb2020-03-14 10:15:54 +1100697 break;
698 }
699 tsr = ts.next();
700 if (!tsr.status_msg.empty()) {
701 return Result(std::move(tsr.status_msg));
702 }
703 }
704 return Result("", jt);
705}
706
707// ----
708
709std::string //
710escape(std::string s) {
711 for (char& c : s) {
712 if ((c == '~') || (c == '/') || (c == '\n') || (c == '\r')) {
713 goto escape_needed;
714 }
715 }
716 return s;
717
718escape_needed:
719 std::string e;
720 e.reserve(8 + s.length());
721 for (char& c : s) {
722 switch (c) {
723 case '~':
724 e += "~0";
725 break;
726 case '/':
727 e += "~1";
728 break;
729 case '\n':
Nigel Taod60815c2020-03-26 14:32:35 +1100730 if (g_flags.strict_json_pointer_syntax) {
Nigel Taod0b16cb2020-03-14 10:15:54 +1100731 return "";
732 }
733 e += "~n";
734 break;
735 case '\r':
Nigel Taod60815c2020-03-26 14:32:35 +1100736 if (g_flags.strict_json_pointer_syntax) {
Nigel Taod0b16cb2020-03-14 10:15:54 +1100737 return "";
738 }
739 e += "~r";
740 break;
741 default:
742 e += c;
743 break;
744 }
745 }
746 return e;
747}
748
749std::string //
750print_json_pointers(JsonThing& jt, std::string s, uint32_t depth) {
751 std::cout << s << std::endl;
Nigel Taod60815c2020-03-26 14:32:35 +1100752 if (depth++ >= g_flags.max_output_depth) {
Nigel Taod0b16cb2020-03-14 10:15:54 +1100753 return "";
754 }
755
756 switch (jt.kind) {
757 case JsonThing::Kind::Array:
758 s += "/";
759 for (size_t i = 0; i < jt.value.a.size(); i++) {
760 TRY(print_json_pointers(jt.value.a[i], s + std::to_string(i), depth));
761 }
762 break;
763 case JsonThing::Kind::Object:
764 s += "/";
765 for (auto& kv : jt.value.o) {
766 std::string e = escape(kv.first);
767 if (e.empty() && !kv.first.empty()) {
768 return "main: unsupported \"\\u000A\" or \"\\u000D\" in object key";
769 }
770 TRY(print_json_pointers(kv.second, s + e, depth));
771 }
772 break;
Nigel Tao18ef5b42020-03-16 10:37:47 +1100773 default:
774 break;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100775 }
776 return "";
777}
778
779std::string //
780main1(int argc, char** argv) {
781 TRY(parse_flags(argc, argv));
782
783 int input_file_descriptor = 0; // A 0 default means stdin.
Nigel Taod60815c2020-03-26 14:32:35 +1100784 if (g_flags.remaining_argc > 1) {
785 return g_usage;
786 } else if (g_flags.remaining_argc == 1) {
787 const char* arg = g_flags.remaining_argv[0];
Nigel Taod0b16cb2020-03-14 10:15:54 +1100788 input_file_descriptor = open(arg, O_RDONLY);
789 if (input_file_descriptor < 0) {
790 return std::string("main: cannot read ") + arg + ": " + strerror(errno);
791 }
792 }
793
794 TokenStream ts(input_file_descriptor);
795 JsonThing::Result jtr = JsonThing::parse(ts);
796 if (!jtr.status_msg.empty()) {
797 return jtr.status_msg;
798 }
799 return print_json_pointers(jtr.thing, "", 0);
800}
801
802// ----
803
// compute_exit_code maps a status message to a process exit code, printing
// any non-empty message to stderr. An empty message means success (0).
int  //
compute_exit_code(std::string status_msg) {
  if (!status_msg.empty()) {
    std::cerr << status_msg << std::endl;
    // Return an exit code of 1 for regular (foreseen) errors, e.g. badly
    // formatted or unsupported input.
    //
    // Return an exit code of 2 for internal (exceptional) errors, e.g.
    // defensive run-time checks found that an internal invariant did not hold.
    //
    // Automated testing, including badly formatted inputs, can therefore
    // discriminate between expected failure (exit code 1) and unexpected
    // failure (other non-zero exit codes). Specifically, exit code 2 for
    // internal invariant violation, exit code 139 (which is 128 + SIGSEGV on
    // x86_64 linux) for a segmentation fault (e.g. null pointer dereference).
    const bool is_internal =
        status_msg.find("internal error:") != std::string::npos;
    return is_internal ? 2 : 1;
  }
  return 0;
}
823
824int //
825main(int argc, char** argv) {
826 std::string z = main1(argc, argv);
827 int exit_code = compute_exit_code(z);
828 return exit_code;
829}