blob: b21ea0a79a1b7a3af3d9a222275addbe5636b233 [file] [log] [blame]
Nigel Taod0b16cb2020-03-14 10:15:54 +11001// Copyright 2020 The Wuffs Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// ----------------
16
17/*
18jsonfindptrs reads UTF-8 JSON from stdin and writes every node's JSON Pointer
19(RFC 6901) to stdout.
20
Nigel Taod60815c2020-03-26 14:32:35 +110021See the "const char* g_usage" string below for details.
Nigel Taod0b16cb2020-03-14 10:15:54 +110022
23----
24
25This program uses Wuffs' JSON decoder at a relatively high level, building
26in-memory representations of JSON 'things' (e.g. numbers, strings, objects).
27After the entire input has been converted, walking the tree prints the output
Nigel Taocf6c5782020-08-03 23:43:45 +100028(in sorted order). The wuffs_aux::DecodeJson library function converts the
29lower level token stream to higher level callbacks. This .cc file deals only
30with those callbacks, not with tokens per se.
Nigel Taod0b16cb2020-03-14 10:15:54 +110031
32This approach is centered around JSON things. Each JSON thing comprises one or
33more JSON tokens.
34
35An alternative, lower-level approach is in the sibling example/jsonptr program.
36Neither approach is better or worse per se, but when studying this program, be
37aware that there are multiple ways to use Wuffs' JSON decoder.
38
39The two programs, jsonfindptrs and jsonptr, also demonstrate different
40trade-offs with regard to JSON object duplicate keys. The JSON spec permits
41different implementations to allow or reject duplicate keys. It is not always
42clear which approach is safer. Rejecting them is certainly unambiguous, and
43security bugs can lurk in ambiguous corners of a file format, if two different
44implementations both silently accept a file but differ on how to interpret it.
45On the other hand, in the worst case, detecting duplicate keys requires O(N)
46memory, where N is the size of the (potentially untrusted) input.
47
48This program (jsonfindptrs) rejects duplicate keys.
49
50----
51
52This example program differs from most other example Wuffs programs in that it
53is written in C++, not C.
54
55$CXX jsonfindptrs.cc && ./a.out < ../../test/data/github-tags.json; rm -f a.out
56
57for a C++ compiler $CXX, such as clang++ or g++.
58*/
59
Nigel Tao721190a2020-04-03 22:25:21 +110060#if defined(__cplusplus) && (__cplusplus < 201103L)
61#error "This C++ program requires -std=c++11 or later"
62#endif
63
Nigel Taocf6c5782020-08-03 23:43:45 +100064#include <stdio.h>
Nigel Tao6b7ce302020-07-07 16:19:46 +100065
Nigel Taod0b16cb2020-03-14 10:15:54 +110066#include <iostream>
67#include <map>
68#include <string>
69#include <vector>
70
71// Wuffs ships as a "single file C library" or "header file library" as per
72// https://github.com/nothings/stb/blob/master/docs/stb_howto.txt
73//
74// To use that single file as a "foo.c"-like implementation, instead of a
75// "foo.h"-like header, #define WUFFS_IMPLEMENTATION before #include'ing or
76// compiling it.
77#define WUFFS_IMPLEMENTATION
78
79// Defining the WUFFS_CONFIG__MODULE* macros are optional, but it lets users of
80// release/c/etc.c whitelist which parts of Wuffs to build. That file contains
81// the entire Wuffs standard library, implementing a variety of codecs and file
82// formats. Without this macro definition, an optimizing compiler or linker may
83// very well discard Wuffs code for unused codecs, but listing the Wuffs
84// modules we use makes that process explicit. Preprocessing means that such
85// code simply isn't compiled.
86#define WUFFS_CONFIG__MODULES
Nigel Taocf6c5782020-08-03 23:43:45 +100087#define WUFFS_CONFIG__MODULE__AUX__BASE
88#define WUFFS_CONFIG__MODULE__AUX__JSON
Nigel Taod0b16cb2020-03-14 10:15:54 +110089#define WUFFS_CONFIG__MODULE__BASE
90#define WUFFS_CONFIG__MODULE__JSON
91
92// If building this program in an environment that doesn't easily accommodate
93// relative includes, you can use the script/inline-c-relative-includes.go
94// program to generate a stand-alone C++ file.
95#include "../../release/c/wuffs-unsupported-snapshot.c"
96
97#define TRY(error_msg) \
98 do { \
99 std::string z = error_msg; \
100 if (!z.empty()) { \
101 return z; \
102 } \
103 } while (false)
104
Nigel Taod60815c2020-03-26 14:32:35 +1100105static const char* g_usage =
Nigel Taod0b16cb2020-03-14 10:15:54 +1100106 "Usage: jsonfindptrs -flags input.json\n"
107 "\n"
108 "Flags:\n"
Nigel Tao94440cf2020-04-02 22:28:24 +1100109 " -d=NUM -max-output-depth=NUM\n"
Nigel Taoc766bb72020-07-09 12:59:32 +1000110 " -input-json-extra-comma\n"
Nigel Taoecadf722020-07-13 08:22:34 +1000111 " -strict-json-pointer-syntax\n"
Nigel Taod0b16cb2020-03-14 10:15:54 +1100112 "\n"
113 "The input.json filename is optional. If absent, it reads from stdin.\n"
114 "\n"
115 "----\n"
116 "\n"
117 "jsonfindptrs reads UTF-8 JSON from stdin and writes every node's JSON\n"
118 "Pointer (RFC 6901) to stdout.\n"
119 "\n"
120 "For example, given RFC 6901 section 5's sample input\n"
121 "(https://tools.ietf.org/rfc/rfc6901.txt), this command:\n"
122 " jsonfindptrs rfc-6901-json-pointer.json\n"
123 "will print:\n"
124 " \n"
125 " /\n"
126 " / \n"
127 " /a~1b\n"
128 " /c%d\n"
129 " /e^f\n"
130 " /foo\n"
131 " /foo/0\n"
132 " /foo/1\n"
133 " /g|h\n"
134 " /i\\j\n"
135 " /k\"l\n"
136 " /m~0n\n"
137 "\n"
138 "The first three lines are (1) a 0-byte \"\", (2) a 1-byte \"/\" and (3)\n"
139 "a 2-byte \"/ \". Unlike a file system, the \"/\" JSON Pointer does not\n"
140 "identify the root. Instead, \"\" is the root and \"/\" is the child (the\n"
141 "value in a key-value pair) of the root whose key is the empty string.\n"
142 "Similarly, \"/xyz\" and \"/xyz/\" are two different nodes.\n"
143 "\n"
144 "----\n"
145 "\n"
146 "The JSON specification (https://json.org/) permits implementations that\n"
147 "allow duplicate keys, but this one does not. Conversely, it prints keys\n"
148 "in sorted order, but the overall output is not necessarily sorted\n"
149 "lexicographically. For example, \"/a/9\" would come before \"/a/10\",\n"
150 "and \"/b/c\", a child of \"/b\", would come before \"/b+\".\n"
151 "\n"
152 "This JSON implementation also rejects integer values outside ±M, where\n"
153 "M is ((1<<53)-1), also known as JavaScript's Number.MAX_SAFE_INTEGER.\n"
154 "\n"
Nigel Taoc766bb72020-07-09 12:59:32 +1000155 "The -input-json-extra-comma flag allows input like \"[1,2,]\", with a\n"
156 "comma after the final element of a JSON list or dictionary.\n"
157 "\n"
Nigel Taod0b16cb2020-03-14 10:15:54 +1100158 "----\n"
159 "\n"
Nigel Taoecadf722020-07-13 08:22:34 +1000160 "The -strict-json-pointer-syntax flag restricts the output lines to\n"
161 "exactly RFC 6901, with only two escape sequences: \"~0\" and \"~1\" for\n"
162 "\"~\" and \"/\". Without this flag, this program also lets \"~n\" and\n"
163 "\"~r\" escape the New Line and Carriage Return ASCII control characters,\n"
164 "which can work better with line oriented Unix tools that assume exactly\n"
165 "one value (i.e. one JSON Pointer string) per line. With this flag, it\n"
166 "fails if the input JSON's keys contain \"\\u000A\" or \"\\u000D\".\n"
Nigel Taod0b16cb2020-03-14 10:15:54 +1100167 "\n"
168 "----\n"
169 "\n"
170 "The JSON specification permits implementations to set their own maximum\n"
171 "input depth. This JSON implementation sets it to 1024.\n"
172 "\n"
Nigel Tao94440cf2020-04-02 22:28:24 +1100173 "The -d=NUM or -max-output-depth=NUM flag gives the maximum (inclusive)\n"
Nigel Taod0b16cb2020-03-14 10:15:54 +1100174 "output depth. JSON containers ([] arrays and {} objects) can hold other\n"
Nigel Tao94440cf2020-04-02 22:28:24 +1100175 "containers. A bare -d or -max-output-depth is equivalent to -d=1,\n"
Nigel Taod0b16cb2020-03-14 10:15:54 +1100176 "analogous to the Unix ls command. The flag's absence is equivalent to an\n"
177 "unlimited output depth, analogous to the Unix find command (and hence\n"
178 "the name of this program: jsonfindptrs).";
179
180// ----
181
Nigel Taocf6c5782020-08-03 23:43:45 +1000182std::vector<uint32_t> g_quirks;
183
Nigel Taod0b16cb2020-03-14 10:15:54 +1100184struct {
185 int remaining_argc;
186 char** remaining_argv;
187
Nigel Taoc766bb72020-07-09 12:59:32 +1000188 bool input_json_extra_comma;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100189 uint32_t max_output_depth;
190 bool strict_json_pointer_syntax;
Nigel Taod60815c2020-03-26 14:32:35 +1100191} g_flags = {0};
Nigel Taod0b16cb2020-03-14 10:15:54 +1100192
193std::string //
194parse_flags(int argc, char** argv) {
Nigel Taod60815c2020-03-26 14:32:35 +1100195 g_flags.max_output_depth = 0xFFFFFFFF;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100196
197 int c = (argc > 0) ? 1 : 0; // Skip argv[0], the program name.
198 for (; c < argc; c++) {
199 char* arg = argv[c];
200 if (*arg++ != '-') {
201 break;
202 }
203
204 // A double-dash "--foo" is equivalent to a single-dash "-foo". As special
205 // cases, a bare "-" is not a flag (some programs may interpret it as
206 // stdin) and a bare "--" means to stop parsing flags.
207 if (*arg == '\x00') {
208 break;
209 } else if (*arg == '-') {
210 arg++;
211 if (*arg == '\x00') {
212 c++;
213 break;
214 }
215 }
216
Nigel Tao94440cf2020-04-02 22:28:24 +1100217 if (!strcmp(arg, "d") || !strcmp(arg, "max-output-depth")) {
Nigel Taod60815c2020-03-26 14:32:35 +1100218 g_flags.max_output_depth = 1;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100219 continue;
Nigel Tao94440cf2020-04-02 22:28:24 +1100220 } else if (!strncmp(arg, "d=", 2) ||
Nigel Taod0b16cb2020-03-14 10:15:54 +1100221 !strncmp(arg, "max-output-depth=", 16)) {
222 while (*arg++ != '=') {
223 }
224 wuffs_base__result_u64 u = wuffs_base__parse_number_u64(
Nigel Tao6b7ce302020-07-07 16:19:46 +1000225 wuffs_base__make_slice_u8((uint8_t*)arg, strlen(arg)),
226 WUFFS_BASE__PARSE_NUMBER_XXX__DEFAULT_OPTIONS);
Nigel Taod0b16cb2020-03-14 10:15:54 +1100227 if (wuffs_base__status__is_ok(&u.status) && (u.value <= 0xFFFFFFFF)) {
Nigel Taod60815c2020-03-26 14:32:35 +1100228 g_flags.max_output_depth = (uint32_t)(u.value);
Nigel Taod0b16cb2020-03-14 10:15:54 +1100229 continue;
230 }
Nigel Taod60815c2020-03-26 14:32:35 +1100231 return g_usage;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100232 }
Nigel Taoc766bb72020-07-09 12:59:32 +1000233 if (!strcmp(arg, "input-json-extra-comma")) {
Nigel Taocf6c5782020-08-03 23:43:45 +1000234 g_quirks.push_back(WUFFS_JSON__QUIRK_ALLOW_EXTRA_COMMA);
Nigel Taoc766bb72020-07-09 12:59:32 +1000235 continue;
236 }
Nigel Taoecadf722020-07-13 08:22:34 +1000237 if (!strcmp(arg, "strict-json-pointer-syntax")) {
Nigel Taod60815c2020-03-26 14:32:35 +1100238 g_flags.strict_json_pointer_syntax = true;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100239 continue;
240 }
241
Nigel Taod60815c2020-03-26 14:32:35 +1100242 return g_usage;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100243 }
244
Nigel Taod60815c2020-03-26 14:32:35 +1100245 g_flags.remaining_argc = argc - c;
246 g_flags.remaining_argv = argv + c;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100247 return "";
248}
249
Nigel Tao6b7ce302020-07-07 16:19:46 +1000250// ----
Nigel Taod0b16cb2020-03-14 10:15:54 +1100251
Nigel Taod0b16cb2020-03-14 10:15:54 +1100252class JsonThing {
253 public:
Nigel Taod0b16cb2020-03-14 10:15:54 +1100254 using Vector = std::vector<JsonThing>;
255
256 // We use a std::map in this example program to avoid dependencies outside of
257 // the C++ standard library. If you're copy/pasting this JsonThing code,
258 // consider a more efficient data structure such as an absl::btree_map.
259 //
260 // See CppCon 2014: Chandler Carruth "Efficiency with Algorithms, Performance
261 // with Data Structures" at https://www.youtube.com/watch?v=fHNmRkzxHWs
262 using Map = std::map<std::string, JsonThing>;
263
264 enum class Kind {
265 Null,
266 Bool,
267 Int64,
268 Float64,
269 String,
270 Array,
271 Object,
272 } kind = Kind::Null;
273
274 struct Value {
275 bool b = false;
276 int64_t i = 0;
277 double f = 0;
278 std::string s;
279 Vector a;
280 Map o;
281 } value;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100282};
283
Nigel Taod0b16cb2020-03-14 10:15:54 +1100284// ----
285
286std::string //
287escape(std::string s) {
288 for (char& c : s) {
289 if ((c == '~') || (c == '/') || (c == '\n') || (c == '\r')) {
290 goto escape_needed;
291 }
292 }
293 return s;
294
295escape_needed:
296 std::string e;
297 e.reserve(8 + s.length());
298 for (char& c : s) {
299 switch (c) {
300 case '~':
301 e += "~0";
302 break;
303 case '/':
304 e += "~1";
305 break;
306 case '\n':
Nigel Taod60815c2020-03-26 14:32:35 +1100307 if (g_flags.strict_json_pointer_syntax) {
Nigel Taod0b16cb2020-03-14 10:15:54 +1100308 return "";
309 }
310 e += "~n";
311 break;
312 case '\r':
Nigel Taod60815c2020-03-26 14:32:35 +1100313 if (g_flags.strict_json_pointer_syntax) {
Nigel Taod0b16cb2020-03-14 10:15:54 +1100314 return "";
315 }
316 e += "~r";
317 break;
318 default:
319 e += c;
320 break;
321 }
322 }
323 return e;
324}
325
326std::string //
327print_json_pointers(JsonThing& jt, std::string s, uint32_t depth) {
328 std::cout << s << std::endl;
Nigel Taod60815c2020-03-26 14:32:35 +1100329 if (depth++ >= g_flags.max_output_depth) {
Nigel Taod0b16cb2020-03-14 10:15:54 +1100330 return "";
331 }
332
333 switch (jt.kind) {
334 case JsonThing::Kind::Array:
335 s += "/";
336 for (size_t i = 0; i < jt.value.a.size(); i++) {
337 TRY(print_json_pointers(jt.value.a[i], s + std::to_string(i), depth));
338 }
339 break;
340 case JsonThing::Kind::Object:
341 s += "/";
342 for (auto& kv : jt.value.o) {
343 std::string e = escape(kv.first);
344 if (e.empty() && !kv.first.empty()) {
345 return "main: unsupported \"\\u000A\" or \"\\u000D\" in object key";
346 }
347 TRY(print_json_pointers(kv.second, s + e, depth));
348 }
349 break;
Nigel Tao18ef5b42020-03-16 10:37:47 +1100350 default:
351 break;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100352 }
353 return "";
354}
355
Nigel Taocf6c5782020-08-03 23:43:45 +1000356// ----
357
358class Callbacks : public wuffs_aux::DecodeJsonCallbacks {
359 public:
360 struct Entry {
361 Entry(JsonThing&& jt)
362 : thing(std::move(jt)), has_map_key(false), map_key() {}
363
364 JsonThing thing;
365 bool has_map_key;
366 std::string map_key;
367 };
368
369 Callbacks() = default;
370
371 std::string Append(JsonThing&& jt) {
372 if (m_stack.empty()) {
373 m_stack.push_back(Entry(std::move(jt)));
374 return "";
375 }
376 Entry& top = m_stack.back();
377 switch (top.thing.kind) {
378 case JsonThing::Kind::Array:
379 top.thing.value.a.push_back(std::move(jt));
380 return "";
381 case JsonThing::Kind::Object:
382 if (top.has_map_key) {
383 top.has_map_key = false;
384 auto iter = top.thing.value.o.find(top.map_key);
385 if (iter != top.thing.value.o.end()) {
386 return "main: duplicate key: " + top.map_key;
387 }
388 top.thing.value.o.insert(
389 iter, JsonThing::Map::value_type(std::move(top.map_key),
390 std::move(jt)));
391 return "";
392 } else if (jt.kind == JsonThing::Kind::String) {
393 top.has_map_key = true;
394 top.map_key = std::move(jt.value.s);
395 return "";
396 }
397 return "main: internal error: non-string map key";
398 }
399 return "main: internal error: non-container stack entry";
400 }
401
402 virtual std::string AppendNull() {
403 JsonThing jt;
404 jt.kind = JsonThing::Kind::Null;
405 return Append(std::move(jt));
406 }
407
408 virtual std::string AppendBool(bool val) {
409 JsonThing jt;
410 jt.kind = JsonThing::Kind::Bool;
411 jt.value.b = val;
412 return Append(std::move(jt));
413 }
414
415 virtual std::string AppendI64(int64_t val) {
416 JsonThing jt;
417 jt.kind = JsonThing::Kind::Int64;
418 jt.value.i = val;
419 return Append(std::move(jt));
420 }
421
422 virtual std::string AppendF64(double val) {
423 JsonThing jt;
424 jt.kind = JsonThing::Kind::Float64;
425 jt.value.f = val;
426 return Append(std::move(jt));
427 }
428
429 virtual std::string AppendString(std::string&& val) {
430 JsonThing jt;
431 jt.kind = JsonThing::Kind::String;
432 jt.value.s = std::move(val);
433 return Append(std::move(jt));
434 }
435
436 virtual std::string Push(uint32_t flags) {
437 if (flags & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST) {
438 JsonThing jt;
439 jt.kind = JsonThing::Kind::Array;
440 m_stack.push_back(std::move(jt));
441 return "";
442 } else if (flags & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_DICT) {
443 JsonThing jt;
444 jt.kind = JsonThing::Kind::Object;
445 m_stack.push_back(std::move(jt));
446 return "";
447 }
448 return "main: internal error: bad push";
449 }
450
451 virtual std::string Pop(uint32_t flags) {
452 if (m_stack.empty()) {
453 return "main: internal error: bad pop";
454 }
455 JsonThing jt = std::move(m_stack.back().thing);
456 m_stack.pop_back();
457 return Append(std::move(jt));
458 }
459
460 virtual void Done(wuffs_aux::DecodeJsonResult& result,
461 wuffs_aux::sync_io::Input& input,
462 wuffs_aux::IOBuffer& buffer) {
463 if (!result.error_message.empty()) {
464 return;
465 } else if (m_stack.size() != 1) {
466 result.error_message = "main: internal error: bad depth";
467 return;
468 }
469 result.error_message = print_json_pointers(m_stack.back().thing, "", 0);
470 }
471
472 private:
473 std::vector<Entry> m_stack;
474};
475
476// ----
477
Nigel Taod0b16cb2020-03-14 10:15:54 +1100478std::string //
479main1(int argc, char** argv) {
480 TRY(parse_flags(argc, argv));
481
Nigel Taocf6c5782020-08-03 23:43:45 +1000482 FILE* in = stdin;
Nigel Taod60815c2020-03-26 14:32:35 +1100483 if (g_flags.remaining_argc > 1) {
484 return g_usage;
485 } else if (g_flags.remaining_argc == 1) {
Nigel Taocf6c5782020-08-03 23:43:45 +1000486 in = fopen(g_flags.remaining_argv[0], "r");
487 if (!in) {
488 return std::string("main: cannot read input file");
Nigel Taod0b16cb2020-03-14 10:15:54 +1100489 }
490 }
491
Nigel Taocf6c5782020-08-03 23:43:45 +1000492 return wuffs_aux::DecodeJson(
493 Callbacks(), wuffs_aux::sync_io::FileInput(in),
494 wuffs_base__make_slice_u32(g_quirks.data(), g_quirks.size()))
495 .error_message;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100496}
497
498// ----
499
500int //
501compute_exit_code(std::string status_msg) {
502 if (status_msg.empty()) {
503 return 0;
504 }
505 std::cerr << status_msg << std::endl;
506 // Return an exit code of 1 for regular (forseen) errors, e.g. badly
507 // formatted or unsupported input.
508 //
509 // Return an exit code of 2 for internal (exceptional) errors, e.g. defensive
510 // run-time checks found that an internal invariant did not hold.
511 //
512 // Automated testing, including badly formatted inputs, can therefore
513 // discriminate between expected failure (exit code 1) and unexpected failure
514 // (other non-zero exit codes). Specifically, exit code 2 for internal
515 // invariant violation, exit code 139 (which is 128 + SIGSEGV on x86_64
516 // linux) for a segmentation fault (e.g. null pointer dereference).
517 return (status_msg.find("internal error:") != std::string::npos) ? 2 : 1;
518}
519
520int //
521main(int argc, char** argv) {
522 std::string z = main1(argc, argv);
523 int exit_code = compute_exit_code(z);
524 return exit_code;
525}