blob: b5dcef1bf4f45b32e95808d12263a7198258467f [file] [log] [blame]
Nigel Taod0b16cb2020-03-14 10:15:54 +11001// Copyright 2020 The Wuffs Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// ----------------
16
17/*
18jsonfindptrs reads UTF-8 JSON from stdin and writes every node's JSON Pointer
19(RFC 6901) to stdout.
20
Nigel Taod60815c2020-03-26 14:32:35 +110021See the "const char* g_usage" string below for details.
Nigel Taod0b16cb2020-03-14 10:15:54 +110022
23----
24
25This program uses Wuffs' JSON decoder at a relatively high level, building
26in-memory representations of JSON 'things' (e.g. numbers, strings, objects).
27After the entire input has been converted, walking the tree prints the output
Nigel Taocf6c5782020-08-03 23:43:45 +100028(in sorted order). The wuffs_aux::DecodeJson library function converts the
29lower level token stream to higher level callbacks. This .cc file deals only
30with those callbacks, not with tokens per se.
Nigel Taod0b16cb2020-03-14 10:15:54 +110031
32This approach is centered around JSON things. Each JSON thing comprises one or
33more JSON tokens.
34
35An alternative, lower-level approach is in the sibling example/jsonptr program.
36Neither approach is better or worse per se, but when studying this program, be
37aware that there are multiple ways to use Wuffs' JSON decoder.
38
39The two programs, jsonfindptrs and jsonptr, also demonstrate different
40trade-offs with regard to JSON object duplicate keys. The JSON spec permits
41different implementations to allow or reject duplicate keys. It is not always
42clear which approach is safer. Rejecting them is certainly unambiguous, and
43security bugs can lurk in ambiguous corners of a file format, if two different
44implementations both silently accept a file but differ on how to interpret it.
45On the other hand, in the worst case, detecting duplicate keys requires O(N)
46memory, where N is the size of the (potentially untrusted) input.
47
48This program (jsonfindptrs) rejects duplicate keys.
49
50----
51
Nigel Tao50bfab92020-08-05 11:39:09 +100052To run:
Nigel Taod0b16cb2020-03-14 10:15:54 +110053
54$CXX jsonfindptrs.cc && ./a.out < ../../test/data/github-tags.json; rm -f a.out
55
56for a C++ compiler $CXX, such as clang++ or g++.
57*/
58
Nigel Tao721190a2020-04-03 22:25:21 +110059#if defined(__cplusplus) && (__cplusplus < 201103L)
60#error "This C++ program requires -std=c++11 or later"
61#endif
62
Nigel Taocf6c5782020-08-03 23:43:45 +100063#include <stdio.h>
Nigel Tao6b7ce302020-07-07 16:19:46 +100064
Nigel Taod0b16cb2020-03-14 10:15:54 +110065#include <iostream>
66#include <map>
67#include <string>
68#include <vector>
69
70// Wuffs ships as a "single file C library" or "header file library" as per
71// https://github.com/nothings/stb/blob/master/docs/stb_howto.txt
72//
73// To use that single file as a "foo.c"-like implementation, instead of a
74// "foo.h"-like header, #define WUFFS_IMPLEMENTATION before #include'ing or
75// compiling it.
76#define WUFFS_IMPLEMENTATION
77
78// Defining the WUFFS_CONFIG__MODULE* macros are optional, but it lets users of
79// release/c/etc.c whitelist which parts of Wuffs to build. That file contains
80// the entire Wuffs standard library, implementing a variety of codecs and file
81// formats. Without this macro definition, an optimizing compiler or linker may
82// very well discard Wuffs code for unused codecs, but listing the Wuffs
83// modules we use makes that process explicit. Preprocessing means that such
84// code simply isn't compiled.
85#define WUFFS_CONFIG__MODULES
Nigel Taocf6c5782020-08-03 23:43:45 +100086#define WUFFS_CONFIG__MODULE__AUX__BASE
87#define WUFFS_CONFIG__MODULE__AUX__JSON
Nigel Taod0b16cb2020-03-14 10:15:54 +110088#define WUFFS_CONFIG__MODULE__BASE
89#define WUFFS_CONFIG__MODULE__JSON
90
91// If building this program in an environment that doesn't easily accommodate
92// relative includes, you can use the script/inline-c-relative-includes.go
93// program to generate a stand-alone C++ file.
94#include "../../release/c/wuffs-unsupported-snapshot.c"
95
96#define TRY(error_msg) \
97 do { \
98 std::string z = error_msg; \
99 if (!z.empty()) { \
100 return z; \
101 } \
102 } while (false)
103
Nigel Taod60815c2020-03-26 14:32:35 +1100104static const char* g_usage =
Nigel Taod0b16cb2020-03-14 10:15:54 +1100105 "Usage: jsonfindptrs -flags input.json\n"
106 "\n"
107 "Flags:\n"
Nigel Tao94440cf2020-04-02 22:28:24 +1100108 " -d=NUM -max-output-depth=NUM\n"
Nigel Tao0a5940d2020-08-07 13:15:41 +1000109 " -input-allow-comments\n"
110 " -input-allow-extra-comma\n"
111 " -input-allow-inf-nan-numbers\n"
Nigel Taoecadf722020-07-13 08:22:34 +1000112 " -strict-json-pointer-syntax\n"
Nigel Taod0b16cb2020-03-14 10:15:54 +1100113 "\n"
114 "The input.json filename is optional. If absent, it reads from stdin.\n"
115 "\n"
116 "----\n"
117 "\n"
118 "jsonfindptrs reads UTF-8 JSON from stdin and writes every node's JSON\n"
119 "Pointer (RFC 6901) to stdout.\n"
120 "\n"
121 "For example, given RFC 6901 section 5's sample input\n"
122 "(https://tools.ietf.org/rfc/rfc6901.txt), this command:\n"
123 " jsonfindptrs rfc-6901-json-pointer.json\n"
124 "will print:\n"
125 " \n"
126 " /\n"
127 " / \n"
128 " /a~1b\n"
129 " /c%d\n"
130 " /e^f\n"
131 " /foo\n"
132 " /foo/0\n"
133 " /foo/1\n"
134 " /g|h\n"
135 " /i\\j\n"
136 " /k\"l\n"
137 " /m~0n\n"
138 "\n"
139 "The first three lines are (1) a 0-byte \"\", (2) a 1-byte \"/\" and (3)\n"
140 "a 2-byte \"/ \". Unlike a file system, the \"/\" JSON Pointer does not\n"
141 "identify the root. Instead, \"\" is the root and \"/\" is the child (the\n"
142 "value in a key-value pair) of the root whose key is the empty string.\n"
143 "Similarly, \"/xyz\" and \"/xyz/\" are two different nodes.\n"
144 "\n"
145 "----\n"
146 "\n"
147 "The JSON specification (https://json.org/) permits implementations that\n"
148 "allow duplicate keys, but this one does not. Conversely, it prints keys\n"
149 "in sorted order, but the overall output is not necessarily sorted\n"
150 "lexicographically. For example, \"/a/9\" would come before \"/a/10\",\n"
151 "and \"/b/c\", a child of \"/b\", would come before \"/b+\".\n"
152 "\n"
153 "This JSON implementation also rejects integer values outside ±M, where\n"
154 "M is ((1<<53)-1), also known as JavaScript's Number.MAX_SAFE_INTEGER.\n"
155 "\n"
Nigel Tao0a5940d2020-08-07 13:15:41 +1000156 "The -input-allow-comments flag allows \"/*slash-star*/\" and\n"
157 "\"//slash-slash\" C-style comments within JSON input.\n"
158 "\n"
159 "The -input-allow-extra-comma flag allows input like \"[1,2,]\", with a\n"
Nigel Taoc766bb72020-07-09 12:59:32 +1000160 "comma after the final element of a JSON list or dictionary.\n"
161 "\n"
Nigel Tao0a5940d2020-08-07 13:15:41 +1000162 "The -input-allow-inf-nan-numbers flag allows non-finite floating point\n"
163 "numbers (infinities and not-a-numbers) within JSON input.\n"
164 "\n"
Nigel Taod0b16cb2020-03-14 10:15:54 +1100165 "----\n"
166 "\n"
Nigel Taoecadf722020-07-13 08:22:34 +1000167 "The -strict-json-pointer-syntax flag restricts the output lines to\n"
168 "exactly RFC 6901, with only two escape sequences: \"~0\" and \"~1\" for\n"
169 "\"~\" and \"/\". Without this flag, this program also lets \"~n\" and\n"
170 "\"~r\" escape the New Line and Carriage Return ASCII control characters,\n"
171 "which can work better with line oriented Unix tools that assume exactly\n"
172 "one value (i.e. one JSON Pointer string) per line. With this flag, it\n"
173 "fails if the input JSON's keys contain \"\\u000A\" or \"\\u000D\".\n"
Nigel Taod0b16cb2020-03-14 10:15:54 +1100174 "\n"
175 "----\n"
176 "\n"
177 "The JSON specification permits implementations to set their own maximum\n"
178 "input depth. This JSON implementation sets it to 1024.\n"
179 "\n"
Nigel Tao94440cf2020-04-02 22:28:24 +1100180 "The -d=NUM or -max-output-depth=NUM flag gives the maximum (inclusive)\n"
Nigel Taod0b16cb2020-03-14 10:15:54 +1100181 "output depth. JSON containers ([] arrays and {} objects) can hold other\n"
Nigel Tao94440cf2020-04-02 22:28:24 +1100182 "containers. A bare -d or -max-output-depth is equivalent to -d=1,\n"
Nigel Taod0b16cb2020-03-14 10:15:54 +1100183 "analogous to the Unix ls command. The flag's absence is equivalent to an\n"
184 "unlimited output depth, analogous to the Unix find command (and hence\n"
185 "the name of this program: jsonfindptrs).";
186
187// ----
188
Nigel Taocf6c5782020-08-03 23:43:45 +1000189std::vector<uint32_t> g_quirks;
190
Nigel Taod0b16cb2020-03-14 10:15:54 +1100191struct {
192 int remaining_argc;
193 char** remaining_argv;
194
195 uint32_t max_output_depth;
196 bool strict_json_pointer_syntax;
Nigel Taod60815c2020-03-26 14:32:35 +1100197} g_flags = {0};
Nigel Taod0b16cb2020-03-14 10:15:54 +1100198
199std::string //
200parse_flags(int argc, char** argv) {
Nigel Taod60815c2020-03-26 14:32:35 +1100201 g_flags.max_output_depth = 0xFFFFFFFF;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100202
203 int c = (argc > 0) ? 1 : 0; // Skip argv[0], the program name.
204 for (; c < argc; c++) {
205 char* arg = argv[c];
206 if (*arg++ != '-') {
207 break;
208 }
209
210 // A double-dash "--foo" is equivalent to a single-dash "-foo". As special
211 // cases, a bare "-" is not a flag (some programs may interpret it as
212 // stdin) and a bare "--" means to stop parsing flags.
213 if (*arg == '\x00') {
214 break;
215 } else if (*arg == '-') {
216 arg++;
217 if (*arg == '\x00') {
218 c++;
219 break;
220 }
221 }
222
Nigel Tao94440cf2020-04-02 22:28:24 +1100223 if (!strcmp(arg, "d") || !strcmp(arg, "max-output-depth")) {
Nigel Taod60815c2020-03-26 14:32:35 +1100224 g_flags.max_output_depth = 1;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100225 continue;
Nigel Tao94440cf2020-04-02 22:28:24 +1100226 } else if (!strncmp(arg, "d=", 2) ||
Nigel Taod0b16cb2020-03-14 10:15:54 +1100227 !strncmp(arg, "max-output-depth=", 16)) {
228 while (*arg++ != '=') {
229 }
230 wuffs_base__result_u64 u = wuffs_base__parse_number_u64(
Nigel Tao6b7ce302020-07-07 16:19:46 +1000231 wuffs_base__make_slice_u8((uint8_t*)arg, strlen(arg)),
232 WUFFS_BASE__PARSE_NUMBER_XXX__DEFAULT_OPTIONS);
Nigel Taod0b16cb2020-03-14 10:15:54 +1100233 if (wuffs_base__status__is_ok(&u.status) && (u.value <= 0xFFFFFFFF)) {
Nigel Taod60815c2020-03-26 14:32:35 +1100234 g_flags.max_output_depth = (uint32_t)(u.value);
Nigel Taod0b16cb2020-03-14 10:15:54 +1100235 continue;
236 }
Nigel Taod60815c2020-03-26 14:32:35 +1100237 return g_usage;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100238 }
Nigel Tao0a5940d2020-08-07 13:15:41 +1000239 if (!strcmp(arg, "input-allow-comments")) {
240 g_quirks.push_back(WUFFS_JSON__QUIRK_ALLOW_COMMENT_BLOCK);
241 g_quirks.push_back(WUFFS_JSON__QUIRK_ALLOW_COMMENT_LINE);
242 continue;
243 }
244 if (!strcmp(arg, "input-allow-extra-comma")) {
Nigel Taocf6c5782020-08-03 23:43:45 +1000245 g_quirks.push_back(WUFFS_JSON__QUIRK_ALLOW_EXTRA_COMMA);
Nigel Taoc766bb72020-07-09 12:59:32 +1000246 continue;
247 }
Nigel Tao0a5940d2020-08-07 13:15:41 +1000248 if (!strcmp(arg, "input-allow-inf-nan-numbers")) {
249 g_quirks.push_back(WUFFS_JSON__QUIRK_ALLOW_INF_NAN_NUMBERS);
250 continue;
251 }
Nigel Taoecadf722020-07-13 08:22:34 +1000252 if (!strcmp(arg, "strict-json-pointer-syntax")) {
Nigel Taod60815c2020-03-26 14:32:35 +1100253 g_flags.strict_json_pointer_syntax = true;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100254 continue;
255 }
256
Nigel Taod60815c2020-03-26 14:32:35 +1100257 return g_usage;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100258 }
259
Nigel Taod60815c2020-03-26 14:32:35 +1100260 g_flags.remaining_argc = argc - c;
261 g_flags.remaining_argv = argv + c;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100262 return "";
263}
264
Nigel Tao6b7ce302020-07-07 16:19:46 +1000265// ----
Nigel Taod0b16cb2020-03-14 10:15:54 +1100266
Nigel Taod0b16cb2020-03-14 10:15:54 +1100267class JsonThing {
268 public:
Nigel Taod0b16cb2020-03-14 10:15:54 +1100269 using Vector = std::vector<JsonThing>;
270
271 // We use a std::map in this example program to avoid dependencies outside of
272 // the C++ standard library. If you're copy/pasting this JsonThing code,
273 // consider a more efficient data structure such as an absl::btree_map.
274 //
275 // See CppCon 2014: Chandler Carruth "Efficiency with Algorithms, Performance
276 // with Data Structures" at https://www.youtube.com/watch?v=fHNmRkzxHWs
277 using Map = std::map<std::string, JsonThing>;
278
279 enum class Kind {
280 Null,
281 Bool,
282 Int64,
283 Float64,
284 String,
285 Array,
286 Object,
287 } kind = Kind::Null;
288
289 struct Value {
290 bool b = false;
291 int64_t i = 0;
292 double f = 0;
293 std::string s;
294 Vector a;
295 Map o;
296 } value;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100297};
298
Nigel Taod0b16cb2020-03-14 10:15:54 +1100299// ----
300
301std::string //
302escape(std::string s) {
303 for (char& c : s) {
304 if ((c == '~') || (c == '/') || (c == '\n') || (c == '\r')) {
305 goto escape_needed;
306 }
307 }
308 return s;
309
310escape_needed:
311 std::string e;
312 e.reserve(8 + s.length());
313 for (char& c : s) {
314 switch (c) {
315 case '~':
316 e += "~0";
317 break;
318 case '/':
319 e += "~1";
320 break;
321 case '\n':
Nigel Taod60815c2020-03-26 14:32:35 +1100322 if (g_flags.strict_json_pointer_syntax) {
Nigel Taod0b16cb2020-03-14 10:15:54 +1100323 return "";
324 }
325 e += "~n";
326 break;
327 case '\r':
Nigel Taod60815c2020-03-26 14:32:35 +1100328 if (g_flags.strict_json_pointer_syntax) {
Nigel Taod0b16cb2020-03-14 10:15:54 +1100329 return "";
330 }
331 e += "~r";
332 break;
333 default:
334 e += c;
335 break;
336 }
337 }
338 return e;
339}
340
341std::string //
342print_json_pointers(JsonThing& jt, std::string s, uint32_t depth) {
343 std::cout << s << std::endl;
Nigel Taod60815c2020-03-26 14:32:35 +1100344 if (depth++ >= g_flags.max_output_depth) {
Nigel Taod0b16cb2020-03-14 10:15:54 +1100345 return "";
346 }
347
348 switch (jt.kind) {
349 case JsonThing::Kind::Array:
350 s += "/";
351 for (size_t i = 0; i < jt.value.a.size(); i++) {
352 TRY(print_json_pointers(jt.value.a[i], s + std::to_string(i), depth));
353 }
354 break;
355 case JsonThing::Kind::Object:
356 s += "/";
357 for (auto& kv : jt.value.o) {
358 std::string e = escape(kv.first);
359 if (e.empty() && !kv.first.empty()) {
360 return "main: unsupported \"\\u000A\" or \"\\u000D\" in object key";
361 }
362 TRY(print_json_pointers(kv.second, s + e, depth));
363 }
364 break;
Nigel Tao18ef5b42020-03-16 10:37:47 +1100365 default:
366 break;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100367 }
368 return "";
369}
370
Nigel Taocf6c5782020-08-03 23:43:45 +1000371// ----
372
373class Callbacks : public wuffs_aux::DecodeJsonCallbacks {
374 public:
375 struct Entry {
376 Entry(JsonThing&& jt)
377 : thing(std::move(jt)), has_map_key(false), map_key() {}
378
379 JsonThing thing;
380 bool has_map_key;
381 std::string map_key;
382 };
383
384 Callbacks() = default;
385
386 std::string Append(JsonThing&& jt) {
387 if (m_stack.empty()) {
388 m_stack.push_back(Entry(std::move(jt)));
389 return "";
390 }
391 Entry& top = m_stack.back();
392 switch (top.thing.kind) {
393 case JsonThing::Kind::Array:
394 top.thing.value.a.push_back(std::move(jt));
395 return "";
396 case JsonThing::Kind::Object:
397 if (top.has_map_key) {
398 top.has_map_key = false;
399 auto iter = top.thing.value.o.find(top.map_key);
400 if (iter != top.thing.value.o.end()) {
401 return "main: duplicate key: " + top.map_key;
402 }
403 top.thing.value.o.insert(
404 iter, JsonThing::Map::value_type(std::move(top.map_key),
405 std::move(jt)));
406 return "";
407 } else if (jt.kind == JsonThing::Kind::String) {
408 top.has_map_key = true;
409 top.map_key = std::move(jt.value.s);
410 return "";
411 }
412 return "main: internal error: non-string map key";
413 }
414 return "main: internal error: non-container stack entry";
415 }
416
417 virtual std::string AppendNull() {
418 JsonThing jt;
419 jt.kind = JsonThing::Kind::Null;
420 return Append(std::move(jt));
421 }
422
423 virtual std::string AppendBool(bool val) {
424 JsonThing jt;
425 jt.kind = JsonThing::Kind::Bool;
426 jt.value.b = val;
427 return Append(std::move(jt));
428 }
429
430 virtual std::string AppendI64(int64_t val) {
431 JsonThing jt;
432 jt.kind = JsonThing::Kind::Int64;
433 jt.value.i = val;
434 return Append(std::move(jt));
435 }
436
437 virtual std::string AppendF64(double val) {
438 JsonThing jt;
439 jt.kind = JsonThing::Kind::Float64;
440 jt.value.f = val;
441 return Append(std::move(jt));
442 }
443
Nigel Tao6d817642020-08-09 23:11:42 +1000444 virtual std::string AppendTextString(std::string&& val) {
Nigel Taocf6c5782020-08-03 23:43:45 +1000445 JsonThing jt;
446 jt.kind = JsonThing::Kind::String;
447 jt.value.s = std::move(val);
448 return Append(std::move(jt));
449 }
450
451 virtual std::string Push(uint32_t flags) {
452 if (flags & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST) {
453 JsonThing jt;
454 jt.kind = JsonThing::Kind::Array;
455 m_stack.push_back(std::move(jt));
456 return "";
457 } else if (flags & WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_DICT) {
458 JsonThing jt;
459 jt.kind = JsonThing::Kind::Object;
460 m_stack.push_back(std::move(jt));
461 return "";
462 }
463 return "main: internal error: bad push";
464 }
465
466 virtual std::string Pop(uint32_t flags) {
467 if (m_stack.empty()) {
468 return "main: internal error: bad pop";
469 }
470 JsonThing jt = std::move(m_stack.back().thing);
471 m_stack.pop_back();
472 return Append(std::move(jt));
473 }
474
475 virtual void Done(wuffs_aux::DecodeJsonResult& result,
476 wuffs_aux::sync_io::Input& input,
477 wuffs_aux::IOBuffer& buffer) {
478 if (!result.error_message.empty()) {
479 return;
480 } else if (m_stack.size() != 1) {
481 result.error_message = "main: internal error: bad depth";
482 return;
483 }
484 result.error_message = print_json_pointers(m_stack.back().thing, "", 0);
485 }
486
487 private:
488 std::vector<Entry> m_stack;
489};
490
491// ----
492
Nigel Taod0b16cb2020-03-14 10:15:54 +1100493std::string //
494main1(int argc, char** argv) {
495 TRY(parse_flags(argc, argv));
496
Nigel Taocf6c5782020-08-03 23:43:45 +1000497 FILE* in = stdin;
Nigel Taod60815c2020-03-26 14:32:35 +1100498 if (g_flags.remaining_argc > 1) {
499 return g_usage;
500 } else if (g_flags.remaining_argc == 1) {
Nigel Taocf6c5782020-08-03 23:43:45 +1000501 in = fopen(g_flags.remaining_argv[0], "r");
502 if (!in) {
503 return std::string("main: cannot read input file");
Nigel Taod0b16cb2020-03-14 10:15:54 +1100504 }
505 }
506
Nigel Taocf6c5782020-08-03 23:43:45 +1000507 return wuffs_aux::DecodeJson(
508 Callbacks(), wuffs_aux::sync_io::FileInput(in),
509 wuffs_base__make_slice_u32(g_quirks.data(), g_quirks.size()))
510 .error_message;
Nigel Taod0b16cb2020-03-14 10:15:54 +1100511}
512
513// ----
514
515int //
516compute_exit_code(std::string status_msg) {
517 if (status_msg.empty()) {
518 return 0;
519 }
520 std::cerr << status_msg << std::endl;
521 // Return an exit code of 1 for regular (forseen) errors, e.g. badly
522 // formatted or unsupported input.
523 //
524 // Return an exit code of 2 for internal (exceptional) errors, e.g. defensive
525 // run-time checks found that an internal invariant did not hold.
526 //
527 // Automated testing, including badly formatted inputs, can therefore
528 // discriminate between expected failure (exit code 1) and unexpected failure
529 // (other non-zero exit codes). Specifically, exit code 2 for internal
530 // invariant violation, exit code 139 (which is 128 + SIGSEGV on x86_64
531 // linux) for a segmentation fault (e.g. null pointer dereference).
532 return (status_msg.find("internal error:") != std::string::npos) ? 2 : 1;
533}
534
535int //
536main(int argc, char** argv) {
537 std::string z = main1(argc, argv);
538 int exit_code = compute_exit_code(z);
539 return exit_code;
540}