// After editing this file, run "go generate" in the parent directory.

// Copyright 2020 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// ---------------- Tokens

// wuffs_base__token is a single 64-bit word (the repr field) that packs a
// value, two link bits and a length.
typedef struct {
  // The repr's 64 bits are divided as:
  //
  // +-----+-------------+-------+-------------+-----+-----+-----------+
  // |  1  |      21     |   3   |      21     |  1  |  1  |     16    |
  // +-----+-------------+-------+-------------+-----+-----+-----------+
  // [..................value..................]  LP    LN     length
  // [..0..|.value_major.|.....value_minor.....]
  // [..0..|.........VBC.........|.....VBD.....]
  //
  // The broad divisions are:
  //  - Bits 63 .. 18 (46 bits) is the value.
  //  - Bits 17 .. 16 ( 2 bits) is LP and LN (link_prev and link_next).
  //  - Bits 15 ..  0 (16 bits) is the length.
  //
  // The value bits can be sub-divided in multiple ways:
  //  - Bits 63 .. 63 ( 1 bits) is reserved (a zero bit).
  //  - Bits 62 .. 42 (21 bits) is the value_major.
  //  - Bits 41 .. 18 (24 bits) is the value_minor.
  //  - Bits 62 .. 39 (24 bits) is the VBC (value_base_category).
  //  - Bits 38 .. 18 (21 bits) is the VBD (value_base_detail).
  //
  // The value_major is a 21-bit [Base38](doc/note/base38-and-fourcc.md) value.
  // If all of its bits are zero (special cased for Wuffs' built-in "base"
  // package) then the value_minor is further sub-divided:
  //  - Bits 41 .. 39 ( 3 bits) is the VBC (value_base_category).
  //  - Bits 38 .. 18 (21 bits) is the VBD (value_base_detail).
  //
  // The high 46 bits (bits 63 .. 18) only have VBC and VBD semantics when the
  // high 22 bits (the reserved zero bit plus the 21-bit value_major) are all
  // zero. An equivalent test is that the high 25 bits (the reserved zero bit
  // plus the 24-bit notional VBC) has a numerical value less than 8.
  //
  // At 21 bits, the VBD can hold every valid Unicode code point.
  //
  // If value_major is non-zero then value_minor has whatever arbitrary meaning
  // the tokenizer's package assigns to it.
  //
  // Multiple consecutive tokens can form a larger conceptual unit. For
  // example, an "abc\tz" string is a single higher level concept but at the
  // lower level, it could consist of multiple tokens: the quotes '"', the
  // ASCII texts "abc" and "z" and the backslash-escaped tab '\t'. The LP and
  // LN (link_prev and link_next) bits denote tokens that are part of a
  // multi-token chain:
  //  - LP means that this token is not the first (there is a previous token).
  //  - LN means that this token is not the last (there is a next token).
  //
  // In particular, a stand-alone token will have both link bits set to zero.
  uint64_t repr;

#ifdef __cplusplus
  // C++ convenience methods; each is declared here and defined later in the
  // file as a thin wrapper around a C function.
  inline uint64_t value() const;
  inline uint64_t value_major() const;
  inline uint64_t value_minor() const;
  inline uint64_t value_base_category() const;
  inline uint64_t value_base_detail() const;
  inline bool link_prev() const;
  inline bool link_next() const;
  inline uint64_t length() const;
#endif  // __cplusplus

} wuffs_base__token;

81static inline wuffs_base__token //
82wuffs_base__make_token(uint64_t repr) {
83 wuffs_base__token ret;
84 ret.repr = repr;
85 return ret;
86}
87
// --------

// Each FOO__MASK is applied after right-shifting a token's repr by the
// corresponding FOO__SHIFT: field = (repr >> FOO__SHIFT) & FOO__MASK.
//
// Mask widths, in bits: VALUE 46, VALUE_MAJOR 22, VALUE_MINOR 24,
// VALUE_BASE_CATEGORY 25, VALUE_BASE_DETAIL 21, LINK 2, LENGTH 16. The
// VALUE_MAJOR and VALUE_BASE_CATEGORY masks each include bit 63 of the repr.
#define WUFFS_BASE__TOKEN__VALUE__MASK 0x3FFFFFFFFFFF
#define WUFFS_BASE__TOKEN__VALUE_MAJOR__MASK 0x3FFFFF
#define WUFFS_BASE__TOKEN__VALUE_MINOR__MASK 0xFFFFFF
#define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__MASK 0x1FFFFFF
#define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__MASK 0x1FFFFF
#define WUFFS_BASE__TOKEN__LINK__MASK 0x3
#define WUFFS_BASE__TOKEN__LENGTH__MASK 0xFFFF

// Each FOO__SHIFT is the bit position, within the repr, of the field's least
// significant bit.
#define WUFFS_BASE__TOKEN__VALUE__SHIFT 18
#define WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT 42
#define WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT 18
#define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT 39
#define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT 18
#define WUFFS_BASE__TOKEN__LINK__SHIFT 16
#define WUFFS_BASE__TOKEN__LENGTH__SHIFT 0

// The link bits, already positioned within the repr (not shifted down):
// LINK_PREV is bit 17 and LINK_NEXT is bit 16.
#define WUFFS_BASE__TOKEN__LINK_PREV 0x20000
#define WUFFS_BASE__TOKEN__LINK_NEXT 0x10000

// --------

// VBC (value_base_category) values. All of them are less than 8, so each
// fits in the 3-bit VBC that applies when the value_major is zero.
#define WUFFS_BASE__TOKEN__VBC__FILLER 0
#define WUFFS_BASE__TOKEN__VBC__STRUCTURE 1
#define WUFFS_BASE__TOKEN__VBC__STRING 2
#define WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT 3
#define WUFFS_BASE__TOKEN__VBC__LITERAL 4
#define WUFFS_BASE__TOKEN__VBC__NUMBER 5

// --------

// VBD (value_base_detail) bits for WUFFS_BASE__TOKEN__VBC__FILLER tokens.
// Each value is a distinct single bit.
#define WUFFS_BASE__TOKEN__VBD__FILLER__COMMENT_LINE 0x00001
#define WUFFS_BASE__TOKEN__VBD__FILLER__COMMENT_BLOCK 0x00002

// --------

// VBD (value_base_detail) bits for WUFFS_BASE__TOKEN__VBC__STRUCTURE tokens.
// Each value is a distinct single bit, in three groups: PUSH / POP,
// FROM_{NONE,LIST,DICT} and TO_{NONE,LIST,DICT}.
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__PUSH 0x00001
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__POP 0x00002
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_NONE 0x00010
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_LIST 0x00020
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_DICT 0x00040
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_NONE 0x01000
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST 0x02000
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_DICT 0x04000

// --------

// VBD (value_base_detail) bits for WUFFS_BASE__TOKEN__VBC__STRING tokens.

// "DEFINITELY_FOO" means that the destination bytes (and also the source
// bytes, for 1_DST_1_SRC_COPY) are in the FOO format. Definitely means that
// the lack of the bit is conservative: it is valid for all-ASCII strings to
// have neither the DEFINITELY_UTF_8 nor the DEFINITELY_ASCII bit set.
#define WUFFS_BASE__TOKEN__VBD__STRING__DEFINITELY_UTF_8 0x00001
#define WUFFS_BASE__TOKEN__VBD__STRING__DEFINITELY_ASCII 0x00002

// "CONVERT_D_DST_S_SRC" means that multiples of S source bytes (possibly
// padded) produce multiples of D destination bytes. For example,
// CONVERT_1_DST_4_SRC_BACKSLASH_X means a source like "\\x23\\x67\\xAB", where
// 12 src bytes encode 3 dst bytes.
//
// When src is the empty string, multiple conversion algorithms are applicable
// (so these bits are not necessarily mutually exclusive), all producing the
// same empty dst string.
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_0_DST_1_SRC_DROP 0x00010
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY 0x00020
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_2_SRC_HEXADECIMAL 0x00040
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_4_SRC_BACKSLASH_X 0x00080
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_3_DST_4_SRC_BASE_64_STD 0x00100
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_3_DST_4_SRC_BASE_64_URL 0x00200
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_4_DST_5_SRC_ASCII_85 0x00400

// --------

// VBD (value_base_detail) bits for WUFFS_BASE__TOKEN__VBC__LITERAL tokens.
// Each value is a distinct single bit.
#define WUFFS_BASE__TOKEN__VBD__LITERAL__UNDEFINED 0x00001
#define WUFFS_BASE__TOKEN__VBD__LITERAL__NULL 0x00002
#define WUFFS_BASE__TOKEN__VBD__LITERAL__FALSE 0x00004
#define WUFFS_BASE__TOKEN__VBD__LITERAL__TRUE 0x00008

// --------

// VBD (value_base_detail) bits for WUFFS_BASE__TOKEN__VBC__NUMBER tokens.
//
// For a source string of "123" or "0x9A", it is valid for a tokenizer to
// return any one of:
//  - WUFFS_BASE__TOKEN__VBD__NUMBER__FLOATING_POINT.
//  - WUFFS_BASE__TOKEN__VBD__NUMBER__INTEGER_SIGNED.
//  - WUFFS_BASE__TOKEN__VBD__NUMBER__INTEGER_UNSIGNED.
//
// For a source string of "+123" or "-0x9A", only the first two are valid.
//
// For a source string of "123.", only the first one is valid.
#define WUFFS_BASE__TOKEN__VBD__NUMBER__FLOATING_POINT 0x00001
#define WUFFS_BASE__TOKEN__VBD__NUMBER__INTEGER_SIGNED 0x00002
#define WUFFS_BASE__TOKEN__VBD__NUMBER__INTEGER_UNSIGNED 0x00004

// --------

Nigel Tao36857982020-02-12 11:33:13 +1100183static inline uint64_t //
184wuffs_base__token__value(const wuffs_base__token* t) {
185 return (t->repr >> WUFFS_BASE__TOKEN__VALUE__SHIFT) &
186 WUFFS_BASE__TOKEN__VALUE__MASK;
187}
188
189static inline uint64_t //
190wuffs_base__token__value_major(const wuffs_base__token* t) {
191 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT) &
192 WUFFS_BASE__TOKEN__VALUE_MAJOR__MASK;
193}
194
195static inline uint64_t //
196wuffs_base__token__value_minor(const wuffs_base__token* t) {
197 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) &
198 WUFFS_BASE__TOKEN__VALUE_MINOR__MASK;
199}
200
201static inline uint64_t //
202wuffs_base__token__value_base_category(const wuffs_base__token* t) {
203 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT) &
204 WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__MASK;
205}
206
207static inline uint64_t //
208wuffs_base__token__value_base_detail(const wuffs_base__token* t) {
209 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT) &
210 WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__MASK;
211}
212
Nigel Taod1c928a2020-02-28 12:43:53 +1100213static inline bool //
214wuffs_base__token__link_prev(const wuffs_base__token* t) {
215 return t->repr & WUFFS_BASE__TOKEN__LINK_PREV;
216}
217
218static inline bool //
219wuffs_base__token__link_next(const wuffs_base__token* t) {
220 return t->repr & WUFFS_BASE__TOKEN__LINK_NEXT;
221}
222
Nigel Tao36857982020-02-12 11:33:13 +1100223static inline uint64_t //
224wuffs_base__token__length(const wuffs_base__token* t) {
225 return (t->repr >> WUFFS_BASE__TOKEN__LENGTH__SHIFT) &
226 WUFFS_BASE__TOKEN__LENGTH__MASK;
227}
228
229#ifdef __cplusplus
230
231inline uint64_t //
232wuffs_base__token::value() const {
233 return wuffs_base__token__value(this);
234}
235
236inline uint64_t //
237wuffs_base__token::value_major() const {
238 return wuffs_base__token__value_major(this);
239}
240
241inline uint64_t //
242wuffs_base__token::value_minor() const {
243 return wuffs_base__token__value_minor(this);
244}
245
246inline uint64_t //
247wuffs_base__token::value_base_category() const {
248 return wuffs_base__token__value_base_category(this);
249}
250
251inline uint64_t //
252wuffs_base__token::value_base_detail() const {
253 return wuffs_base__token__value_base_detail(this);
254}
255
Nigel Taod1c928a2020-02-28 12:43:53 +1100256inline bool //
257wuffs_base__token::link_prev() const {
258 return wuffs_base__token__link_prev(this);
259}
260
261inline bool //
262wuffs_base__token::link_next() const {
263 return wuffs_base__token__link_next(this);
264}
265
Nigel Tao36857982020-02-12 11:33:13 +1100266inline uint64_t //
267wuffs_base__token::length() const {
268 return wuffs_base__token__length(this);
269}
270
271#endif // __cplusplus
272
// --------

Nigel Tao737e31f2020-02-11 11:23:17 +1100275typedef WUFFS_BASE__SLICE(wuffs_base__token) wuffs_base__slice_token;
276
277static inline wuffs_base__slice_token //
278wuffs_base__make_slice_token(wuffs_base__token* ptr, size_t len) {
279 wuffs_base__slice_token ret;
280 ret.ptr = ptr;
281 ret.len = len;
282 return ret;
283}
284
// --------

// wuffs_base__token_buffer_meta is the metadata for a
// wuffs_base__token_buffer's data.
typedef struct {
  size_t wi;     // Write index. Invariant: wi <= len.
  size_t ri;     // Read index. Invariant: ri <= wi.
  uint64_t pos;  // Position of the buffer start relative to the stream start.
  bool closed;   // No further writes are expected.
} wuffs_base__token_buffer_meta;

296// wuffs_base__token_buffer is a 1-dimensional buffer (a pointer and length)
297// plus additional metadata.
298//
299// A value with all fields zero is a valid, empty buffer.
300typedef struct {
301 wuffs_base__slice_token data;
302 wuffs_base__token_buffer_meta meta;
303
304#ifdef __cplusplus
305 inline void compact();
306 inline uint64_t reader_available() const;
307 inline uint64_t reader_token_position() const;
308 inline uint64_t writer_available() const;
309 inline uint64_t writer_token_position() const;
310#endif // __cplusplus
311
312} wuffs_base__token_buffer;
313
314static inline wuffs_base__token_buffer //
315wuffs_base__make_token_buffer(wuffs_base__slice_token data,
316 wuffs_base__token_buffer_meta meta) {
317 wuffs_base__token_buffer ret;
318 ret.data = data;
319 ret.meta = meta;
320 return ret;
321}
322
323static inline wuffs_base__token_buffer_meta //
324wuffs_base__make_token_buffer_meta(size_t wi,
325 size_t ri,
326 uint64_t pos,
327 bool closed) {
328 wuffs_base__token_buffer_meta ret;
329 ret.wi = wi;
330 ret.ri = ri;
331 ret.pos = pos;
332 ret.closed = closed;
333 return ret;
334}
335
336static inline wuffs_base__token_buffer //
337wuffs_base__empty_token_buffer() {
338 wuffs_base__token_buffer ret;
339 ret.data.ptr = NULL;
340 ret.data.len = 0;
341 ret.meta.wi = 0;
342 ret.meta.ri = 0;
343 ret.meta.pos = 0;
344 ret.meta.closed = false;
345 return ret;
346}
347
348static inline wuffs_base__token_buffer_meta //
349wuffs_base__empty_token_buffer_meta() {
350 wuffs_base__token_buffer_meta ret;
351 ret.wi = 0;
352 ret.ri = 0;
353 ret.pos = 0;
354 ret.closed = false;
355 return ret;
356}
357
358// wuffs_base__token_buffer__compact moves any written but unread tokens to the
359// start of the buffer.
360static inline void //
361wuffs_base__token_buffer__compact(wuffs_base__token_buffer* buf) {
362 if (!buf || (buf->meta.ri == 0)) {
363 return;
364 }
365 buf->meta.pos = wuffs_base__u64__sat_add(buf->meta.pos, buf->meta.ri);
366 size_t n = buf->meta.wi - buf->meta.ri;
367 if (n != 0) {
368 memmove(buf->data.ptr, buf->data.ptr + buf->meta.ri,
369 n * sizeof(wuffs_base__token));
370 }
371 buf->meta.wi = n;
372 buf->meta.ri = 0;
373}
374
375static inline uint64_t //
376wuffs_base__token_buffer__reader_available(
377 const wuffs_base__token_buffer* buf) {
378 return buf ? buf->meta.wi - buf->meta.ri : 0;
379}
380
381static inline uint64_t //
382wuffs_base__token_buffer__reader_token_position(
383 const wuffs_base__token_buffer* buf) {
384 return buf ? wuffs_base__u64__sat_add(buf->meta.pos, buf->meta.ri) : 0;
385}
386
387static inline uint64_t //
388wuffs_base__token_buffer__writer_available(
389 const wuffs_base__token_buffer* buf) {
390 return buf ? buf->data.len - buf->meta.wi : 0;
391}
392
393static inline uint64_t //
394wuffs_base__token_buffer__writer_token_position(
395 const wuffs_base__token_buffer* buf) {
396 return buf ? wuffs_base__u64__sat_add(buf->meta.pos, buf->meta.wi) : 0;
397}
398
399#ifdef __cplusplus
400
401inline void //
402wuffs_base__token_buffer::compact() {
403 wuffs_base__token_buffer__compact(this);
404}
405
406inline uint64_t //
407wuffs_base__token_buffer::reader_available() const {
408 return wuffs_base__token_buffer__reader_available(this);
409}
410
411inline uint64_t //
412wuffs_base__token_buffer::reader_token_position() const {
413 return wuffs_base__token_buffer__reader_token_position(this);
414}
415
416inline uint64_t //
417wuffs_base__token_buffer::writer_available() const {
418 return wuffs_base__token_buffer__writer_available(this);
419}
420
421inline uint64_t //
422wuffs_base__token_buffer::writer_token_position() const {
423 return wuffs_base__token_buffer__writer_token_position(this);
424}
425
426#endif // __cplusplus