blob: 31f02e7301b144758bff961bc2fcf5a99cc6889d [file] [log] [blame]
// After editing this file, run "go generate" in the parent directory.

// Copyright 2020 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// ---------------- Tokens

// wuffs_base__token is a single 64-bit word (repr) that packs a token's
// value, its two link bits and its length.
typedef struct {
  // The repr's 64 bits are divided as:
  //
  //     +-----+-------------+-------+-------------+-----+-----+-----------+
  //     |  1  |      21     |   3   |      21     |  1  |  1  |     16    |
  //     +-----+-------------+-------+-------------+-----+-----+-----------+
  //     [..................value..................]  LP    LN    length
  //     [..0..|.value_major.|.....value_minor.....]
  //     [..0..|.........VBC.........|.....VBD.....]
  //
  // The broad divisions are:
  //  - Bits 63 .. 18 (46 bits) is the value.
  //  - Bits 17 .. 16 ( 2 bits) is LP and LN (link_prev and link_next).
  //  - Bits 15 ..  0 (16 bits) is the length.
  //
  // The value bits can be sub-divided in multiple ways:
  //  - Bits 63 .. 63 ( 1 bit)  is reserved (a zero bit).
  //  - Bits 62 .. 42 (21 bits) is the value_major.
  //  - Bits 41 .. 18 (24 bits) is the value_minor.
  //  - Bits 62 .. 39 (24 bits) is the VBC (value_base_category).
  //  - Bits 38 .. 18 (21 bits) is the VBD (value_base_detail).
  //
  // The value_major is a 21-bit [Base38](doc/note/base38-and-fourcc.md) value.
  // If all of its bits are zero (special cased for Wuffs' built-in "base"
  // package) then the value_minor is further sub-divided:
  //  - Bits 41 .. 39 ( 3 bits) is the VBC (value_base_category).
  //  - Bits 38 .. 18 (21 bits) is the VBD (value_base_detail).
  //
  // The high 46 bits (bits 63 .. 18) only have VBC and VBD semantics when the
  // high 22 bits (the reserved bit and the value_major) are all zero. An
  // equivalent test is that the high 25 bits (the reserved bit and the
  // notional VBC) have a numerical value less than 8.
  //
  // At 21 bits, the VBD can hold every valid Unicode code point.
  //
  // If value_major is non-zero then value_minor has whatever arbitrary meaning
  // the tokenizer's package assigns to it.
  //
  // Multiple consecutive tokens can form a larger conceptual unit. For
  // example, an "abc\tz" string is a single higher level concept but at the
  // lower level, it could consist of multiple tokens: the quotes '"', the
  // ASCII texts "abc" and "z" and the backslash-escaped tab '\t'. The LP and
  // LN (link_prev and link_next) bits denote tokens that are part of a
  // multi-token chain:
  //  - LP means that this token is not the first (there is a previous token).
  //  - LN means that this token is not the last (there is a next token).
  //
  // In particular, a stand-alone token will have both link bits set to zero.
  uint64_t repr;

#ifdef __cplusplus
  // C++ convenience accessors; each is declared here and defined further down
  // the file.
  inline uint64_t value() const;
  inline uint64_t value_major() const;
  inline uint64_t value_minor() const;
  inline uint64_t value_base_category() const;
  inline uint64_t value_base_detail() const;
  inline bool link_prev() const;
  inline bool link_next() const;
  inline uint64_t length() const;
#endif  // __cplusplus

} wuffs_base__token;

81static inline wuffs_base__token //
82wuffs_base__make_token(uint64_t repr) {
83 wuffs_base__token ret;
84 ret.repr = repr;
85 return ret;
86}
87
// --------

// The largest value that fits in a token's 16-bit length field.
#define WUFFS_BASE__TOKEN__LENGTH__MAX_INCL 0xFFFF

// Each field FOO of a token's repr is read as:
//   (repr >> WUFFS_BASE__TOKEN__FOO__SHIFT) & WUFFS_BASE__TOKEN__FOO__MASK
// so every mask is expressed relative to the already-shifted value.
//
// The VALUE (46 bits), VALUE_MAJOR (22 bits) and VALUE_BASE_CATEGORY (25
// bits) masks are each one bit wider than the 21-bit value_major or 24-bit
// VBC they are named for: they also cover bit 63, the reserved zero bit.
#define WUFFS_BASE__TOKEN__VALUE__MASK 0x3FFFFFFFFFFF
#define WUFFS_BASE__TOKEN__VALUE_MAJOR__MASK 0x3FFFFF
#define WUFFS_BASE__TOKEN__VALUE_MINOR__MASK 0xFFFFFF
#define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__MASK 0x1FFFFFF
#define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__MASK 0x1FFFFF
#define WUFFS_BASE__TOKEN__LINK__MASK 0x3
#define WUFFS_BASE__TOKEN__LENGTH__MASK 0xFFFF

// The bit position, within repr, of each field's least significant bit.
#define WUFFS_BASE__TOKEN__VALUE__SHIFT 18
#define WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT 42
#define WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT 18
#define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT 39
#define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT 18
#define WUFFS_BASE__TOKEN__LINK__SHIFT 16
#define WUFFS_BASE__TOKEN__LENGTH__SHIFT 0

// The link bits, expressed in place (not shifted down): LINK_PREV is bit 17
// and LINK_NEXT is bit 16 of repr.
#define WUFFS_BASE__TOKEN__LINK_PREV 0x20000
#define WUFFS_BASE__TOKEN__LINK_NEXT 0x10000

// --------

// VBC (value_base_category) values. All are less than 8, so each fits in the
// 3-bit VBC that applies when value_major is zero.
#define WUFFS_BASE__TOKEN__VBC__FILLER 0
#define WUFFS_BASE__TOKEN__VBC__STRUCTURE 1
#define WUFFS_BASE__TOKEN__VBC__STRING 2
#define WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT 3
#define WUFFS_BASE__TOKEN__VBC__LITERAL 4
#define WUFFS_BASE__TOKEN__VBC__NUMBER 5

// --------

// VBD (value_base_detail) bits for the FILLER category.
#define WUFFS_BASE__TOKEN__VBD__FILLER__COMMENT_LINE 0x00001
#define WUFFS_BASE__TOKEN__VBD__FILLER__COMMENT_BLOCK 0x00002

// --------

// VBD (value_base_detail) bits for the STRUCTURE category. The constants fall
// into three groups of distinct bits: PUSH / POP, FROM_xxx and TO_xxx.
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__PUSH 0x00001
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__POP 0x00002
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_NONE 0x00010
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_LIST 0x00020
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_DICT 0x00040
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_NONE 0x01000
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST 0x02000
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_DICT 0x04000

// --------

// "DEFINITELY_FOO" means that the destination bytes (and also the source
// bytes, for 1_DST_1_SRC_COPY) are in the FOO format. Definitely means that
// the lack of the bit is conservative: it is valid for all-ASCII strings to
// have neither the DEFINITELY_UTF_8 nor the DEFINITELY_ASCII bit set.
#define WUFFS_BASE__TOKEN__VBD__STRING__DEFINITELY_UTF_8 0x00001
#define WUFFS_BASE__TOKEN__VBD__STRING__DEFINITELY_ASCII 0x00002

// "CONVERT_D_DST_S_SRC" means that multiples of S source bytes (possibly
// padded) produces multiples of D destination bytes. For example,
// CONVERT_1_DST_4_SRC_BACKSLASH_X means a source like "\\x23\\x67\\xAB", where
// 12 src bytes encode 3 dst bytes.
//
// When src is the empty string, multiple conversion algorithms are applicable
// (so these bits are not necessarily mutually exclusive), all producing the
// same empty dst string.
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_0_DST_1_SRC_DROP 0x00010
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY 0x00020
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_2_SRC_HEXADECIMAL 0x00040
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_4_SRC_BACKSLASH_X 0x00080
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_3_DST_4_SRC_BASE_64_STD 0x00100
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_3_DST_4_SRC_BASE_64_URL 0x00200
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_4_DST_5_SRC_ASCII_85 0x00400

// --------

// VBD (value_base_detail) bits for the LITERAL category.
#define WUFFS_BASE__TOKEN__VBD__LITERAL__UNDEFINED 0x00001
#define WUFFS_BASE__TOKEN__VBD__LITERAL__NULL 0x00002
#define WUFFS_BASE__TOKEN__VBD__LITERAL__FALSE 0x00004
#define WUFFS_BASE__TOKEN__VBD__LITERAL__TRUE 0x00008

// --------

// For a source string of "123" or "0x9A", it is valid for a tokenizer to
// return any one of:
//  - WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_FLOATING_POINT.
//  - WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_INTEGER_SIGNED.
//  - WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_INTEGER_UNSIGNED.
//
// For a source string of "+123" or "-0x9A", only the first two are valid.
//
// For a source string of "123.", only the first one is valid.
#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_FLOATING_POINT 0x00001
#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_INTEGER_SIGNED 0x00002
#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_INTEGER_UNSIGNED 0x00004

// The number 300 might be represented as "\x01\x2C", "\x2C\x01\x00\x00" or
// "300", which are big-endian, little-endian or text. For binary formats, the
// token length discriminates e.g. u16 little-endian vs u32 little-endian.
#define WUFFS_BASE__TOKEN__VBD__NUMBER__FORMAT_BINARY_BIG_ENDIAN 0x00100
#define WUFFS_BASE__TOKEN__VBD__NUMBER__FORMAT_BINARY_LITTLE_ENDIAN 0x00200
#define WUFFS_BASE__TOKEN__VBD__NUMBER__FORMAT_TEXT 0x00400

// --------

Nigel Tao36857982020-02-12 11:33:13 +1100192static inline uint64_t //
193wuffs_base__token__value(const wuffs_base__token* t) {
194 return (t->repr >> WUFFS_BASE__TOKEN__VALUE__SHIFT) &
195 WUFFS_BASE__TOKEN__VALUE__MASK;
196}
197
198static inline uint64_t //
199wuffs_base__token__value_major(const wuffs_base__token* t) {
200 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT) &
201 WUFFS_BASE__TOKEN__VALUE_MAJOR__MASK;
202}
203
204static inline uint64_t //
205wuffs_base__token__value_minor(const wuffs_base__token* t) {
206 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) &
207 WUFFS_BASE__TOKEN__VALUE_MINOR__MASK;
208}
209
210static inline uint64_t //
211wuffs_base__token__value_base_category(const wuffs_base__token* t) {
212 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT) &
213 WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__MASK;
214}
215
216static inline uint64_t //
217wuffs_base__token__value_base_detail(const wuffs_base__token* t) {
218 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT) &
219 WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__MASK;
220}
221
Nigel Taod1c928a2020-02-28 12:43:53 +1100222static inline bool //
223wuffs_base__token__link_prev(const wuffs_base__token* t) {
224 return t->repr & WUFFS_BASE__TOKEN__LINK_PREV;
225}
226
227static inline bool //
228wuffs_base__token__link_next(const wuffs_base__token* t) {
229 return t->repr & WUFFS_BASE__TOKEN__LINK_NEXT;
230}
231
Nigel Tao36857982020-02-12 11:33:13 +1100232static inline uint64_t //
233wuffs_base__token__length(const wuffs_base__token* t) {
234 return (t->repr >> WUFFS_BASE__TOKEN__LENGTH__SHIFT) &
235 WUFFS_BASE__TOKEN__LENGTH__MASK;
236}
237
#ifdef __cplusplus

// C++ member functions of wuffs_base__token. Each one forwards to the C
// function wuffs_base__token__xxx, passing this as the token pointer.

inline uint64_t  //
wuffs_base__token::value() const {
  return wuffs_base__token__value(this);
}

inline uint64_t  //
wuffs_base__token::value_major() const {
  return wuffs_base__token__value_major(this);
}

inline uint64_t  //
wuffs_base__token::value_minor() const {
  return wuffs_base__token__value_minor(this);
}

inline uint64_t  //
wuffs_base__token::value_base_category() const {
  return wuffs_base__token__value_base_category(this);
}

inline uint64_t  //
wuffs_base__token::value_base_detail() const {
  return wuffs_base__token__value_base_detail(this);
}

inline bool  //
wuffs_base__token::link_prev() const {
  return wuffs_base__token__link_prev(this);
}

inline bool  //
wuffs_base__token::link_next() const {
  return wuffs_base__token__link_next(this);
}

inline uint64_t  //
wuffs_base__token::length() const {
  return wuffs_base__token__length(this);
}

#endif  // __cplusplus

// --------

Nigel Tao737e31f2020-02-11 11:23:17 +1100284typedef WUFFS_BASE__SLICE(wuffs_base__token) wuffs_base__slice_token;
285
286static inline wuffs_base__slice_token //
287wuffs_base__make_slice_token(wuffs_base__token* ptr, size_t len) {
288 wuffs_base__slice_token ret;
289 ret.ptr = ptr;
290 ret.len = len;
291 return ret;
292}
293
Nigel Tao36857982020-02-12 11:33:13 +1100294// --------
295
// wuffs_base__token_buffer_meta is the metadata for a
// wuffs_base__token_buffer's data.
typedef struct {
  size_t wi;     // Write index. Invariant: wi <= len.
  size_t ri;     // Read index. Invariant: ri <= wi.
  uint64_t pos;  // Position of the buffer start relative to the stream start.
  bool closed;   // No further writes are expected.
} wuffs_base__token_buffer_meta;

305// wuffs_base__token_buffer is a 1-dimensional buffer (a pointer and length)
306// plus additional metadata.
307//
308// A value with all fields zero is a valid, empty buffer.
309typedef struct {
310 wuffs_base__slice_token data;
311 wuffs_base__token_buffer_meta meta;
312
313#ifdef __cplusplus
314 inline void compact();
315 inline uint64_t reader_available() const;
316 inline uint64_t reader_token_position() const;
317 inline uint64_t writer_available() const;
318 inline uint64_t writer_token_position() const;
319#endif // __cplusplus
320
321} wuffs_base__token_buffer;
322
323static inline wuffs_base__token_buffer //
324wuffs_base__make_token_buffer(wuffs_base__slice_token data,
325 wuffs_base__token_buffer_meta meta) {
326 wuffs_base__token_buffer ret;
327 ret.data = data;
328 ret.meta = meta;
329 return ret;
330}
331
332static inline wuffs_base__token_buffer_meta //
333wuffs_base__make_token_buffer_meta(size_t wi,
334 size_t ri,
335 uint64_t pos,
336 bool closed) {
337 wuffs_base__token_buffer_meta ret;
338 ret.wi = wi;
339 ret.ri = ri;
340 ret.pos = pos;
341 ret.closed = closed;
342 return ret;
343}
344
345static inline wuffs_base__token_buffer //
346wuffs_base__empty_token_buffer() {
347 wuffs_base__token_buffer ret;
348 ret.data.ptr = NULL;
349 ret.data.len = 0;
350 ret.meta.wi = 0;
351 ret.meta.ri = 0;
352 ret.meta.pos = 0;
353 ret.meta.closed = false;
354 return ret;
355}
356
357static inline wuffs_base__token_buffer_meta //
358wuffs_base__empty_token_buffer_meta() {
359 wuffs_base__token_buffer_meta ret;
360 ret.wi = 0;
361 ret.ri = 0;
362 ret.pos = 0;
363 ret.closed = false;
364 return ret;
365}
366
367// wuffs_base__token_buffer__compact moves any written but unread tokens to the
368// start of the buffer.
369static inline void //
370wuffs_base__token_buffer__compact(wuffs_base__token_buffer* buf) {
371 if (!buf || (buf->meta.ri == 0)) {
372 return;
373 }
374 buf->meta.pos = wuffs_base__u64__sat_add(buf->meta.pos, buf->meta.ri);
375 size_t n = buf->meta.wi - buf->meta.ri;
376 if (n != 0) {
377 memmove(buf->data.ptr, buf->data.ptr + buf->meta.ri,
378 n * sizeof(wuffs_base__token));
379 }
380 buf->meta.wi = n;
381 buf->meta.ri = 0;
382}
383
384static inline uint64_t //
385wuffs_base__token_buffer__reader_available(
386 const wuffs_base__token_buffer* buf) {
387 return buf ? buf->meta.wi - buf->meta.ri : 0;
388}
389
390static inline uint64_t //
391wuffs_base__token_buffer__reader_token_position(
392 const wuffs_base__token_buffer* buf) {
393 return buf ? wuffs_base__u64__sat_add(buf->meta.pos, buf->meta.ri) : 0;
394}
395
396static inline uint64_t //
397wuffs_base__token_buffer__writer_available(
398 const wuffs_base__token_buffer* buf) {
399 return buf ? buf->data.len - buf->meta.wi : 0;
400}
401
402static inline uint64_t //
403wuffs_base__token_buffer__writer_token_position(
404 const wuffs_base__token_buffer* buf) {
405 return buf ? wuffs_base__u64__sat_add(buf->meta.pos, buf->meta.wi) : 0;
406}
407
#ifdef __cplusplus

// C++ member functions of wuffs_base__token_buffer. Each one forwards to the
// C function wuffs_base__token_buffer__xxx, passing this as the buffer
// pointer.

inline void  //
wuffs_base__token_buffer::compact() {
  wuffs_base__token_buffer__compact(this);
}

inline uint64_t  //
wuffs_base__token_buffer::reader_available() const {
  return wuffs_base__token_buffer__reader_available(this);
}

inline uint64_t  //
wuffs_base__token_buffer::reader_token_position() const {
  return wuffs_base__token_buffer__reader_token_position(this);
}

inline uint64_t  //
wuffs_base__token_buffer::writer_available() const {
  return wuffs_base__token_buffer__writer_available(this);
}

inline uint64_t  //
wuffs_base__token_buffer::writer_token_position() const {
  return wuffs_base__token_buffer__writer_token_position(this);
}

#endif  // __cplusplus