// After editing this file, run "go generate" in the parent directory.

// Copyright 2020 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// ---------------- Tokens
18
// wuffs_base__token is a single 64-bit packed value: a 46-bit value, two link
// bits and a 16-bit length.
typedef struct {
  // The repr's 64 bits are divided as:
  //
  // +-----+-------------+-------+-------------+-----+-----+-----------+
  // |  1  |      21     |   3   |      21     |  1  |  1  |     16    |
  // +-----+-------------+-------+-------------+-----+-----+-----------+
  // [..................value..................]  LP    LN     length
  // [..0..|.value_major.|.....value_minor.....]
  // [..0..|.........VBC.........|.....VBD.....]
  //
  // The broad divisions are:
  //  - Bits 63 .. 18 (46 bits) is the value.
  //  - Bits 17 .. 16 ( 2 bits) is LP and LN (link_prev and link_next).
  //  - Bits 15 ..  0 (16 bits) is the length.
  //
  // The value bits can be sub-divided in multiple ways:
  //  - Bits 63 .. 63 ( 1 bit ) is reserved (a zero bit).
  //  - Bits 62 .. 42 (21 bits) is the value_major.
  //  - Bits 41 .. 18 (24 bits) is the value_minor.
  //  - Bits 62 .. 39 (24 bits) is the VBC (value_base_category).
  //  - Bits 38 .. 18 (21 bits) is the VBD (value_base_detail).
  //
  // The value_major is a 21-bit [Base38](doc/note/base38-and-fourcc.md) value.
  // If all of its bits are zero (special cased for Wuffs' built-in "base"
  // package) then the value_minor is further sub-divided:
  //  - Bits 41 .. 39 ( 3 bits) is the VBC (value_base_category).
  //  - Bits 38 .. 18 (21 bits) is the VBD (value_base_detail).
  //
  // The high 46 bits (bits 63 .. 18) only have VBC and VBD semantics when the
  // high 22 bits (the reserved zero bit followed by the 21-bit value_major)
  // are all zero. An equivalent test is that the high 25 bits (the reserved
  // bit followed by the notional 24-bit VBC) has a numerical value less than
  // 8.
  //
  // At 21 bits, the VBD can hold every valid Unicode code point.
  //
  // If value_major is non-zero then value_minor has whatever arbitrary meaning
  // the tokenizer's package assigns to it.
  //
  // Multiple consecutive tokens can form a larger conceptual unit. For
  // example, an "abc\tz" string is a single higher level concept but at the
  // lower level, it could consist of multiple tokens: the quotes '"', the
  // ASCII texts "abc" and "z" and the backslash-escaped tab '\t'. The LP and
  // LN (link_prev and link_next) bits denote tokens that are part of a
  // multi-token chain:
  //  - LP means that this token is not the first (there is a previous token).
  //  - LN means that this token is not the last (there is a next token).
  //
  // In particular, a stand-alone token will have both link bits set to zero.
  uint64_t repr;

#ifdef __cplusplus
  // C++ convenience methods. Each has a same-named C function
  // (wuffs_base__token__foo) declared after this struct.
  inline uint64_t value() const;
  inline uint64_t value_major() const;
  inline uint64_t value_minor() const;
  inline uint64_t value_base_category() const;
  inline uint64_t value_base_detail() const;
  inline bool link_prev() const;
  inline bool link_next() const;
  inline uint64_t length() const;
#endif  // __cplusplus

} wuffs_base__token;
80
81static inline wuffs_base__token //
82wuffs_base__make_token(uint64_t repr) {
83 wuffs_base__token ret;
84 ret.repr = repr;
85 return ret;
86}
87
Nigel Tao9d4eeb72020-02-26 11:36:30 +110088 // --------
89
// Each FOO__MASK is applied after right-shifting the token's repr by the
// matching FOO__SHIFT. The VALUE, VALUE_MAJOR and VALUE_BASE_CATEGORY masks
// are one bit wider than their nominal field widths because they also cover
// the reserved (zero) bit 63.
#define WUFFS_BASE__TOKEN__VALUE__MASK 0x3FFFFFFFFFFF
#define WUFFS_BASE__TOKEN__VALUE_MAJOR__MASK 0x3FFFFF
#define WUFFS_BASE__TOKEN__VALUE_MINOR__MASK 0xFFFFFF
#define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__MASK 0x1FFFFFF
#define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__MASK 0x1FFFFF
#define WUFFS_BASE__TOKEN__LINK__MASK 0x3
#define WUFFS_BASE__TOKEN__LENGTH__MASK 0xFFFF

// Bit position (counting from the least significant bit) at which each field
// starts within the token's repr.
#define WUFFS_BASE__TOKEN__VALUE__SHIFT 18
#define WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT 42
#define WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT 18
#define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT 39
#define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT 18
#define WUFFS_BASE__TOKEN__LINK__SHIFT 16
#define WUFFS_BASE__TOKEN__LENGTH__SHIFT 0

// The two link bits, already in position within the repr (bits 17 and 16).
#define WUFFS_BASE__TOKEN__LINK_PREV 0x20000
#define WUFFS_BASE__TOKEN__LINK_NEXT 0x10000
108
Nigel Tao9d4eeb72020-02-26 11:36:30 +1100109 // --------
110
// Value base categories (VBC): the values that the value_base_category field
// takes for tokens whose value_major is zero.
#define WUFFS_BASE__TOKEN__VBC__FILLER 0
#define WUFFS_BASE__TOKEN__VBC__STRING 1
#define WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT 2
#define WUFFS_BASE__TOKEN__VBC__NUMBER 3
#define WUFFS_BASE__TOKEN__VBC__STRUCTURE 4
Nigel Taoa9d14882020-02-25 12:12:31 +1100116
Nigel Taod1c928a2020-02-28 12:43:53 +1100117 // --------
Nigel Tao9d4eeb72020-02-26 11:36:30 +1100118
// Value base detail (VBD) bits for WUFFS_BASE__TOKEN__VBC__FILLER tokens.
#define WUFFS_BASE__TOKEN__VBD__FILLER__COMMENT_LINE 0x00001
#define WUFFS_BASE__TOKEN__VBD__FILLER__COMMENT_BLOCK 0x00002
Nigel Taoa9d14882020-02-25 12:12:31 +1100121
Nigel Tao9d4eeb72020-02-26 11:36:30 +1100122// --------
123
// "DEFINITELY_FOO" means that the destination bytes (and also the source
// bytes, for 1_DST_1_SRC_COPY) are in the FOO format. Definitely means that
// the lack of the bit is conservative: it is valid for all-ASCII strings to
// have neither the DEFINITELY_UTF_8 nor the DEFINITELY_ASCII bit set.
#define WUFFS_BASE__TOKEN__VBD__STRING__DEFINITELY_UTF_8 0x00001
#define WUFFS_BASE__TOKEN__VBD__STRING__DEFINITELY_ASCII 0x00002

// "CONVERT_D_DST_S_SRC" means that multiples of S source bytes (possibly
// padded) produces multiples of D destination bytes. For example,
// CONVERT_1_DST_4_SRC_BACKSLASH_X means a source like "\\x23\\x67\\xAB", where
// 12 src bytes encode 3 dst bytes.
//
// When src is the empty string, multiple conversion algorithms are applicable
// (so these bits are not necessarily mutually exclusive), all producing the
// same empty dst string.
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_0_DST_1_SRC_DROP 0x00010
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY 0x00020
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_2_SRC_HEXADECIMAL 0x00040
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_4_SRC_BACKSLASH_X 0x00080
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_3_DST_4_SRC_BASE_64_STD 0x00100
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_3_DST_4_SRC_BASE_64_URL 0x00200
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_4_DST_5_SRC_ASCII_85 0x00400
Nigel Tao9d4eeb72020-02-26 11:36:30 +1100146
147 // --------
Nigel Taoa9d14882020-02-25 12:12:31 +1100148
// Value base detail (VBD) bits for WUFFS_BASE__TOKEN__VBC__NUMBER tokens.
//
// Each specific LITERAL__FOO value includes the generic LITERAL bit (0x00001)
// plus one distinguishing higher bit.
#define WUFFS_BASE__TOKEN__VBD__NUMBER__LITERAL 0x00001
#define WUFFS_BASE__TOKEN__VBD__NUMBER__LITERAL__UNDEFINED 0x00101
#define WUFFS_BASE__TOKEN__VBD__NUMBER__LITERAL__NULL 0x00201
#define WUFFS_BASE__TOKEN__VBD__NUMBER__LITERAL__FALSE 0x00401
#define WUFFS_BASE__TOKEN__VBD__NUMBER__LITERAL__TRUE 0x00801

// For a source string of "123" or "0x9A", it is valid for a tokenizer to
// return any one of:
//  - WUFFS_BASE__TOKEN__VBD__NUMBER__FLOATING_POINT.
//  - WUFFS_BASE__TOKEN__VBD__NUMBER__INTEGER_SIGNED.
//  - WUFFS_BASE__TOKEN__VBD__NUMBER__INTEGER_UNSIGNED.
//
// For a source string of "+123" or "-0x9A", only the first two are valid.
//
// For a source string of "123.", only the first one is valid.
#define WUFFS_BASE__TOKEN__VBD__NUMBER__FLOATING_POINT 0x00002
#define WUFFS_BASE__TOKEN__VBD__NUMBER__INTEGER_SIGNED 0x00004
#define WUFFS_BASE__TOKEN__VBD__NUMBER__INTEGER_UNSIGNED 0x00008
167
Nigel Tao9d4eeb72020-02-26 11:36:30 +1100168 // --------
169
// Value base detail (VBD) bits for WUFFS_BASE__TOKEN__VBC__STRUCTURE tokens.
//
// NOTE(review): presumably PUSH/POP mark entering/leaving a container, with
// FROM_FOO naming the enclosing container kind before the token and TO_FOO the
// kind after it - confirm against the tokenizer documentation.
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__PUSH 0x00001
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__POP 0x00002
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_NONE 0x00010
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_LIST 0x00020
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_DICT 0x00040
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_NONE 0x01000
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST 0x02000
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_DICT 0x04000
178
179// --------
Nigel Taoa9d14882020-02-25 12:12:31 +1100180
Nigel Tao36857982020-02-12 11:33:13 +1100181static inline uint64_t //
182wuffs_base__token__value(const wuffs_base__token* t) {
183 return (t->repr >> WUFFS_BASE__TOKEN__VALUE__SHIFT) &
184 WUFFS_BASE__TOKEN__VALUE__MASK;
185}
186
187static inline uint64_t //
188wuffs_base__token__value_major(const wuffs_base__token* t) {
189 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT) &
190 WUFFS_BASE__TOKEN__VALUE_MAJOR__MASK;
191}
192
193static inline uint64_t //
194wuffs_base__token__value_minor(const wuffs_base__token* t) {
195 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) &
196 WUFFS_BASE__TOKEN__VALUE_MINOR__MASK;
197}
198
199static inline uint64_t //
200wuffs_base__token__value_base_category(const wuffs_base__token* t) {
201 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT) &
202 WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__MASK;
203}
204
205static inline uint64_t //
206wuffs_base__token__value_base_detail(const wuffs_base__token* t) {
207 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT) &
208 WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__MASK;
209}
210
Nigel Taod1c928a2020-02-28 12:43:53 +1100211static inline bool //
212wuffs_base__token__link_prev(const wuffs_base__token* t) {
213 return t->repr & WUFFS_BASE__TOKEN__LINK_PREV;
214}
215
216static inline bool //
217wuffs_base__token__link_next(const wuffs_base__token* t) {
218 return t->repr & WUFFS_BASE__TOKEN__LINK_NEXT;
219}
220
Nigel Tao36857982020-02-12 11:33:13 +1100221static inline uint64_t //
222wuffs_base__token__length(const wuffs_base__token* t) {
223 return (t->repr >> WUFFS_BASE__TOKEN__LENGTH__SHIFT) &
224 WUFFS_BASE__TOKEN__LENGTH__MASK;
225}
226
227#ifdef __cplusplus
228
229inline uint64_t //
230wuffs_base__token::value() const {
231 return wuffs_base__token__value(this);
232}
233
234inline uint64_t //
235wuffs_base__token::value_major() const {
236 return wuffs_base__token__value_major(this);
237}
238
239inline uint64_t //
240wuffs_base__token::value_minor() const {
241 return wuffs_base__token__value_minor(this);
242}
243
244inline uint64_t //
245wuffs_base__token::value_base_category() const {
246 return wuffs_base__token__value_base_category(this);
247}
248
249inline uint64_t //
250wuffs_base__token::value_base_detail() const {
251 return wuffs_base__token__value_base_detail(this);
252}
253
Nigel Taod1c928a2020-02-28 12:43:53 +1100254inline bool //
255wuffs_base__token::link_prev() const {
256 return wuffs_base__token__link_prev(this);
257}
258
259inline bool //
260wuffs_base__token::link_next() const {
261 return wuffs_base__token__link_next(this);
262}
263
Nigel Tao36857982020-02-12 11:33:13 +1100264inline uint64_t //
265wuffs_base__token::length() const {
266 return wuffs_base__token__length(this);
267}
268
269#endif // __cplusplus
270
271// --------
272
Nigel Tao737e31f2020-02-11 11:23:17 +1100273typedef WUFFS_BASE__SLICE(wuffs_base__token) wuffs_base__slice_token;
274
275static inline wuffs_base__slice_token //
276wuffs_base__make_slice_token(wuffs_base__token* ptr, size_t len) {
277 wuffs_base__slice_token ret;
278 ret.ptr = ptr;
279 ret.len = len;
280 return ret;
281}
282
Nigel Tao36857982020-02-12 11:33:13 +1100283// --------
284
// wuffs_base__token_buffer_meta is the metadata for a
// wuffs_base__token_buffer's data.
typedef struct {
  size_t wi;     // Write index. Invariant: wi <= len.
  size_t ri;     // Read index. Invariant: ri <= wi.
  uint64_t pos;  // Position of the buffer start relative to the stream start.
  bool closed;   // No further writes are expected.
} wuffs_base__token_buffer_meta;
293
294// wuffs_base__token_buffer is a 1-dimensional buffer (a pointer and length)
295// plus additional metadata.
296//
297// A value with all fields zero is a valid, empty buffer.
298typedef struct {
299 wuffs_base__slice_token data;
300 wuffs_base__token_buffer_meta meta;
301
302#ifdef __cplusplus
303 inline void compact();
304 inline uint64_t reader_available() const;
305 inline uint64_t reader_token_position() const;
306 inline uint64_t writer_available() const;
307 inline uint64_t writer_token_position() const;
308#endif // __cplusplus
309
310} wuffs_base__token_buffer;
311
312static inline wuffs_base__token_buffer //
313wuffs_base__make_token_buffer(wuffs_base__slice_token data,
314 wuffs_base__token_buffer_meta meta) {
315 wuffs_base__token_buffer ret;
316 ret.data = data;
317 ret.meta = meta;
318 return ret;
319}
320
321static inline wuffs_base__token_buffer_meta //
322wuffs_base__make_token_buffer_meta(size_t wi,
323 size_t ri,
324 uint64_t pos,
325 bool closed) {
326 wuffs_base__token_buffer_meta ret;
327 ret.wi = wi;
328 ret.ri = ri;
329 ret.pos = pos;
330 ret.closed = closed;
331 return ret;
332}
333
334static inline wuffs_base__token_buffer //
335wuffs_base__empty_token_buffer() {
336 wuffs_base__token_buffer ret;
337 ret.data.ptr = NULL;
338 ret.data.len = 0;
339 ret.meta.wi = 0;
340 ret.meta.ri = 0;
341 ret.meta.pos = 0;
342 ret.meta.closed = false;
343 return ret;
344}
345
346static inline wuffs_base__token_buffer_meta //
347wuffs_base__empty_token_buffer_meta() {
348 wuffs_base__token_buffer_meta ret;
349 ret.wi = 0;
350 ret.ri = 0;
351 ret.pos = 0;
352 ret.closed = false;
353 return ret;
354}
355
356// wuffs_base__token_buffer__compact moves any written but unread tokens to the
357// start of the buffer.
358static inline void //
359wuffs_base__token_buffer__compact(wuffs_base__token_buffer* buf) {
360 if (!buf || (buf->meta.ri == 0)) {
361 return;
362 }
363 buf->meta.pos = wuffs_base__u64__sat_add(buf->meta.pos, buf->meta.ri);
364 size_t n = buf->meta.wi - buf->meta.ri;
365 if (n != 0) {
366 memmove(buf->data.ptr, buf->data.ptr + buf->meta.ri,
367 n * sizeof(wuffs_base__token));
368 }
369 buf->meta.wi = n;
370 buf->meta.ri = 0;
371}
372
373static inline uint64_t //
374wuffs_base__token_buffer__reader_available(
375 const wuffs_base__token_buffer* buf) {
376 return buf ? buf->meta.wi - buf->meta.ri : 0;
377}
378
379static inline uint64_t //
380wuffs_base__token_buffer__reader_token_position(
381 const wuffs_base__token_buffer* buf) {
382 return buf ? wuffs_base__u64__sat_add(buf->meta.pos, buf->meta.ri) : 0;
383}
384
385static inline uint64_t //
386wuffs_base__token_buffer__writer_available(
387 const wuffs_base__token_buffer* buf) {
388 return buf ? buf->data.len - buf->meta.wi : 0;
389}
390
391static inline uint64_t //
392wuffs_base__token_buffer__writer_token_position(
393 const wuffs_base__token_buffer* buf) {
394 return buf ? wuffs_base__u64__sat_add(buf->meta.pos, buf->meta.wi) : 0;
395}
396
397#ifdef __cplusplus
398
399inline void //
400wuffs_base__token_buffer::compact() {
401 wuffs_base__token_buffer__compact(this);
402}
403
404inline uint64_t //
405wuffs_base__token_buffer::reader_available() const {
406 return wuffs_base__token_buffer__reader_available(this);
407}
408
409inline uint64_t //
410wuffs_base__token_buffer::reader_token_position() const {
411 return wuffs_base__token_buffer__reader_token_position(this);
412}
413
414inline uint64_t //
415wuffs_base__token_buffer::writer_available() const {
416 return wuffs_base__token_buffer__writer_available(this);
417}
418
419inline uint64_t //
420wuffs_base__token_buffer::writer_token_position() const {
421 return wuffs_base__token_buffer__writer_token_position(this);
422}
423
424#endif // __cplusplus