blob: eb05a7604ebc25ab87ed4c8508a359377f53c800 [file] [log] [blame]
Nigel Tao737e31f2020-02-11 11:23:17 +11001// After editing this file, run "go generate" in the parent directory.
2
3// Copyright 2020 The Wuffs Authors.
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// https://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17// ---------------- Tokens
18
// wuffs_base__token is an element of a byte stream's tokenization.
//
// A token is a single 64-bit value, repr. Its bit fields are read through the
// wuffs_base__token__etc accessor functions rather than directly.
//
// See https://github.com/google/wuffs/blob/master/doc/note/tokens.md
typedef struct {
  // The packed 64-bit representation of the token.
  uint64_t repr;

#ifdef __cplusplus
  // C++ convenience methods, one per wuffs_base__token__etc C accessor.
  inline int64_t value() const;
  inline int64_t value_extension() const;
  inline int64_t value_major() const;
  inline int64_t value_base_category() const;
  inline uint64_t value_minor() const;
  inline uint64_t value_base_detail() const;
  inline bool continued() const;
  inline uint64_t length() const;
#endif  // __cplusplus

} wuffs_base__token;
37
38static inline wuffs_base__token //
39wuffs_base__make_token(uint64_t repr) {
40 wuffs_base__token ret;
41 ret.repr = repr;
42 return ret;
43}
44
// --------

// The largest value that fits in a token's 16-bit length field.
#define WUFFS_BASE__TOKEN__LENGTH__MAX_INCL 0xFFFF

// Bit offsets, within a token's 64-bit repr, of the lowest bit of each field.
//
// From low to high: the length starts at bit 0, the continued flag is bit 16
// and the value starts at bit 17. The value is viewed either as a whole
// (VALUE, VALUE_EXTENSION) or as a high part plus a low part that starts at
// bit 17: VALUE_MAJOR with VALUE_MINOR, or VALUE_BASE_CATEGORY with
// VALUE_BASE_DETAIL.
#define WUFFS_BASE__TOKEN__VALUE__SHIFT 17
#define WUFFS_BASE__TOKEN__VALUE_EXTENSION__SHIFT 17
#define WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT 42
#define WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT 17
#define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT 38
#define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT 17
#define WUFFS_BASE__TOKEN__CONTINUED__SHIFT 16
#define WUFFS_BASE__TOKEN__LENGTH__SHIFT 0
57
// --------

// VBC (value base category) constants. These are consecutive small integers,
// not bit flags.
#define WUFFS_BASE__TOKEN__VBC__FILLER 0
#define WUFFS_BASE__TOKEN__VBC__STRUCTURE 1
#define WUFFS_BASE__TOKEN__VBC__STRING 2
#define WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT 3
#define WUFFS_BASE__TOKEN__VBC__LITERAL 4
#define WUFFS_BASE__TOKEN__VBC__NUMBER 5
Nigel Taoa9d14882020-02-25 12:12:31 +110066
// --------

// VBD (value base detail) bit flags for the FILLER category.
#define WUFFS_BASE__TOKEN__VBD__FILLER__COMMENT_LINE 0x00001
#define WUFFS_BASE__TOKEN__VBD__FILLER__COMMENT_BLOCK 0x00002
Nigel Taoa9d14882020-02-25 12:12:31 +110071
// --------

// VBD (value base detail) bit flags for the STRUCTURE category. Each constant
// is a distinct single bit, in three groups: PUSH / POP, FROM_etc and TO_etc.
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__PUSH 0x00001
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__POP 0x00002
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_NONE 0x00010
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_LIST 0x00020
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_DICT 0x00040
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_NONE 0x01000
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST 0x02000
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_DICT 0x04000
82
// --------

// "DEFINITELY_FOO" means that the destination bytes (and also the source
// bytes, for 1_DST_1_SRC_COPY) are in the FOO format. Definitely means that
// the lack of the bit is conservative: it is valid for all-ASCII strings to
// have neither the DEFINITELY_UTF_8 nor the DEFINITELY_ASCII bit set.
#define WUFFS_BASE__TOKEN__VBD__STRING__DEFINITELY_UTF_8 0x00001
#define WUFFS_BASE__TOKEN__VBD__STRING__DEFINITELY_ASCII 0x00002

// "CONVERT_D_DST_S_SRC" means that multiples of S source bytes (possibly
// padded) produce multiples of D destination bytes. For example,
// CONVERT_1_DST_4_SRC_BACKSLASH_X means a source like "\\x23\\x67\\xAB", where
// 12 src bytes encode 3 dst bytes.
//
// Post-processing may further transform those D destination bytes (e.g. treat
// "\\xFF" as the Unicode code point U+00FF instead of the byte 0xFF), but that
// is out of scope of this VBD's semantics.
//
// When src is the empty string, multiple conversion algorithms are applicable
// (so these bits are not necessarily mutually exclusive), all producing the
// same empty dst string.
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_0_DST_1_SRC_DROP 0x00010
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY 0x00020
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_2_SRC_HEXADECIMAL 0x00040
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_4_SRC_BACKSLASH_X 0x00080
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_3_DST_4_SRC_BASE_64_STD 0x00100
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_3_DST_4_SRC_BASE_64_URL 0x00200
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_4_DST_5_SRC_ASCII_85 0x00400
Nigel Tao9d4eeb72020-02-26 11:36:30 +1100111
// --------

// VBD (value base detail) bit flags for the LITERAL category. Each constant
// is a distinct single bit.
#define WUFFS_BASE__TOKEN__VBD__LITERAL__UNDEFINED 0x00001
#define WUFFS_BASE__TOKEN__VBD__LITERAL__NULL 0x00002
#define WUFFS_BASE__TOKEN__VBD__LITERAL__FALSE 0x00004
#define WUFFS_BASE__TOKEN__VBD__LITERAL__TRUE 0x00008
118
// --------

// For a source string of "123" or "0x9A", it is valid for a tokenizer to
// return any one of:
//  - WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_FLOATING_POINT.
//  - WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_INTEGER_SIGNED.
//  - WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_INTEGER_UNSIGNED.
//
// For a source string of "+123" or "-0x9A", only the first two are valid.
//
// For a source string of "123.", only the first one is valid.
#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_FLOATING_POINT 0x00001
#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_INTEGER_SIGNED 0x00002
#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_INTEGER_UNSIGNED 0x00004

// Non-finite number content: signed infinities and signed NaNs.
#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_NEG_INF 0x00010
#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_POS_INF 0x00020
#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_NEG_NAN 0x00040
#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_POS_NAN 0x00080

// The number 300 might be represented as "\x01\x2C", "\x2C\x01\x00\x00" or
// "300", which are big-endian, little-endian or text. For binary formats, the
// token length discriminates e.g. u16 little-endian vs u32 little-endian.
#define WUFFS_BASE__TOKEN__VBD__NUMBER__FORMAT_BINARY_BIG_ENDIAN 0x00100
#define WUFFS_BASE__TOKEN__VBD__NUMBER__FORMAT_BINARY_LITTLE_ENDIAN 0x00200
#define WUFFS_BASE__TOKEN__VBD__NUMBER__FORMAT_TEXT 0x00400
Nigel Tao9d4eeb72020-02-26 11:36:30 +1100145
146// --------
Nigel Taoa9d14882020-02-25 12:12:31 +1100147
Nigel Tao462f8662020-04-01 23:01:51 +1100148// wuffs_base__token__value returns the token's high 46 bits, sign-extended. A
149// negative value means an extended token, non-negative means a simple token.
150static inline int64_t //
Nigel Tao36857982020-02-12 11:33:13 +1100151wuffs_base__token__value(const wuffs_base__token* t) {
Nigel Tao462f8662020-04-01 23:01:51 +1100152 return ((int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE__SHIFT;
Nigel Tao36857982020-02-12 11:33:13 +1100153}
154
Nigel Tao462f8662020-04-01 23:01:51 +1100155// wuffs_base__token__value_extension returns a negative value if the token was
156// not an extended token.
157static inline int64_t //
158wuffs_base__token__value_extension(const wuffs_base__token* t) {
159 return (~(int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE_EXTENSION__SHIFT;
160}
161
162// wuffs_base__token__value_major returns a negative value if the token was not
163// a simple token.
164static inline int64_t //
Nigel Tao36857982020-02-12 11:33:13 +1100165wuffs_base__token__value_major(const wuffs_base__token* t) {
Nigel Tao462f8662020-04-01 23:01:51 +1100166 return ((int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT;
167}
168
169// wuffs_base__token__value_base_category returns a negative value if the token
170// was not a simple token.
171static inline int64_t //
172wuffs_base__token__value_base_category(const wuffs_base__token* t) {
173 return ((int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT;
Nigel Tao36857982020-02-12 11:33:13 +1100174}
175
176static inline uint64_t //
177wuffs_base__token__value_minor(const wuffs_base__token* t) {
Nigel Tao496e88b2020-04-09 22:10:08 +1000178 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) & 0x1FFFFFF;
Nigel Tao36857982020-02-12 11:33:13 +1100179}
180
181static inline uint64_t //
182wuffs_base__token__value_base_detail(const wuffs_base__token* t) {
Nigel Tao462f8662020-04-01 23:01:51 +1100183 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT) & 0x1FFFFF;
Nigel Tao36857982020-02-12 11:33:13 +1100184}
185
Nigel Taod1c928a2020-02-28 12:43:53 +1100186static inline bool //
Nigel Tao496e88b2020-04-09 22:10:08 +1000187wuffs_base__token__continued(const wuffs_base__token* t) {
188 return t->repr & 0x10000;
Nigel Taod1c928a2020-02-28 12:43:53 +1100189}
190
Nigel Tao36857982020-02-12 11:33:13 +1100191static inline uint64_t //
192wuffs_base__token__length(const wuffs_base__token* t) {
Nigel Tao462f8662020-04-01 23:01:51 +1100193 return (t->repr >> WUFFS_BASE__TOKEN__LENGTH__SHIFT) & 0xFFFF;
Nigel Tao36857982020-02-12 11:33:13 +1100194}
195
196#ifdef __cplusplus
197
Nigel Tao462f8662020-04-01 23:01:51 +1100198inline int64_t //
Nigel Tao36857982020-02-12 11:33:13 +1100199wuffs_base__token::value() const {
200 return wuffs_base__token__value(this);
201}
202
Nigel Tao462f8662020-04-01 23:01:51 +1100203inline int64_t //
204wuffs_base__token::value_extension() const {
205 return wuffs_base__token__value_extension(this);
206}
207
208inline int64_t //
Nigel Tao36857982020-02-12 11:33:13 +1100209wuffs_base__token::value_major() const {
210 return wuffs_base__token__value_major(this);
211}
212
Nigel Tao462f8662020-04-01 23:01:51 +1100213inline int64_t //
214wuffs_base__token::value_base_category() const {
215 return wuffs_base__token__value_base_category(this);
216}
217
Nigel Tao36857982020-02-12 11:33:13 +1100218inline uint64_t //
219wuffs_base__token::value_minor() const {
220 return wuffs_base__token__value_minor(this);
221}
222
223inline uint64_t //
Nigel Tao36857982020-02-12 11:33:13 +1100224wuffs_base__token::value_base_detail() const {
225 return wuffs_base__token__value_base_detail(this);
226}
227
Nigel Taod1c928a2020-02-28 12:43:53 +1100228inline bool //
Nigel Tao496e88b2020-04-09 22:10:08 +1000229wuffs_base__token::continued() const {
230 return wuffs_base__token__continued(this);
Nigel Taod1c928a2020-02-28 12:43:53 +1100231}
232
Nigel Tao36857982020-02-12 11:33:13 +1100233inline uint64_t //
234wuffs_base__token::length() const {
235 return wuffs_base__token__length(this);
236}
237
238#endif // __cplusplus
239
240// --------
241
Nigel Tao737e31f2020-02-11 11:23:17 +1100242typedef WUFFS_BASE__SLICE(wuffs_base__token) wuffs_base__slice_token;
243
244static inline wuffs_base__slice_token //
245wuffs_base__make_slice_token(wuffs_base__token* ptr, size_t len) {
246 wuffs_base__slice_token ret;
247 ret.ptr = ptr;
248 ret.len = len;
249 return ret;
250}
251
Nigel Tao36857982020-02-12 11:33:13 +1100252// --------
253
// wuffs_base__token_buffer_meta is the metadata for a
// wuffs_base__token_buffer's data.
//
// The indexes count tokens, not bytes. The invariants together mean that
// 0 <= ri <= wi <= len, where len is the length of the buffer's data.
typedef struct {
  size_t wi;     // Write index. Invariant: wi <= len.
  size_t ri;     // Read index. Invariant: ri <= wi.
  uint64_t pos;  // Position of the buffer start relative to the stream start.
  bool closed;   // No further writes are expected.
} wuffs_base__token_buffer_meta;
262
263// wuffs_base__token_buffer is a 1-dimensional buffer (a pointer and length)
264// plus additional metadata.
265//
266// A value with all fields zero is a valid, empty buffer.
267typedef struct {
268 wuffs_base__slice_token data;
269 wuffs_base__token_buffer_meta meta;
270
271#ifdef __cplusplus
Nigel Tao9fd96e82020-03-16 21:46:21 +1100272 inline bool is_valid() const;
Nigel Tao737e31f2020-02-11 11:23:17 +1100273 inline void compact();
274 inline uint64_t reader_available() const;
275 inline uint64_t reader_token_position() const;
276 inline uint64_t writer_available() const;
277 inline uint64_t writer_token_position() const;
278#endif // __cplusplus
279
280} wuffs_base__token_buffer;
281
282static inline wuffs_base__token_buffer //
283wuffs_base__make_token_buffer(wuffs_base__slice_token data,
284 wuffs_base__token_buffer_meta meta) {
285 wuffs_base__token_buffer ret;
286 ret.data = data;
287 ret.meta = meta;
288 return ret;
289}
290
291static inline wuffs_base__token_buffer_meta //
292wuffs_base__make_token_buffer_meta(size_t wi,
293 size_t ri,
294 uint64_t pos,
295 bool closed) {
296 wuffs_base__token_buffer_meta ret;
297 ret.wi = wi;
298 ret.ri = ri;
299 ret.pos = pos;
300 ret.closed = closed;
301 return ret;
302}
303
304static inline wuffs_base__token_buffer //
Nigel Tao64dbd002020-04-02 22:11:42 +1100305wuffs_base__slice_token__reader(wuffs_base__slice_token s, bool closed) {
Nigel Tao9fd96e82020-03-16 21:46:21 +1100306 wuffs_base__token_buffer ret;
307 ret.data.ptr = s.ptr;
308 ret.data.len = s.len;
309 ret.meta.wi = s.len;
310 ret.meta.ri = 0;
311 ret.meta.pos = 0;
312 ret.meta.closed = closed;
313 return ret;
314}
315
316static inline wuffs_base__token_buffer //
Nigel Tao64dbd002020-04-02 22:11:42 +1100317wuffs_base__slice_token__writer(wuffs_base__slice_token s) {
Nigel Tao9fd96e82020-03-16 21:46:21 +1100318 wuffs_base__token_buffer ret;
319 ret.data.ptr = s.ptr;
320 ret.data.len = s.len;
321 ret.meta.wi = 0;
322 ret.meta.ri = 0;
323 ret.meta.pos = 0;
324 ret.meta.closed = false;
325 return ret;
326}
327
328static inline wuffs_base__token_buffer //
Nigel Tao737e31f2020-02-11 11:23:17 +1100329wuffs_base__empty_token_buffer() {
330 wuffs_base__token_buffer ret;
331 ret.data.ptr = NULL;
332 ret.data.len = 0;
333 ret.meta.wi = 0;
334 ret.meta.ri = 0;
335 ret.meta.pos = 0;
336 ret.meta.closed = false;
337 return ret;
338}
339
340static inline wuffs_base__token_buffer_meta //
341wuffs_base__empty_token_buffer_meta() {
342 wuffs_base__token_buffer_meta ret;
343 ret.wi = 0;
344 ret.ri = 0;
345 ret.pos = 0;
346 ret.closed = false;
347 return ret;
348}
349
Nigel Tao9fd96e82020-03-16 21:46:21 +1100350static inline bool //
351wuffs_base__token_buffer__is_valid(const wuffs_base__token_buffer* buf) {
352 if (buf) {
353 if (buf->data.ptr) {
354 return (buf->meta.ri <= buf->meta.wi) && (buf->meta.wi <= buf->data.len);
355 } else {
356 return (buf->meta.ri == 0) && (buf->meta.wi == 0) && (buf->data.len == 0);
357 }
358 }
359 return false;
360}
361
Nigel Tao737e31f2020-02-11 11:23:17 +1100362// wuffs_base__token_buffer__compact moves any written but unread tokens to the
363// start of the buffer.
364static inline void //
365wuffs_base__token_buffer__compact(wuffs_base__token_buffer* buf) {
366 if (!buf || (buf->meta.ri == 0)) {
367 return;
368 }
369 buf->meta.pos = wuffs_base__u64__sat_add(buf->meta.pos, buf->meta.ri);
370 size_t n = buf->meta.wi - buf->meta.ri;
371 if (n != 0) {
372 memmove(buf->data.ptr, buf->data.ptr + buf->meta.ri,
373 n * sizeof(wuffs_base__token));
374 }
375 buf->meta.wi = n;
376 buf->meta.ri = 0;
377}
378
379static inline uint64_t //
380wuffs_base__token_buffer__reader_available(
381 const wuffs_base__token_buffer* buf) {
382 return buf ? buf->meta.wi - buf->meta.ri : 0;
383}
384
385static inline uint64_t //
386wuffs_base__token_buffer__reader_token_position(
387 const wuffs_base__token_buffer* buf) {
388 return buf ? wuffs_base__u64__sat_add(buf->meta.pos, buf->meta.ri) : 0;
389}
390
391static inline uint64_t //
392wuffs_base__token_buffer__writer_available(
393 const wuffs_base__token_buffer* buf) {
394 return buf ? buf->data.len - buf->meta.wi : 0;
395}
396
397static inline uint64_t //
398wuffs_base__token_buffer__writer_token_position(
399 const wuffs_base__token_buffer* buf) {
400 return buf ? wuffs_base__u64__sat_add(buf->meta.pos, buf->meta.wi) : 0;
401}
402
403#ifdef __cplusplus
404
Nigel Tao9fd96e82020-03-16 21:46:21 +1100405inline bool //
406wuffs_base__token_buffer::is_valid() const {
407 return wuffs_base__token_buffer__is_valid(this);
408}
409
Nigel Tao737e31f2020-02-11 11:23:17 +1100410inline void //
411wuffs_base__token_buffer::compact() {
412 wuffs_base__token_buffer__compact(this);
413}
414
415inline uint64_t //
416wuffs_base__token_buffer::reader_available() const {
417 return wuffs_base__token_buffer__reader_available(this);
418}
419
420inline uint64_t //
421wuffs_base__token_buffer::reader_token_position() const {
422 return wuffs_base__token_buffer__reader_token_position(this);
423}
424
425inline uint64_t //
426wuffs_base__token_buffer::writer_available() const {
427 return wuffs_base__token_buffer__writer_available(this);
428}
429
430inline uint64_t //
431wuffs_base__token_buffer::writer_token_position() const {
432 return wuffs_base__token_buffer__writer_token_position(this);
433}
434
435#endif // __cplusplus