// After editing this file, run "go generate" in the parent directory.

// Copyright 2020 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// ---------------- Tokens

// wuffs_base__token is a single token, packed into one 64-bit value. Use the
// wuffs_base__token__etc functions (or, in C++, the equivalent methods) to
// extract its fields.
typedef struct {
  // The repr is divided as:
  //  - Bits 63 .. 40 (24 bits) is the major value.
  //  - Bits 39 .. 16 (24 bits) is the minor value.
  //  - Bits 15 ..  0 (16 bits) is the length.
  //
  // The major value is a [Base38](doc/note/base38-and-fourcc.md) value. If
  // zero (special cased for Wuffs' built-in "base" package) then the minor
  // value is further sub-divided:
  //  - Bits 39 .. 37 ( 3 bits) is the value_base_category.
  //  - Bits 36 .. 16 (21 bits) is the value_base_detail.
  //
  // In particular, at 21 bits, the value_base_detail can hold every valid
  // Unicode code point.
  //
  // If the major value is non-zero then the minor value has whatever arbitrary
  // meaning the tokenizer's package assigns to it.
  uint64_t repr;

#ifdef __cplusplus
  // C++ convenience methods. Each one forwards to the C function named
  // wuffs_base__token__etc.
  inline uint64_t value() const;
  inline uint64_t value_major() const;
  inline uint64_t value_minor() const;
  inline uint64_t value_base_category() const;
  inline uint64_t value_base_detail() const;
  inline uint64_t length() const;
#endif  // __cplusplus

} wuffs_base__token;

49static inline wuffs_base__token //
50wuffs_base__make_token(uint64_t repr) {
51 wuffs_base__token ret;
52 ret.repr = repr;
53 return ret;
54}
55
// --------

// Masks and shifts for extracting each field of a wuffs_base__token's repr.
// Field FOO is ((repr >> FOO__SHIFT) & FOO__MASK), as computed by the
// wuffs_base__token__foo accessor functions.
//
// Note that VALUE_BASE_CATEGORY's mask is 27 bits wide, not 3. Combined with
// its shift of 37, it selects bits 63 .. 37: the 24-bit major value followed
// by the 3-bit category. NOTE(review): presumably this is so that a token with
// a non-zero major value never compares equal to one of the small
// WUFFS_BASE__TOKEN__VBC__ETC constants - confirm before narrowing the mask.

#define WUFFS_BASE__TOKEN__VALUE__MASK 0xFFFFFFFFFFFF
#define WUFFS_BASE__TOKEN__VALUE_MAJOR__MASK 0xFFFFFF
#define WUFFS_BASE__TOKEN__VALUE_MINOR__MASK 0xFFFFFF
#define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__MASK 0x7FFFFFF
#define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__MASK 0x1FFFFF
#define WUFFS_BASE__TOKEN__LENGTH__MASK 0xFFFF

#define WUFFS_BASE__TOKEN__VALUE__SHIFT 16
#define WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT 40
#define WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT 16
#define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT 37
#define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT 16
#define WUFFS_BASE__TOKEN__LENGTH__SHIFT 0

// --------

// VBC (value_base_category) constants. Each category has its own set of
// WUFFS_BASE__TOKEN__VBD__ETC (value_base_detail) constants.
#define WUFFS_BASE__TOKEN__VBC__FILLER 0
#define WUFFS_BASE__TOKEN__VBC__STRING 1
#define WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT 2
#define WUFFS_BASE__TOKEN__VBC__NUMBER 3
#define WUFFS_BASE__TOKEN__VBC__STRUCTURE 4

// --------

// VBD (value_base_detail) bit flags for VBC__FILLER tokens.
//
// INCOMPLETE means that this token combines with the following token.
//
// For example, tokenizing a comment that is longer than the source buffer can
// result in multiple VBC__FILLER tokens. All but the last one is INCOMPLETE.
//
// By convention, whitespace is not marked incomplete. Two whitespace tokens of
// length 30 are equivalent to one whitespace token of length 60.
//
// Bits 0x2, 0x4 and 0x8 are reserved for flags that are common between
// VBD_FILLER and VBD_STRING.
#define WUFFS_BASE__TOKEN__VBD__FILLER__INCOMPLETE 0x00001

#define WUFFS_BASE__TOKEN__VBD__FILLER__END_OF_CONSECUTIVE_COMMENTS 0x00010
#define WUFFS_BASE__TOKEN__VBD__FILLER__COMMENT_LINE 0x00020
#define WUFFS_BASE__TOKEN__VBD__FILLER__COMMENT_BLOCK 0x00040

// --------

// VBD (value_base_detail) bit flags for VBC__STRING tokens.
//
// INCOMPLETE means that this token combines with the following token.
//
// For example, tokenizing a string that is longer than the source buffer can
// result in multiple VBC__STRING tokens. All but the last one is INCOMPLETE.
//
// Bits 0x2, 0x4 and 0x8 are reserved for flags that are common between
// VBD_FILLER and VBD_STRING.
#define WUFFS_BASE__TOKEN__VBD__STRING__INCOMPLETE 0x00001

// "DEFINITELY_FOO" means that the source bytes (and also the destination
// bytes, assuming 1_DST_1_SRC_COPY) are in the FOO format. Definitely means
// that the lack of the bit is conservative: it is valid for all-ASCII strings
// to have neither DEFINITELY_ASCII or DEFINITELY_UTF_8 bits set.
#define WUFFS_BASE__TOKEN__VBD__STRING__DEFINITELY_ASCII 0x00010
#define WUFFS_BASE__TOKEN__VBD__STRING__DEFINITELY_UTF_8 0x00020

// "CONVERT_D_DST_S_SRC" means that multiples of S source bytes (possibly
// padded) produces multiples of D destination bytes. For example,
// CONVERT_1_DST_4_SRC_BACKSLASH_X means a source like "\\x23\\x67\\xAB", where
// 12 src bytes encode 3 dst bytes.
//
// When src is the empty string, multiple conversion algorithms are applicable
// (so these bits are not necessarily mutually exclusive), all producing the
// same empty dst string.
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_0_DST_1_SRC_DROP 0x00100
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_1_SRC_COPY 0x00200
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_2_SRC_HEXADECIMAL 0x00400
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_1_DST_4_SRC_BACKSLASH_X 0x00800
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_3_DST_4_SRC_BASE_64_STD 0x01000
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_3_DST_4_SRC_BASE_64_URL 0x02000
#define WUFFS_BASE__TOKEN__VBD__STRING__CONVERT_4_DST_5_SRC_ASCII_85 0x04000

// --------

// UNICODE_CODE_POINT tokens have no room in their VBD for an INCOMPLETE bit.
// All 21 bits are used to hold the Unicode code point. Such tokens preserve
// the INCOMPLETEness of the previous token (if a FILLER or STRING).

// MAX_INCL is the largest (inclusive) code point value, U+10FFFF.
#define WUFFS_BASE__TOKEN__VBD__UNICODE_CODE_POINT__MAX_INCL 0x10FFFF

// --------

// VBD (value_base_detail) values for VBC__NUMBER tokens.
//
// Each LITERAL__ETC value has the LITERAL bit (0x00001) set, plus one higher
// bit that identifies the specific literal.
#define WUFFS_BASE__TOKEN__VBD__NUMBER__LITERAL 0x00001
#define WUFFS_BASE__TOKEN__VBD__NUMBER__LITERAL__UNDEFINED 0x00101
#define WUFFS_BASE__TOKEN__VBD__NUMBER__LITERAL__NULL 0x00201
#define WUFFS_BASE__TOKEN__VBD__NUMBER__LITERAL__FALSE 0x00401
#define WUFFS_BASE__TOKEN__VBD__NUMBER__LITERAL__TRUE 0x00801

// For a source string of "123" or "0x9A", it is valid for a tokenizer to
// return any one of:
//  - WUFFS_BASE__TOKEN__VBD__NUMBER__FLOATING_POINT.
//  - WUFFS_BASE__TOKEN__VBD__NUMBER__INTEGER_SIGNED.
//  - WUFFS_BASE__TOKEN__VBD__NUMBER__INTEGER_UNSIGNED.
//
// For a source string of "+123" or "-0x9A", only the first two are valid.
//
// For a source string of "123.", only the first one is valid.
#define WUFFS_BASE__TOKEN__VBD__NUMBER__FLOATING_POINT 0x00002
#define WUFFS_BASE__TOKEN__VBD__NUMBER__INTEGER_SIGNED 0x00004
#define WUFFS_BASE__TOKEN__VBD__NUMBER__INTEGER_UNSIGNED 0x00008

// --------

// VBD (value_base_detail) bit flags for VBC__STRUCTURE tokens: a PUSH or POP
// bit, a FROM_ETC bit and a TO_ETC bit.
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__PUSH 0x00001
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__POP 0x00002
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_NONE 0x00010
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_LIST 0x00020
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__FROM_DICT 0x00040
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_NONE 0x01000
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_LIST 0x02000
#define WUFFS_BASE__TOKEN__VBD__STRUCTURE__TO_DICT 0x04000

// --------

Nigel Tao36857982020-02-12 11:33:13 +1100174static inline uint64_t //
175wuffs_base__token__value(const wuffs_base__token* t) {
176 return (t->repr >> WUFFS_BASE__TOKEN__VALUE__SHIFT) &
177 WUFFS_BASE__TOKEN__VALUE__MASK;
178}
179
180static inline uint64_t //
181wuffs_base__token__value_major(const wuffs_base__token* t) {
182 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT) &
183 WUFFS_BASE__TOKEN__VALUE_MAJOR__MASK;
184}
185
186static inline uint64_t //
187wuffs_base__token__value_minor(const wuffs_base__token* t) {
188 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) &
189 WUFFS_BASE__TOKEN__VALUE_MINOR__MASK;
190}
191
192static inline uint64_t //
193wuffs_base__token__value_base_category(const wuffs_base__token* t) {
194 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT) &
195 WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__MASK;
196}
197
198static inline uint64_t //
199wuffs_base__token__value_base_detail(const wuffs_base__token* t) {
200 return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT) &
201 WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__MASK;
202}
203
204static inline uint64_t //
205wuffs_base__token__length(const wuffs_base__token* t) {
206 return (t->repr >> WUFFS_BASE__TOKEN__LENGTH__SHIFT) &
207 WUFFS_BASE__TOKEN__LENGTH__MASK;
208}
209
#ifdef __cplusplus

// C++ method definitions for wuffs_base__token. Each method forwards to the C
// function of the corresponding name, passing this as the token pointer.

inline uint64_t  //
wuffs_base__token::value() const {
  return wuffs_base__token__value(this);
}

inline uint64_t  //
wuffs_base__token::value_major() const {
  return wuffs_base__token__value_major(this);
}

inline uint64_t  //
wuffs_base__token::value_minor() const {
  return wuffs_base__token__value_minor(this);
}

inline uint64_t  //
wuffs_base__token::value_base_category() const {
  return wuffs_base__token__value_base_category(this);
}

inline uint64_t  //
wuffs_base__token::value_base_detail() const {
  return wuffs_base__token__value_base_detail(this);
}

inline uint64_t  //
wuffs_base__token::length() const {
  return wuffs_base__token__length(this);
}

#endif  // __cplusplus

// --------

Nigel Tao737e31f2020-02-11 11:23:17 +1100246typedef WUFFS_BASE__SLICE(wuffs_base__token) wuffs_base__slice_token;
247
248static inline wuffs_base__slice_token //
249wuffs_base__make_slice_token(wuffs_base__token* ptr, size_t len) {
250 wuffs_base__slice_token ret;
251 ret.ptr = ptr;
252 ret.len = len;
253 return ret;
254}
255
// --------

// wuffs_base__token_buffer_meta is the metadata for a
// wuffs_base__token_buffer's data.
//
// The stated invariants combine to: ri <= wi <= len, where len is the length
// of the associated buffer's data.
typedef struct {
  size_t wi;     // Write index. Invariant: wi <= len.
  size_t ri;     // Read index. Invariant: ri <= wi.
  uint64_t pos;  // Position of the buffer start relative to the stream start.
  bool closed;   // No further writes are expected.
} wuffs_base__token_buffer_meta;

267// wuffs_base__token_buffer is a 1-dimensional buffer (a pointer and length)
268// plus additional metadata.
269//
270// A value with all fields zero is a valid, empty buffer.
271typedef struct {
272 wuffs_base__slice_token data;
273 wuffs_base__token_buffer_meta meta;
274
275#ifdef __cplusplus
276 inline void compact();
277 inline uint64_t reader_available() const;
278 inline uint64_t reader_token_position() const;
279 inline uint64_t writer_available() const;
280 inline uint64_t writer_token_position() const;
281#endif // __cplusplus
282
283} wuffs_base__token_buffer;
284
285static inline wuffs_base__token_buffer //
286wuffs_base__make_token_buffer(wuffs_base__slice_token data,
287 wuffs_base__token_buffer_meta meta) {
288 wuffs_base__token_buffer ret;
289 ret.data = data;
290 ret.meta = meta;
291 return ret;
292}
293
294static inline wuffs_base__token_buffer_meta //
295wuffs_base__make_token_buffer_meta(size_t wi,
296 size_t ri,
297 uint64_t pos,
298 bool closed) {
299 wuffs_base__token_buffer_meta ret;
300 ret.wi = wi;
301 ret.ri = ri;
302 ret.pos = pos;
303 ret.closed = closed;
304 return ret;
305}
306
307static inline wuffs_base__token_buffer //
308wuffs_base__empty_token_buffer() {
309 wuffs_base__token_buffer ret;
310 ret.data.ptr = NULL;
311 ret.data.len = 0;
312 ret.meta.wi = 0;
313 ret.meta.ri = 0;
314 ret.meta.pos = 0;
315 ret.meta.closed = false;
316 return ret;
317}
318
319static inline wuffs_base__token_buffer_meta //
320wuffs_base__empty_token_buffer_meta() {
321 wuffs_base__token_buffer_meta ret;
322 ret.wi = 0;
323 ret.ri = 0;
324 ret.pos = 0;
325 ret.closed = false;
326 return ret;
327}
328
329// wuffs_base__token_buffer__compact moves any written but unread tokens to the
330// start of the buffer.
331static inline void //
332wuffs_base__token_buffer__compact(wuffs_base__token_buffer* buf) {
333 if (!buf || (buf->meta.ri == 0)) {
334 return;
335 }
336 buf->meta.pos = wuffs_base__u64__sat_add(buf->meta.pos, buf->meta.ri);
337 size_t n = buf->meta.wi - buf->meta.ri;
338 if (n != 0) {
339 memmove(buf->data.ptr, buf->data.ptr + buf->meta.ri,
340 n * sizeof(wuffs_base__token));
341 }
342 buf->meta.wi = n;
343 buf->meta.ri = 0;
344}
345
346static inline uint64_t //
347wuffs_base__token_buffer__reader_available(
348 const wuffs_base__token_buffer* buf) {
349 return buf ? buf->meta.wi - buf->meta.ri : 0;
350}
351
352static inline uint64_t //
353wuffs_base__token_buffer__reader_token_position(
354 const wuffs_base__token_buffer* buf) {
355 return buf ? wuffs_base__u64__sat_add(buf->meta.pos, buf->meta.ri) : 0;
356}
357
358static inline uint64_t //
359wuffs_base__token_buffer__writer_available(
360 const wuffs_base__token_buffer* buf) {
361 return buf ? buf->data.len - buf->meta.wi : 0;
362}
363
364static inline uint64_t //
365wuffs_base__token_buffer__writer_token_position(
366 const wuffs_base__token_buffer* buf) {
367 return buf ? wuffs_base__u64__sat_add(buf->meta.pos, buf->meta.wi) : 0;
368}
369
#ifdef __cplusplus

// C++ method definitions for wuffs_base__token_buffer. Each method forwards to
// the C function of the corresponding name, passing this as the buffer
// pointer.

inline void  //
wuffs_base__token_buffer::compact() {
  wuffs_base__token_buffer__compact(this);
}

inline uint64_t  //
wuffs_base__token_buffer::reader_available() const {
  return wuffs_base__token_buffer__reader_available(this);
}

inline uint64_t  //
wuffs_base__token_buffer::reader_token_position() const {
  return wuffs_base__token_buffer__reader_token_position(this);
}

inline uint64_t  //
wuffs_base__token_buffer::writer_available() const {
  return wuffs_base__token_buffer__writer_available(this);
}

inline uint64_t  //
wuffs_base__token_buffer::writer_token_position() const {
  return wuffs_base__token_buffer__writer_token_position(this);
}

#endif  // __cplusplus