| /* ----------------------------------------------------------------------- * |
| * |
| * Copyright 1996-2019 The NASM Authors - All Rights Reserved |
| * See the file AUTHORS included with the NASM distribution for |
| * the specific copyright holders. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following |
| * conditions are met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following |
| * disclaimer in the documentation and/or other materials provided |
| * with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
| * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, |
| * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
| * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
| * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR |
| * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, |
| * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| * |
| * ----------------------------------------------------------------------- */ |
| |
| /* |
| * quote.c |
| */ |
| |
| #include "compiler.h" |
| #include "nasmlib.h" |
| #include "quote.h" |
| #include "nctype.h" |
| #include "error.h" |
| |
/*
 * Letter which follows the backslash in the short escape sequence for
 * the control character ch inside a `...` string, or 0 if ch has no
 * short form.  The character codes are ASCII.
 */
static char quote_escape_letter(char ch)
{
    switch (ch) {
    case 7:
        return 'a';
    case 8:
        return 'b';
    case 9:
        return 't';
    case 10:
        return 'n';
    case 11:
        return 'v';
    case 12:
        return 'f';
    case 13:
        return 'r';
    case 27:
        return 'e';
    default:
        return 0;
    }
}

/*
 * Number of octal digits (1-3) needed to write ch as a \ooo escape.
 * If the character following it in the string (next) is itself an
 * octal digit, all three digits are required so that the following
 * character cannot be absorbed into the escape.
 */
static unsigned int quote_octal_digits(char ch, char next)
{
    unsigned char span = (unsigned char)ch;

    if (next >= '0' && next <= '7')
        span = 0377;

    return 1 + (span > 07) + (span > 077);
}

/*
 * Produce a NASM quoted version of the *lenp bytes at str, in memory
 * obtained from nasm_malloc().  On return *lenp holds the length of
 * the quoted string, not counting the terminating NUL.
 *
 * '...' is used if the data contains neither a single quote nor any
 * byte outside the range ' ' to '~'; failing that "..." is used under
 * the equivalent condition for double quotes; otherwise the string is
 * written in `...` form with backslash escapes.
 */
char *nasm_quote(const char *str, size_t *lenp)
{
    const size_t len = *lenp;
    const char * const end = str + len;
    const char *s;
    char *nstr, *q;
    size_t qlen = 0;            /* Body length if `...` is required */
    bool sq_ok = true;
    bool dq_ok = true;

    /* Pass 1: decide on the quote style and size the `...` form */
    for (s = str; s < end; s++) {
        const char ch = *s;

        if (ch == '\'') {
            sq_ok = false;
            qlen++;
        } else if (ch == '\"') {
            dq_ok = false;
            qlen++;
        } else if (ch == '`' || ch == '\\') {
            qlen += 2;
        } else if (ch >= ' ' && ch <= '~') {
            qlen++;
        } else {
            /* Only `...` strings can represent this byte */
            sq_ok = dq_ok = false;
            if (quote_escape_letter(ch))
                qlen += 2;
            else
                qlen += 1 + quote_octal_digits(ch, s + 1 < end ? s[1] : 0);
        }
    }

    if (sq_ok || dq_ok) {
        /* The data can be copied verbatim between '...' or "..." */
        const char delim = sq_ok ? '\'' : '\"';

        nstr = nasm_malloc(len + 3);
        nstr[0] = delim;
        if (len > 0)
            memcpy(nstr + 1, str, len);
        nstr[len + 1] = delim;
        q = nstr + len + 2;
    } else {
        /* Pass 2: write the escaped `...` form */
        nstr = nasm_malloc(qlen + 3);
        q = nstr;
        *q++ = '`';

        for (s = str; s < end; s++) {
            const char ch = *s;
            const char letter = quote_escape_letter(ch);

            if (ch == '`' || ch == '\\') {
                *q++ = '\\';
                *q++ = ch;
            } else if (letter) {
                *q++ = '\\';
                *q++ = letter;
            } else if (ch < ' ' || ch > '~') {
                unsigned int nd =
                    quote_octal_digits(ch, s + 1 < end ? s[1] : 0);

                *q++ = '\\';
                while (nd--)
                    *q++ = (((unsigned char)ch >> (3 * nd)) & 7) + '0';
            } else {
                *q++ = ch;
            }
        }

        *q++ = '`';
        /* The two passes must agree on the size */
        nasm_assert((size_t)(q-nstr) == qlen+2);
    }

    *q = '\0';
    *lenp = q - nstr;
    return nstr;
}
| |
/*
 * Store the value v at q using the "classic" (up to 6 byte, 31-bit)
 * UTF-8 encoding and return a pointer just past the last byte written.
 * Between one and six bytes are written.
 *
 * Values >= 0x80000000 do not fit even the 31-bit scheme; for those
 * the two excess bits simply spill into the leading byte (giving 0xfe
 * or 0xff), which is not valid UTF-8 but at least does something
 * vaguely sensible.  Caveat programmer.  (The original note here says
 * the __utf*__ string transform functions reject such input.)
 */
static unsigned char *emit_utf8(unsigned char *q, uint32_t v)
{
    unsigned int ntrail;        /* Number of continuation bytes */
    unsigned int lead;          /* Marker bits of the leading byte */

    if (v <= 0x7f) {
        /* Plain ASCII is stored as is */
        *q++ = (unsigned char)v;
        return q;
    }

    if (v <= 0x7ff) {
        ntrail = 1;
        lead = 0xc0;
    } else if (v <= 0xffff) {
        ntrail = 2;
        lead = 0xe0;
    } else if (v <= 0x1fffff) {
        ntrail = 3;
        lead = 0xf0;
    } else if (v <= 0x3ffffff) {
        ntrail = 4;
        lead = 0xf8;
    } else {
        /* At most 2 bits remain after shifting a 32-bit value by 30 */
        ntrail = 5;
        lead = 0xfc;
    }

    /* Leading byte: marker plus the most significant payload bits */
    *q++ = (unsigned char)(lead + (v >> (6 * ntrail)));

    /* Continuation bytes, 6 payload bits each, most significant first */
    while (ntrail--)
        *q++ = (unsigned char)(0x80 + ((v >> (6 * ntrail)) & 63));

    return q;
}
| |
/*
 * Return a mask with only bit number v set when v is below 32 (the
 * range tracked in the control character masks), otherwise 0.
 */
static inline uint32_t ctlbit(uint32_t v)
{
    if (unlikely(v < 32))
        return UINT32_C(1) << v;

    return 0;
}
| |
/*
 * Helper macros for nasm_unquote_common().  They deliberately refer to
 * that function's locals by name: q (output cursor), ctlmask (mask of
 * control characters seen so far) and badctl (mask of control
 * characters which are not permitted).
 */

/*
 * Record c in ctlmask, then yield nonzero if any control character
 * recorded so far, c included, is one of those in badctl.
 */
#define CTL_ERR(c)                              \
    ((ctlmask |= ctlbit(c)) & badctl)

/* Append the value c in UTF-8 form at q unless CTL_ERR() objects */
#define EMIT_UTF8(c)                            \
    do {                                        \
        const uint32_t ucs_ = (c);              \
        if (!CTL_ERR(ucs_))                     \
            q = emit_utf8(q, ucs_);             \
    } while (0)

/* Append the single byte c at q unless CTL_ERR() objects */
#define EMIT(c)                                 \
    do {                                        \
        const unsigned char byte_ = (c);        \
        if (!CTL_ERR(byte_))                    \
            *q++ = byte_;                       \
    } while (0)
| |
/*
 * Convenience form of nasm_quote() for NUL-terminated input: the
 * input length is obtained with strlen().  lenp may be NULL; if it is
 * not, *lenp receives the length of the quoted string.
 */
char *nasm_quote_cstr(const char *str, size_t *lenp)
{
    size_t n = strlen(str);
    char *quoted = nasm_quote(str, &n);

    if (lenp != NULL)
        *lenp = n;

    return quoted;
}
| |
/*
 * Do an *in-place* dequoting of the specified string, returning the
 * resulting length (the result may contain embedded NULs.)
 *
 * In-place replacement is possible since the unquoted length is always
 * shorter than or equal to the quoted length.
 *
 * '...' and "..." strings are copied verbatim up to the closing quote.
 * `...` strings additionally understand backslash escapes: the single
 * letter forms \a \b \e \f \n \r \t \v, up to three octal digits,
 * \x or \X followed by up to two hex digits, and \u / \U followed by up
 * to four / eight hex digits, the value of which is emitted as UTF-8.
 * A hex-type escape not followed by any hex digit yields the escape
 * letter itself; any other escaped character stands for itself.
 *
 * If the input does not begin with a quote character it is returned
 * as is (subject to the control character check below).
 *
 * If ep is not NULL, *ep points to the final quote, or to the null if
 * improperly quoted (or not quoted at all.)
 *
 * Issue an error if the string contains control characters
 * corresponding to bits set in badctl; in that case, the output
 * string, but not *ep, is truncated before the first invalid
 * character.
 */

static size_t nasm_unquote_common(char *str, char **ep,
                                  const uint32_t badctl)
{
    unsigned char bq;
    const unsigned char *p;
    const unsigned char *escp = NULL;
    unsigned char *q;
    unsigned char c;
    uint32_t ctlmask = 0;       /* Mask of control characters seen */
    enum unq_state {
        st_start,
        st_backslash,
        st_hex,
        st_oct,
        st_ucs,
        st_done
    } state;
    int ndig = 0;
    uint32_t nval = 0;

    p = q = (unsigned char *)str;

    bq = *p++;
    if (!bq) {
        /* Empty input: the terminating null is the first character */
        if (ep)
            *ep = str;
        return 0;
    }

    switch (bq) {
    case '\'':
    case '\"':
        /* '...' or "..." string */
        while ((c = *p++) && (c != bq))
            EMIT(c);
        break;

    case '`':
        /* `...` string */
        state = st_start;

        while (state != st_done) {
            c = *p++;
            switch (state) {
            case st_start:
                switch (c) {
                case '\\':
                    state = st_backslash;
                    break;
                case '`':
                case '\0':
                    state = st_done;
                    break;
                default:
                    EMIT(c);
                    break;
                }
                break;

            case st_backslash:
                state = st_start;
                escp = p;       /* Beginning of argument sequence */
                nval = 0;
                switch (c) {
                case 'a':
                    nval = 7;
                    break;
                case 'b':
                    nval = 8;
                    break;
                case 'e':
                    nval = 27;
                    break;
                case 'f':
                    nval = 12;
                    break;
                case 'n':
                    nval = 10;
                    break;
                case 'r':
                    nval = 13;
                    break;
                case 't':
                    nval = 9;
                    break;
                case 'u':
                    state = st_ucs;
                    ndig = 4;
                    break;
                case 'U':
                    state = st_ucs;
                    ndig = 8;
                    break;
                case 'v':
                    nval = 11;
                    break;
                case 'x':
                case 'X':
                    state = st_hex;
                    ndig = 2;
                    break;
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                    state = st_oct;
                    ndig = 2;   /* Up to two more digits */
                    nval = c - '0';
                    break;
                case '\0':
                    /* Backslash at end of input: emit it literally */
                    nval = '\\';
                    p--;        /* Reprocess; terminates string */
                    break;
                default:
                    /* Any other escaped character stands for itself */
                    nval = c;
                    break;
                }
                /* Escapes without arguments are complete already */
                if (state == st_start)
                    EMIT(nval);
                break;

            case st_oct:
                if (c >= '0' && c <= '7') {
                    nval = (nval << 3) + (c - '0');
                    if (--ndig)
                        break;  /* Might have more digits */
                } else {
                    p--;        /* Process this character again */
                }
                EMIT(nval);
                state = st_start;
                break;

            case st_hex:
            case st_ucs:
                if (nasm_isxdigit(c)) {
                    nval = (nval << 4) + numvalue(c);
                    if (--ndig)
                        break;  /* Might have more digits */
                } else {
                    p--;        /* Process this character again */
                }

                /*
                 * p <= escp means not a single digit followed the
                 * escape letter; emit the letter itself in that case.
                 */
                if (unlikely(p <= escp))
                    EMIT(escp[-1]);
                else if (state == st_ucs)
                    EMIT_UTF8(nval);
                else
                    EMIT(nval);

                state = st_start;
                break;

            default:
                panic();
            }
        }
        break;

    default:
        /*
         * Not a quoted string, just return the input...
         *
         * The first character has already been consumed into bq, but
         * here it is data rather than a delimiter, so it has to be
         * emitted as well or the result would lose its first byte.
         */
        EMIT(bq);
        while ((c = *p++))
            EMIT(c);
        break;
    }

    /* Zero-terminate the output */
    *q = '\0';

    if (ctlmask & badctl)
        nasm_nonfatal("control character in string not allowed here");

    /* p has been advanced one past the terminating character */
    if (ep)
        *ep = (char *)p - 1;
    return (char *)q - str;
}
#undef EMIT
| |
/*
 * Dequote str in place via nasm_unquote_common() with an empty badctl
 * mask, i.e. without rejecting any control characters.
 */
size_t nasm_unquote(char *str, char **ep)
{
    const uint32_t no_badctl = 0;

    return nasm_unquote_common(str, ep, no_badctl);
}
/*
 * Dequote str in place via nasm_unquote_common(), flagging every
 * control character except the few listed below as not permitted.
 */
size_t nasm_unquote_cstr(char *str, char **ep)
{
    uint32_t permitted = 0;     /* Mask of acceptable control characters */

    permitted |= 1 << '\a';     /* BEL */
    permitted |= 1 << '\b';     /* BS */
    permitted |= 1 << '\t';     /* TAB */
    permitted |= 1 << 27;       /* ESC */

    return nasm_unquote_common(str, ep, ~permitted);
}
| |
/*
 * Locate the end of the quoted string which starts at str.  The result
 * points to the terminating character: the closing quote, or the null
 * character if the string is unterminated.  NULL is returned if str
 * does not begin with one of the quote characters ' " or `.
 */
char *nasm_skip_string(const char *str)
{
    const char open = str[0];
    const char *scan = str + 1;
    char ch;

    if (open == '\'' || open == '\"') {
        /* '...' or "..." string: no escapes, stop at the matching quote */
        do {
            ch = *scan++;
        } while (ch && ch != open);
    } else if (open == '`') {
        /*
         * `...` string.  For the purpose of finding the end it is
         * enough to know that a backslash consumes the character after
         * it, whatever that is; only a null can end the string there.
         */
        for (;;) {
            ch = *scan++;
            if (ch == '\\')
                ch = *scan++;
            else if (ch == '`')
                break;

            if (!ch)
                break;
        }
    } else {
        /* Not a string at all... */
        return NULL;
    }

    /* scan is one past the terminating character */
    return (char *)scan - 1;
}