Validation: add UTF-8 validation support
This implements:
- CborValidateUtf8
Signed-off-by: Thiago Macieira <thiago.macieira@intel.com>
diff --git a/src/cborpretty.c b/src/cborpretty.c
index 12c7d81..b4570b2 100644
--- a/src/cborpretty.c
+++ b/src/cborpretty.c
@@ -31,6 +31,7 @@
#include "cbor.h"
#include "cborinternal_p.h"
#include "compilersupport_p.h"
+#include "utf8_p.h"
#include <float.h>
#include <inttypes.h>
@@ -161,10 +162,13 @@
* On UTF-8 decoding error, it returns CborErrorInvalidUtf8TextString */
static CborError utf8EscapedDump(FILE *out, const void *ptr, size_t n)
{
- const char *buffer = (const char *)ptr;
- uint32_t uc;
- while (n--) {
- uc = (uint8_t)*buffer++;
+ const uint8_t *buffer = (const uint8_t *)ptr;
+ const uint8_t * const end = buffer + n;
+ while (buffer < end) {
+ uint32_t uc = get_utf8(&buffer, end);
+ if (uc == ~0U)
+ return CborErrorInvalidUtf8TextString;
+
if (uc < 0x80) {
/* single-byte UTF-8 */
if (uc < 0x7f && uc >= 0x20 && uc != '\\' && uc != '"') {
@@ -202,65 +206,8 @@
continue;
}
- /* multi-byte UTF-8, decode it */
- unsigned charsNeeded;
- uint32_t min_uc;
- if (unlikely(uc <= 0xC1))
- return CborErrorInvalidUtf8TextString;
- if (uc < 0xE0) {
- /* two-byte UTF-8 */
- charsNeeded = 2;
- min_uc = 0x80;
- uc &= 0x1f;
- } else if (uc < 0xF0) {
- /* three-byte UTF-8 */
- charsNeeded = 3;
- min_uc = 0x800;
- uc &= 0x0f;
- } else if (uc < 0xF5) {
- /* four-byte UTF-8 */
- charsNeeded = 4;
- min_uc = 0x10000;
- uc &= 0x07;
- } else {
- return CborErrorInvalidUtf8TextString;
- }
-
- if (n < charsNeeded - 1)
- return CborErrorInvalidUtf8TextString;
- n -= charsNeeded - 1;
-
- /* first continuation character */
- uint8_t b = (uint8_t)*buffer++;
- if ((b & 0xc0) != 0x80)
- return CborErrorInvalidUtf8TextString;
- uc <<= 6;
- uc |= b & 0x3f;
-
- if (charsNeeded > 2) {
- /* second continuation character */
- b = (uint8_t)*buffer++;
- if ((b & 0xc0) != 0x80)
- return CborErrorInvalidUtf8TextString;
- uc <<= 6;
- uc |= b & 0x3f;
-
- if (charsNeeded > 3) {
- /* third continuation character */
- b = (uint8_t)*buffer++;
- if ((b & 0xc0) != 0x80)
- return CborErrorInvalidUtf8TextString;
- uc <<= 6;
- uc |= b & 0x3f;
- }
- }
-
- /* overlong sequence? surrogate pair? out or range? */
- if (uc < min_uc || uc - 0xd800U < 2048U || uc > 0x10ffff)
- return CborErrorInvalidUtf8TextString;
-
/* now print the sequence */
- if (charsNeeded > 3) {
+ if (uc > 0xffffU) {
/* needs surrogate pairs */
if (fprintf(out, "\\u%04" PRIX32 "\\u%04" PRIX32,
(uc >> 10) + 0xd7c0, /* high surrogate */
diff --git a/src/cborvalidation.c b/src/cborvalidation.c
index 7bbf344..0f28ce9 100644
--- a/src/cborvalidation.c
+++ b/src/cborvalidation.c
@@ -31,6 +31,7 @@
#include "cbor.h"
#include "cborinternal_p.h"
#include "compilersupport_p.h"
+#include "utf8_p.h"
#include <string.h>
@@ -227,6 +228,18 @@
static CborError validate_value(CborValue *it, int flags, int recursionLeft);
+static inline CborError validate_utf8_string(const void *ptr, size_t n)
+{
+    /* Walk the n bytes at ptr one code point at a time; get_utf8() moves the cursor. */
+    const uint8_t *cursor = (const uint8_t *)ptr;
+    const uint8_t * const limit = cursor + n;
+    while (cursor < limit) {
+        if (get_utf8(&cursor, limit) == ~0U)    /* ~0U signals a decoding failure */
+            return CborErrorInvalidUtf8TextString;
+    }
+    return CborNoError;
+}
+
static inline CborError validate_simple_type(uint8_t simple_type, int flags)
{
/* At current time, all known simple types are those from RFC 7049,
@@ -421,6 +434,12 @@
return err;
if (!ptr)
break;
+
+ if (type == CborTextStringType && flags & CborValidateUtf8) {
+ err = validate_utf8_string(ptr, n);
+ if (err)
+ return err;
+ }
}
return CborNoError;
diff --git a/src/utf8_p.h b/src/utf8_p.h
new file mode 100644
index 0000000..04bf913
--- /dev/null
+++ b/src/utf8_p.h
@@ -0,0 +1,99 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 Intel Corporation
+**
+** Permission is hereby granted, free of charge, to any person obtaining a copy
+** of this software and associated documentation files (the "Software"), to deal
+** in the Software without restriction, including without limitation the rights
+** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+** copies of the Software, and to permit persons to whom the Software is
+** furnished to do so, subject to the following conditions:
+**
+** The above copyright notice and this permission notice shall be included in
+** all copies or substantial portions of the Software.
+**
+** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+** THE SOFTWARE.
+**
+****************************************************************************/
+
+#include "compilersupport_p.h"
+
+#include <stdint.h>
+
+/* Decodes one UTF-8 sequence from [*buffer, end) and returns its code point,
+ * or ~0U when the input is empty, truncated or not well-formed. *buffer is
+ * advanced past every byte that was read, even when decoding fails. */
+static inline uint32_t get_utf8(const uint8_t **buffer, const uint8_t *end)
+{
+    int charsNeeded;
+    uint32_t uc, min_uc;
+    uint8_t b;
+    ptrdiff_t n = end - *buffer;    /* bytes available, lead byte included */
+    if (n == 0)
+        return ~0U;
+
+    uc = *(*buffer)++;
+    if (uc < 0x80)
+        return uc;                  /* single-byte UTF-8 */
+
+    /* multi-byte UTF-8, decode it */
+    if (unlikely(uc <= 0xC1))
+        return ~0U;
+    if (uc < 0xE0) {
+        /* two-byte UTF-8 */
+        charsNeeded = 2;
+        min_uc = 0x80;
+        uc &= 0x1f;
+    } else if (uc < 0xF0) {
+        /* three-byte UTF-8 */
+        charsNeeded = 3;
+        min_uc = 0x800;
+        uc &= 0x0f;
+    } else if (uc < 0xF5) {
+        /* four-byte UTF-8 */
+        charsNeeded = 4;
+        min_uc = 0x10000;
+        uc &= 0x07;
+    } else {
+        return ~0U;
+    }
+
+    /* n was measured before the lead byte was consumed, so the whole sequence
+     * has to fit in it; testing against charsNeeded - 1 would read past end */
+    if (n < charsNeeded)
+        return ~0U;
+
+    /* first continuation character */
+    b = *(*buffer)++;
+    if ((b & 0xc0) != 0x80)
+        return ~0U;
+    uc = (uc << 6) | (b & 0x3f);
+
+    if (charsNeeded > 2) {
+        /* second continuation character */
+        b = *(*buffer)++;
+        if ((b & 0xc0) != 0x80)
+            return ~0U;
+        uc = (uc << 6) | (b & 0x3f);
+
+        if (charsNeeded > 3) {
+            /* third continuation character */
+            b = *(*buffer)++;
+            if ((b & 0xc0) != 0x80)
+                return ~0U;
+            uc = (uc << 6) | (b & 0x3f);
+        }
+    }
+
+    /* overlong sequence? surrogate pair? out of range? */
+    if (uc < min_uc || uc - 0xd800U < 2048U || uc > 0x10ffff)
+        return ~0U;
+
+    return uc;
+}
diff --git a/tests/parser/tst_parser.cpp b/tests/parser/tst_parser.cpp
index ee5f23d..d6653ba 100644
--- a/tests/parser/tst_parser.cpp
+++ b/tests/parser/tst_parser.cpp
@@ -1617,6 +1617,39 @@
QTest::newRow("tag-4294967296") << raw("\xdb\0\0\0\1\0\0\0\0\x60") << int(CborValidateCanonicalFormat) << CborNoError;
// strict mode
+ QTest::newRow("invalid-utf8-1char") << raw("\x61\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-2chars-1") << raw("\x62\xc2\xc0") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-2chars-2") << raw("\x62\xc3\xdf") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-2chars-3") << raw("\x62\xc7\xf0") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-3chars-1") << raw("\x63\xe0\xa0\xc0") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-3chars-2") << raw("\x63\xe0\xc0\xa0") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-4chars-1") << raw("\x64\xf0\x90\x80\xc0") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-4chars-2") << raw("\x64\xf0\x90\xc0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-4chars-3") << raw("\x64\xf0\xc0\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-hi-surrogate") << raw("\x63\xed\xa0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-lo-surrogate") << raw("\x63\xed\xb0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-surrogate-pair") << raw("\x66\xed\xa0\x80\xed\xb0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-non-unicode-1") << raw("\x64\xf4\x90\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-non-unicode-2") << raw("\x65\xf8\x88\x80\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-non-unicode-3") << raw("\x66\xfc\x84\x80\x80\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-non-unicode-4") << raw("\x66\xfd\xbf\xbf\xbf\xbf\xbf") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-fe") << raw("\x61\xfe") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-ff") << raw("\x61\xff") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-overlong-1-2") << raw("\x62\xc1\x81") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-overlong-1-3") << raw("\x63\xe0\x81\x81") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-overlong-1-4") << raw("\x64\xf0\x80\x81\x81") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-overlong-1-5") << raw("\x65\xf8\x80\x80\x81\x81") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-overlong-1-6") << raw("\x66\xfc\x80\x80\x80\x81\x81") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-overlong-2-3") << raw("\x63\xe0\x82\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-overlong-2-4") << raw("\x64\xf0\x80\x82\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-overlong-2-5") << raw("\x65\xf8\x80\x80\x82\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-overlong-2-6") << raw("\x66\xfc\x80\x80\x80\x82\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-overlong-3-4") << raw("\x64\xf0\x80\xa0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-overlong-3-5") << raw("\x65\xf8\x80\x80\xa0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-overlong-3-6") << raw("\x66\xfc\x80\x80\x80\xa0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-overlong-4-5") << raw("\x65\xf8\x80\x84\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+ QTest::newRow("invalid-utf8-overlong-4-6") << raw("\x66\xfc\x80\x80\x84\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+
QTest::newRow("tag-0-unsigned") << raw("\xc0\x00") << int(CborValidateStrictMode) << CborErrorInappropriateTagForType;
QTest::newRow("tag-0-bytearray") << raw("\xc0\x40") << int(CborValidateStrictMode) << CborErrorInappropriateTagForType;
QTest::newRow("tag-0-string") << raw("\xc0\x60") << int(CborValidateStrictMode) << CborNoError;