Validation: add UTF-8 validation support

This implements:
 - CborValidateUtf8

Signed-off-by: Thiago Macieira <thiago.macieira@intel.com>
diff --git a/src/cborpretty.c b/src/cborpretty.c
index 12c7d81..b4570b2 100644
--- a/src/cborpretty.c
+++ b/src/cborpretty.c
@@ -31,6 +31,7 @@
 #include "cbor.h"
 #include "cborinternal_p.h"
 #include "compilersupport_p.h"
+#include "utf8_p.h"
 
 #include <float.h>
 #include <inttypes.h>
@@ -161,10 +162,13 @@
  * On UTF-8 decoding error, it returns CborErrorInvalidUtf8TextString */
 static CborError utf8EscapedDump(FILE *out, const void *ptr, size_t n)
 {
-    const char *buffer = (const char *)ptr;
-    uint32_t uc;
-    while (n--) {
-        uc = (uint8_t)*buffer++;
+    const uint8_t *buffer = (const uint8_t *)ptr;
+    const uint8_t * const end = buffer + n;
+    while (buffer < end) {
+        uint32_t uc = get_utf8(&buffer, end);
+        if (uc == ~0U)
+            return CborErrorInvalidUtf8TextString;
+
         if (uc < 0x80) {
             /* single-byte UTF-8 */
             if (uc < 0x7f && uc >= 0x20 && uc != '\\' && uc != '"') {
@@ -202,65 +206,8 @@
             continue;
         }
 
-        /* multi-byte UTF-8, decode it */
-        unsigned charsNeeded;
-        uint32_t min_uc;
-        if (unlikely(uc <= 0xC1))
-            return CborErrorInvalidUtf8TextString;
-        if (uc < 0xE0) {
-            /* two-byte UTF-8 */
-            charsNeeded = 2;
-            min_uc = 0x80;
-            uc &= 0x1f;
-        } else if (uc < 0xF0) {
-            /* three-byte UTF-8 */
-            charsNeeded = 3;
-            min_uc = 0x800;
-            uc &= 0x0f;
-        } else if (uc < 0xF5) {
-            /* four-byte UTF-8 */
-            charsNeeded = 4;
-            min_uc = 0x10000;
-            uc &= 0x07;
-        } else {
-            return CborErrorInvalidUtf8TextString;
-        }
-
-        if (n < charsNeeded - 1)
-            return CborErrorInvalidUtf8TextString;
-        n -= charsNeeded - 1;
-
-        /* first continuation character */
-        uint8_t b = (uint8_t)*buffer++;
-        if ((b & 0xc0) != 0x80)
-            return CborErrorInvalidUtf8TextString;
-        uc <<= 6;
-        uc |= b & 0x3f;
-
-        if (charsNeeded > 2) {
-            /* second continuation character */
-            b = (uint8_t)*buffer++;
-            if ((b & 0xc0) != 0x80)
-                return CborErrorInvalidUtf8TextString;
-            uc <<= 6;
-            uc |= b & 0x3f;
-
-            if (charsNeeded > 3) {
-                /* third continuation character */
-                b = (uint8_t)*buffer++;
-                if ((b & 0xc0) != 0x80)
-                    return CborErrorInvalidUtf8TextString;
-                uc <<= 6;
-                uc |= b & 0x3f;
-            }
-        }
-
-        /* overlong sequence? surrogate pair? out or range? */
-        if (uc < min_uc || uc - 0xd800U < 2048U || uc > 0x10ffff)
-            return CborErrorInvalidUtf8TextString;
-
         /* now print the sequence */
-        if (charsNeeded > 3) {
+        if (uc > 0xffffU) {
             /* needs surrogate pairs */
             if (fprintf(out, "\\u%04" PRIX32 "\\u%04" PRIX32,
                         (uc >> 10) + 0xd7c0,    /* high surrogate */
diff --git a/src/cborvalidation.c b/src/cborvalidation.c
index 7bbf344..0f28ce9 100644
--- a/src/cborvalidation.c
+++ b/src/cborvalidation.c
@@ -31,6 +31,7 @@
 #include "cbor.h"
 #include "cborinternal_p.h"
 #include "compilersupport_p.h"
+#include "utf8_p.h"
 
 #include <string.h>
 
@@ -227,6 +228,18 @@
 
 static CborError validate_value(CborValue *it, int flags, int recursionLeft);
 
+/* Walks the n bytes at ptr with get_utf8(); fails on the first ~0U result. */
+static inline CborError validate_utf8_string(const void *ptr, size_t n)
+{
+    const uint8_t *cursor = (const uint8_t *)ptr;
+    const uint8_t * const limit = cursor + n;
+    while (cursor < limit) {
+        if (get_utf8(&cursor, limit) == ~0U)
+            return CborErrorInvalidUtf8TextString;
+    }
+    return CborNoError;
+}
+
 static inline CborError validate_simple_type(uint8_t simple_type, int flags)
 {
     /* At current time, all known simple types are those from RFC 7049,
@@ -421,6 +434,12 @@
                 return err;
             if (!ptr)
                 break;
+
+            if (type == CborTextStringType && flags & CborValidateUtf8) {
+                err = validate_utf8_string(ptr, n);
+                if (err)
+                    return err;
+            }
         }
 
         return CborNoError;
diff --git a/src/utf8_p.h b/src/utf8_p.h
new file mode 100644
index 0000000..04bf913
--- /dev/null
+++ b/src/utf8_p.h
@@ -0,0 +1,99 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 Intel Corporation
+**
+** Permission is hereby granted, free of charge, to any person obtaining a copy
+** of this software and associated documentation files (the "Software"), to deal
+** in the Software without restriction, including without limitation the rights
+** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+** copies of the Software, and to permit persons to whom the Software is
+** furnished to do so, subject to the following conditions:
+**
+** The above copyright notice and this permission notice shall be included in
+** all copies or substantial portions of the Software.
+**
+** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+** THE SOFTWARE.
+**
+****************************************************************************/
+
+#include "compilersupport_p.h"
+
+#include <stdint.h>
+
+/*
+ * Decodes one UTF-8 encoded code point from the range [*buffer, end), where
+ * end points one past the last readable byte.
+ *
+ * *buffer is advanced past every byte that was examined, including on
+ * failure, so callers must not resume decoding after an error.
+ *
+ * Returns the decoded code point (at most 0x10ffff), or ~0U if the range is
+ * empty, the sequence is truncated or malformed, is an overlong encoding,
+ * encodes a surrogate (U+D800..U+DFFF) or lies beyond U+10FFFF.
+ */
+static inline uint32_t get_utf8(const uint8_t **buffer, const uint8_t *end)
+{
+    uint32_t uc;
+    ptrdiff_t n = end - *buffer;
+    if (n <= 0)
+        return ~0U;
+
+    uc = *(*buffer)++;
+    if (uc < 0x80) {
+        /* single-byte UTF-8 */
+        return uc;
+    }
+
+    /* multi-byte UTF-8, decode it */
+    int charsNeeded;
+    uint32_t min_uc;
+    if (unlikely(uc <= 0xC1)) {
+        /* stray continuation byte (0x80-0xBF) or overlong lead (0xC0, 0xC1) */
+        return ~0U;
+    }
+    if (uc < 0xE0) {
+        /* two-byte UTF-8 */
+        charsNeeded = 2;
+        min_uc = 0x80;
+        uc &= 0x1f;
+    } else if (uc < 0xF0) {
+        /* three-byte UTF-8 */
+        charsNeeded = 3;
+        min_uc = 0x800;
+        uc &= 0x0f;
+    } else if (uc < 0xF5) {
+        /* four-byte UTF-8 */
+        charsNeeded = 4;
+        min_uc = 0x10000;
+        uc &= 0x07;
+    } else {
+        return ~0U;
+    }
+
+    /* n was measured before the lead byte was consumed, so it counts that
+     * byte too: the whole sequence needs charsNeeded bytes. Comparing against
+     * charsNeeded - 1 would read one byte past end on truncated input. */
+    if (n < charsNeeded)
+        return ~0U;
+
+    /* continuation bytes: each must be 10xxxxxx and carries six payload bits */
+    for (int i = 1; i < charsNeeded; ++i) {
+        uint8_t b = *(*buffer)++;
+        if ((b & 0xc0) != 0x80)
+            return ~0U;
+        uc <<= 6;
+        uc |= b & 0x3f;
+    }
+
+    /* overlong sequence? surrogate pair? out of range? */
+    if (uc < min_uc || uc - 0xd800U < 2048U || uc > 0x10ffff)
+        return ~0U;
+
+    return uc;
+}
diff --git a/tests/parser/tst_parser.cpp b/tests/parser/tst_parser.cpp
index ee5f23d..d6653ba 100644
--- a/tests/parser/tst_parser.cpp
+++ b/tests/parser/tst_parser.cpp
@@ -1617,6 +1617,39 @@
     QTest::newRow("tag-4294967296") << raw("\xdb\0\0\0\1\0\0\0\0\x60") << int(CborValidateCanonicalFormat) << CborNoError;
 
     // strict mode
+    QTest::newRow("invalid-utf8-1char") << raw("\x61\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-2chars-1") << raw("\x62\xc2\xc0") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-2chars-2") << raw("\x62\xc3\xdf") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-2chars-3") << raw("\x62\xc7\xf0") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-3chars-1") << raw("\x63\xe0\xa0\xc0") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-3chars-2") << raw("\x63\xe0\xc0\xa0") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-4chars-1") << raw("\x64\xf0\x90\x80\xc0") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-4chars-2") << raw("\x64\xf0\x90\xc0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-4chars-3") << raw("\x64\xf0\xc0\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-hi-surrogate") << raw("\x63\xed\xa0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-lo-surrogate") << raw("\x63\xed\xb0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-surrogate-pair") << raw("\x66\xed\xa0\x80\xed\xb0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-non-unicode-1") << raw("\x64\xf4\x90\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-non-unicode-2") << raw("\x65\xf8\x88\x80\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-non-unicode-3") << raw("\x66\xfc\x84\x80\x80\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-non-unicode-4") << raw("\x66\xfd\xbf\xbf\xbf\xbf\xbf") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-fe") << raw("\x61\xfe") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-ff") << raw("\x61\xff") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-overlong-1-2") << raw("\x62\xc1\x81") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-overlong-1-3") << raw("\x63\xe0\x81\x81") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-overlong-1-4") << raw("\x64\xf0\x80\x81\x81") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-overlong-1-5") << raw("\x65\xf8\x80\x80\x81\x81") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-overlong-1-6") << raw("\x66\xfc\x80\x80\x80\x81\x81") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-overlong-2-3") << raw("\x63\xe0\x82\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-overlong-2-4") << raw("\x64\xf0\x80\x82\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-overlong-2-5") << raw("\x65\xf8\x80\x80\x82\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-overlong-2-6") << raw("\x66\xfc\x80\x80\x80\x82\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-overlong-3-4") << raw("\x64\xf0\x80\xa0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-overlong-3-5") << raw("\x65\xf8\x80\x80\xa0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-overlong-3-6") << raw("\x66\xfc\x80\x80\x80\xa0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-overlong-4-5") << raw("\x65\xf8\x80\x84\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+    QTest::newRow("invalid-utf8-overlong-4-6") << raw("\x66\xfc\x80\x80\x84\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
+
     QTest::newRow("tag-0-unsigned") << raw("\xc0\x00") << int(CborValidateStrictMode) << CborErrorInappropriateTagForType;
     QTest::newRow("tag-0-bytearray") << raw("\xc0\x40") << int(CborValidateStrictMode) << CborErrorInappropriateTagForType;
     QTest::newRow("tag-0-string") << raw("\xc0\x60") << int(CborValidateStrictMode) << CborNoError;