Serialize UTF-8 string with Unicode escapes (#687) Squashed and merged.

commit: 42a161fc80b32cc63c1a3e7ab1c9ed588a83edaa [log] [tgz]
author: Paweł Kierski <pkierski@gmail.com> Wed Oct 04 03:19:20 2017 +0200
committer: Christopher Dunn <cdunn2001@gmail.com> Tue Oct 03 18:19:20 2017 -0700
tree: 6eac4c283519709f9eb251ef4fcade0866302a72
parent: a3a40593670d16a534a94a81063ff54201c9a63b [diff]
diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp
index 802f96c..0852b48 100644
--- a/src/lib_json/json_writer.cpp
+++ b/src/lib_json/json_writer.cpp

@@ -81,24 +81,6 @@
 typedef std::auto_ptr<StreamWriter>   StreamWriterPtr;
 #endif
 
-static bool containsControlCharacter(const char* str) {
-  while (*str) {
-    if (isControlCharacter(*(str++)))
-      return true;
-  }
-  return false;
-}
-
-static bool containsControlCharacter0(const char* str, unsigned len) {
-  char const* end = str + len;
-  while (end != str) {
-    if (isControlCharacter(*str) || 0==*str)
-      return true;
-    ++str;
-  }
-  return false;
-}
-
 JSONCPP_STRING valueToString(LargestInt value) {
   UIntToStringBuffer buffer;
   char* current = buffer + sizeof(buffer);
@@ -176,89 +158,103 @@
 
 JSONCPP_STRING valueToString(bool value) { return value ? "true" : "false"; }
 
-JSONCPP_STRING valueToQuotedString(const char* value) {
-  if (value == NULL)
-    return "";
-  // Not sure how to handle unicode...
-  if (strpbrk(value, "\"\\\b\f\n\r\t") == NULL &&
-      !containsControlCharacter(value))
-    return JSONCPP_STRING("\"") + value + "\"";
-  // We have to walk value and escape any special characters.
-  // Appending to JSONCPP_STRING is not efficient, but this should be rare.
-  // (Note: forward slashes are *not* rare, but I am not escaping them.)
-  JSONCPP_STRING::size_type maxsize =
-      strlen(value) * 2 + 3; // allescaped+quotes+NULL
-  JSONCPP_STRING result;
-  result.reserve(maxsize); // to avoid lots of mallocs
-  result += "\"";
-  for (const char* c = value; *c != 0; ++c) {
-    switch (*c) {
-    case '\"':
-      result += "\\\"";
-      break;
-    case '\\':
-      result += "\\\\";
-      break;
-    case '\b':
-      result += "\\b";
-      break;
-    case '\f':
-      result += "\\f";
-      break;
-    case '\n':
-      result += "\\n";
-      break;
-    case '\r':
-      result += "\\r";
-      break;
-    case '\t':
-      result += "\\t";
-      break;
-    // case '/':
-    // Even though \/ is considered a legal escape in JSON, a bare
-    // slash is also legal, so I see no reason to escape it.
-    // (I hope I am not misunderstanding something.
-    // blep notes: actually escaping \/ may be useful in javascript to avoid </
-    // sequence.
-    // Should add a flag to allow this compatibility mode and prevent this
-    // sequence from occurring.
-    default:
-      if (isControlCharacter(*c)) {
-        JSONCPP_OSTRINGSTREAM oss;
-        oss << "\\u" << std::hex << std::uppercase << std::setfill('0')
-            << std::setw(4) << static_cast<int>(*c);
-        result += oss.str();
-      } else {
-        result += *c;
-      }
-      break;
-    }
-  }
-  result += "\"";
-  return result;
-}
-
-// https://github.com/upcaste/upcaste/blob/master/src/upcore/src/cstring/strnpbrk.cpp
-static char const* strnpbrk(char const* s, char const* accept, size_t n) {
-  assert((s || !n) && accept);
+static bool isAnyCharRequiredQuoting(char const* s, size_t n) {
+  assert(s || !n);
 
   char const* const end = s + n;
   for (char const* cur = s; cur < end; ++cur) {
-    int const c = *cur;
-    for (char const* a = accept; *a; ++a) {
-      if (*a == c) {
-        return cur;
-      }
-    }
+    if (*cur == '\\' || *cur == '\"' || *cur < ' '
+      || static_cast<unsigned char>(*cur) < 0x80)
+      return true;
   }
-  return NULL;
+  return false;
 }
+
+static unsigned int utf8ToCodepoint(const char*& s, const char* e) {
+  const unsigned int REPLACEMENT_CHARACTER = 0xFFFD;
+
+  unsigned int firstByte = static_cast<unsigned char>(*s);
+
+  if (firstByte < 0x80)
+    return firstByte;
+
+  if (firstByte < 0xE0) {
+    if (e - s < 2)
+      return REPLACEMENT_CHARACTER;
+
+    unsigned int calculated = ((firstByte & 0x1F) << 6)
+      | (static_cast<unsigned int>(s[1]) & 0x3F);
+    s += 1;
+    // oversized encoded characters are invalid
+    return calculated < 0x80 ? REPLACEMENT_CHARACTER : calculated;
+  }
+
+  if (firstByte < 0xF0) {
+    if (e - s < 3)
+      return REPLACEMENT_CHARACTER;
+
+    unsigned int calculated = ((firstByte & 0x0F) << 12)
+      | ((static_cast<unsigned int>(s[1]) & 0x3F) << 6)
+      |  (static_cast<unsigned int>(s[2]) & 0x3F);
+    s += 2;
+    // surrogates aren't valid codepoints itself
+    // shouldn't be UTF-8 encoded
+    if (calculated >= 0xD800 && calculated >= 0xDFFF)
+      return REPLACEMENT_CHARACTER;
+    // oversized encoded characters are invalid
+    return calculated < 0x800 ? REPLACEMENT_CHARACTER : calculated;
+  }
+
+  if (firstByte < 0xF8) {
+    if (e - s < 4)
+      return REPLACEMENT_CHARACTER;
+
+    unsigned int calculated = ((firstByte & 0x07) << 24)
+      | ((static_cast<unsigned int>(s[1]) & 0x3F) << 12)
+      | ((static_cast<unsigned int>(s[2]) & 0x3F) << 6)
+      |  (static_cast<unsigned int>(s[3]) & 0x3F);
+    s += 3;
+    // oversized encoded characters are invalid
+    return calculated < 0x10000 ? REPLACEMENT_CHARACTER : calculated;
+  }
+
+  return REPLACEMENT_CHARACTER;
+}
+
+static const char hex2[] =
+  "000102030405060708090a0b0c0d0e0f"
+  "101112131415161718191a1b1c1d1e1f"
+  "202122232425262728292a2b2c2d2e2f"
+  "303132333435363738393a3b3c3d3e3f"
+  "404142434445464748494a4b4c4d4e4f"
+  "505152535455565758595a5b5c5d5e5f"
+  "606162636465666768696a6b6c6d6e6f"
+  "707172737475767778797a7b7c7d7e7f"
+  "808182838485868788898a8b8c8d8e8f"
+  "909192939495969798999a9b9c9d9e9f"
+  "a0a1a2a3a4a5a6a7a8a9aaabacadaeaf"
+  "b0b1b2b3b4b5b6b7b8b9babbbcbdbebf"
+  "c0c1c2c3c4c5c6c7c8c9cacbcccdcecf"
+  "d0d1d2d3d4d5d6d7d8d9dadbdcdddedf"
+  "e0e1e2e3e4e5e6e7e8e9eaebecedeeef"
+  "f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff";
+
+static JSONCPP_STRING toHex16Bit(unsigned int x) {
+  const unsigned int hi = (x >> 8) & 0xff;
+  const unsigned int lo = x & 0xff;
+  JSONCPP_STRING result(4, ' ');
+  result[0] = hex2[2 * hi];
+  result[1] = hex2[2 * hi + 1];
+  result[2] = hex2[2 * lo];
+  result[3] = hex2[2 * lo + 1];
+  return result;
+}
+
 static JSONCPP_STRING valueToQuotedStringN(const char* value, unsigned length) {
   if (value == NULL)
     return "";
-  // Not sure how to handle unicode...
-  if (strnpbrk(value, "\"\\\b\f\n\r\t", length) == NULL &&
-      !containsControlCharacter0(value, length))
+
+  if (!isAnyCharRequiredQuoting(value, length))
     return JSONCPP_STRING("\"") + value + "\"";
   // We have to walk value and escape any special characters.
   // Appending to JSONCPP_STRING is not efficient, but this should be rare.
@@ -300,14 +296,24 @@
     // sequence.
     // Should add a flag to allow this compatibility mode and prevent this
     // sequence from occurring.
-    default:
-      if ((isControlCharacter(*c)) || (*c == 0)) {
-        JSONCPP_OSTRINGSTREAM oss;
-        oss << "\\u" << std::hex << std::uppercase << std::setfill('0')
-            << std::setw(4) << static_cast<int>(*c);
-        result += oss.str();
-      } else {
-        result += *c;
+    default: {
+        unsigned int cp = utf8ToCodepoint(c, end);
+        // don't escape non-control characters
+        // (short escape sequence are applied above)
+        if (cp < 0x80 && cp >= 0x20)
+          result += static_cast<char>(cp);
+        else if (cp < 0x10000) { // codepoint is in Basic Multilingual Plane
+          result += "\\u";
+          result += toHex16Bit(cp);
+        }
+        else { // codepoint is not in Basic Multilingual Plane
+               // convert to surrogate pair first
+          cp -= 0x10000;
+          result += "\\u";
+          result += toHex16Bit((cp >> 10) + 0xD800);
+          result += "\\u";
+          result += toHex16Bit((cp & 0x3FF) + 0xDC00);
+        }
       }
       break;
     }
@@ -316,6 +322,10 @@
   return result;
 }
 
+JSONCPP_STRING valueToQuotedString(const char* value) {
+  return valueToQuotedStringN(value, static_cast<unsigned int>(strlen(value)));
+}
+
 // Class Writer
 // //////////////////////////////////////////////////////////////////
 Writer::~Writer() {}
commit	42a161fc80b32cc63c1a3e7ab1c9ed588a83edaa	[log] [tgz]
author	Paweł Kierski <pkierski@gmail.com>	Wed Oct 04 03:19:20 2017 +0200
committer	Christopher Dunn <cdunn2001@gmail.com>	Tue Oct 03 18:19:20 2017 -0700
tree	6eac4c283519709f9eb251ef4fcade0866302a72
parent	a3a40593670d16a534a94a81063ff54201c9a63b [diff]