Serialize UTF-8 string with Unicode escapes (#687)
Squashed and merged.
diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp
index 802f96c..0852b48 100644
--- a/src/lib_json/json_writer.cpp
+++ b/src/lib_json/json_writer.cpp
@@ -81,24 +81,6 @@
typedef std::auto_ptr<StreamWriter> StreamWriterPtr;
#endif
-static bool containsControlCharacter(const char* str) {
- while (*str) {
- if (isControlCharacter(*(str++)))
- return true;
- }
- return false;
-}
-
-static bool containsControlCharacter0(const char* str, unsigned len) {
- char const* end = str + len;
- while (end != str) {
- if (isControlCharacter(*str) || 0==*str)
- return true;
- ++str;
- }
- return false;
-}
-
JSONCPP_STRING valueToString(LargestInt value) {
UIntToStringBuffer buffer;
char* current = buffer + sizeof(buffer);
@@ -176,89 +158,103 @@
JSONCPP_STRING valueToString(bool value) { return value ? "true" : "false"; }
-JSONCPP_STRING valueToQuotedString(const char* value) {
- if (value == NULL)
- return "";
- // Not sure how to handle unicode...
- if (strpbrk(value, "\"\\\b\f\n\r\t") == NULL &&
- !containsControlCharacter(value))
- return JSONCPP_STRING("\"") + value + "\"";
- // We have to walk value and escape any special characters.
- // Appending to JSONCPP_STRING is not efficient, but this should be rare.
- // (Note: forward slashes are *not* rare, but I am not escaping them.)
- JSONCPP_STRING::size_type maxsize =
- strlen(value) * 2 + 3; // allescaped+quotes+NULL
- JSONCPP_STRING result;
- result.reserve(maxsize); // to avoid lots of mallocs
- result += "\"";
- for (const char* c = value; *c != 0; ++c) {
- switch (*c) {
- case '\"':
- result += "\\\"";
- break;
- case '\\':
- result += "\\\\";
- break;
- case '\b':
- result += "\\b";
- break;
- case '\f':
- result += "\\f";
- break;
- case '\n':
- result += "\\n";
- break;
- case '\r':
- result += "\\r";
- break;
- case '\t':
- result += "\\t";
- break;
- // case '/':
- // Even though \/ is considered a legal escape in JSON, a bare
- // slash is also legal, so I see no reason to escape it.
- // (I hope I am not misunderstanding something.
- // blep notes: actually escaping \/ may be useful in javascript to avoid </
- // sequence.
- // Should add a flag to allow this compatibility mode and prevent this
- // sequence from occurring.
- default:
- if (isControlCharacter(*c)) {
- JSONCPP_OSTRINGSTREAM oss;
- oss << "\\u" << std::hex << std::uppercase << std::setfill('0')
- << std::setw(4) << static_cast<int>(*c);
- result += oss.str();
- } else {
- result += *c;
- }
- break;
- }
- }
- result += "\"";
- return result;
-}
-
-// https://github.com/upcaste/upcaste/blob/master/src/upcore/src/cstring/strnpbrk.cpp
-static char const* strnpbrk(char const* s, char const* accept, size_t n) {
- assert((s || !n) && accept);
+static bool isAnyCharRequiredQuoting(char const* s, size_t n) {
+ assert(s || !n);
char const* const end = s + n;
for (char const* cur = s; cur < end; ++cur) {
- int const c = *cur;
- for (char const* a = accept; *a; ++a) {
- if (*a == c) {
- return cur;
- }
- }
+ if (*cur == '\\' || *cur == '\"' || *cur < ' '
+ || static_cast<unsigned char>(*cur) < 0x80)
+ return true;
}
- return NULL;
+ return false;
}
+
+static unsigned int utf8ToCodepoint(const char*& s, const char* e) {
+ const unsigned int REPLACEMENT_CHARACTER = 0xFFFD;
+
+ unsigned int firstByte = static_cast<unsigned char>(*s);
+
+ if (firstByte < 0x80)
+ return firstByte;
+
+ if (firstByte < 0xE0) {
+ if (e - s < 2)
+ return REPLACEMENT_CHARACTER;
+
+ unsigned int calculated = ((firstByte & 0x1F) << 6)
+ | (static_cast<unsigned int>(s[1]) & 0x3F);
+ s += 1;
+ // oversized encoded characters are invalid
+ return calculated < 0x80 ? REPLACEMENT_CHARACTER : calculated;
+ }
+
+ if (firstByte < 0xF0) {
+ if (e - s < 3)
+ return REPLACEMENT_CHARACTER;
+
+ unsigned int calculated = ((firstByte & 0x0F) << 12)
+ | ((static_cast<unsigned int>(s[1]) & 0x3F) << 6)
+ | (static_cast<unsigned int>(s[2]) & 0x3F);
+ s += 2;
+ // surrogates aren't valid codepoints itself
+ // shouldn't be UTF-8 encoded
+ if (calculated >= 0xD800 && calculated >= 0xDFFF)
+ return REPLACEMENT_CHARACTER;
+ // oversized encoded characters are invalid
+ return calculated < 0x800 ? REPLACEMENT_CHARACTER : calculated;
+ }
+
+ if (firstByte < 0xF8) {
+ if (e - s < 4)
+ return REPLACEMENT_CHARACTER;
+
+ unsigned int calculated = ((firstByte & 0x07) << 24)
+ | ((static_cast<unsigned int>(s[1]) & 0x3F) << 12)
+ | ((static_cast<unsigned int>(s[2]) & 0x3F) << 6)
+ | (static_cast<unsigned int>(s[3]) & 0x3F);
+ s += 3;
+ // oversized encoded characters are invalid
+ return calculated < 0x10000 ? REPLACEMENT_CHARACTER : calculated;
+ }
+
+ return REPLACEMENT_CHARACTER;
+}
+
+static const char hex2[] =
+ "000102030405060708090a0b0c0d0e0f"
+ "101112131415161718191a1b1c1d1e1f"
+ "202122232425262728292a2b2c2d2e2f"
+ "303132333435363738393a3b3c3d3e3f"
+ "404142434445464748494a4b4c4d4e4f"
+ "505152535455565758595a5b5c5d5e5f"
+ "606162636465666768696a6b6c6d6e6f"
+ "707172737475767778797a7b7c7d7e7f"
+ "808182838485868788898a8b8c8d8e8f"
+ "909192939495969798999a9b9c9d9e9f"
+ "a0a1a2a3a4a5a6a7a8a9aaabacadaeaf"
+ "b0b1b2b3b4b5b6b7b8b9babbbcbdbebf"
+ "c0c1c2c3c4c5c6c7c8c9cacbcccdcecf"
+ "d0d1d2d3d4d5d6d7d8d9dadbdcdddedf"
+ "e0e1e2e3e4e5e6e7e8e9eaebecedeeef"
+ "f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff";
+
+static JSONCPP_STRING toHex16Bit(unsigned int x) {
+ const unsigned int hi = (x >> 8) & 0xff;
+ const unsigned int lo = x & 0xff;
+ JSONCPP_STRING result(4, ' ');
+ result[0] = hex2[2 * hi];
+ result[1] = hex2[2 * hi + 1];
+ result[2] = hex2[2 * lo];
+ result[3] = hex2[2 * lo + 1];
+ return result;
+}
+
static JSONCPP_STRING valueToQuotedStringN(const char* value, unsigned length) {
if (value == NULL)
return "";
- // Not sure how to handle unicode...
- if (strnpbrk(value, "\"\\\b\f\n\r\t", length) == NULL &&
- !containsControlCharacter0(value, length))
+
+ if (!isAnyCharRequiredQuoting(value, length))
return JSONCPP_STRING("\"") + value + "\"";
// We have to walk value and escape any special characters.
// Appending to JSONCPP_STRING is not efficient, but this should be rare.
@@ -300,14 +296,24 @@
// sequence.
// Should add a flag to allow this compatibility mode and prevent this
// sequence from occurring.
- default:
- if ((isControlCharacter(*c)) || (*c == 0)) {
- JSONCPP_OSTRINGSTREAM oss;
- oss << "\\u" << std::hex << std::uppercase << std::setfill('0')
- << std::setw(4) << static_cast<int>(*c);
- result += oss.str();
- } else {
- result += *c;
+ default: {
+ unsigned int cp = utf8ToCodepoint(c, end);
+ // don't escape non-control characters
+ // (short escape sequence are applied above)
+ if (cp < 0x80 && cp >= 0x20)
+ result += static_cast<char>(cp);
+ else if (cp < 0x10000) { // codepoint is in Basic Multilingual Plane
+ result += "\\u";
+ result += toHex16Bit(cp);
+ }
+ else { // codepoint is not in Basic Multilingual Plane
+ // convert to surrogate pair first
+ cp -= 0x10000;
+ result += "\\u";
+ result += toHex16Bit((cp >> 10) + 0xD800);
+ result += "\\u";
+ result += toHex16Bit((cp & 0x3FF) + 0xDC00);
+ }
}
break;
}
@@ -316,6 +322,10 @@
return result;
}
+JSONCPP_STRING valueToQuotedString(const char* value) {
+ return valueToQuotedStringN(value, static_cast<unsigned int>(strlen(value)));
+}
+
// Class Writer
// //////////////////////////////////////////////////////////////////
Writer::~Writer() {}