Added emitUTF8 setting. (#1045)
* Added emitUTF8 setting to emit non-ASCII characters as raw UTF-8 instead of \uXXXX escapes.
* Added a test for emitUTF8, with it in default, on and off states.
* Review comments addressed.
* Merged master into my branch & resolved conflicts.
* Fix clang-format errors.
* Fix clang-format errors.
* Fixed clang-format errors.
* Fixed clang-format errors.
diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp
index e16d84f..519ce23 100644
--- a/src/lib_json/json_writer.cpp
+++ b/src/lib_json/json_writer.cpp
@@ -264,7 +264,8 @@
return result;
}
-static String valueToQuotedStringN(const char* value, unsigned length) {
+static String valueToQuotedStringN(const char* value, unsigned length,
+ bool emitUTF8 = false) {
if (value == nullptr)
return "";
@@ -310,21 +311,31 @@
// Should add a flag to allow this compatibility mode and prevent this
// sequence from occurring.
default: {
- unsigned int cp = utf8ToCodepoint(c, end);
- // don't escape non-control characters
- // (short escape sequence are applied above)
- if (cp < 0x80 && cp >= 0x20)
- result += static_cast<char>(cp);
- else if (cp < 0x10000) { // codepoint is in Basic Multilingual Plane
- result += "\\u";
- result += toHex16Bit(cp);
- } else { // codepoint is not in Basic Multilingual Plane
- // convert to surrogate pair first
- cp -= 0x10000;
- result += "\\u";
- result += toHex16Bit((cp >> 10) + 0xD800);
- result += "\\u";
- result += toHex16Bit((cp & 0x3FF) + 0xDC00);
+ // Control characters (< 0x20) must be escaped even in UTF-8 mode.
+ if (emitUTF8 && static_cast<unsigned char>(*c) >= 0x20) {
+ result += *c;
+ } else {
+ unsigned int codepoint = utf8ToCodepoint(c, end);
+ const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20;
+ const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F;
+ const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000;
+ // don't escape non-control characters
+ // (short escape sequence are applied above)
+ if (FIRST_NON_CONTROL_CODEPOINT <= codepoint &&
+ codepoint <= LAST_NON_CONTROL_CODEPOINT) {
+ result += static_cast<char>(codepoint);
+ } else if (codepoint < FIRST_SURROGATE_PAIR_CODEPOINT) {
+ // codepoint is in Basic Multilingual Plane
+ result += "\\u";
+ result += toHex16Bit(codepoint);
+ } else { // codepoint is not in Basic Multilingual Plane
+ // convert to surrogate pair first
+ codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT;
+ result += "\\u";
+ result += toHex16Bit((codepoint >> 10) + 0xD800);
+ result += "\\u";
+ result += toHex16Bit((codepoint & 0x3FF) + 0xDC00);
+ }
}
} break;
}
@@ -864,7 +875,8 @@
BuiltStyledStreamWriter(String indentation, CommentStyle::Enum cs,
String colonSymbol, String nullSymbol,
String endingLineFeedSymbol, bool useSpecialFloats,
- unsigned int precision, PrecisionType precisionType);
+ bool emitUTF8, unsigned int precision,
+ PrecisionType precisionType);
int write(Value const& root, OStream* sout) override;
private:
@@ -893,19 +905,20 @@
bool addChildValues_ : 1;
bool indented_ : 1;
bool useSpecialFloats_ : 1;
+ bool emitUTF8_ : 1;
unsigned int precision_;
PrecisionType precisionType_;
};
BuiltStyledStreamWriter::BuiltStyledStreamWriter(
String indentation, CommentStyle::Enum cs, String colonSymbol,
String nullSymbol, String endingLineFeedSymbol, bool useSpecialFloats,
- unsigned int precision, PrecisionType precisionType)
+ bool emitUTF8, unsigned int precision, PrecisionType precisionType)
: rightMargin_(74), indentation_(std::move(indentation)), cs_(cs),
colonSymbol_(std::move(colonSymbol)), nullSymbol_(std::move(nullSymbol)),
endingLineFeedSymbol_(std::move(endingLineFeedSymbol)),
addChildValues_(false), indented_(false),
- useSpecialFloats_(useSpecialFloats), precision_(precision),
- precisionType_(precisionType) {}
+ useSpecialFloats_(useSpecialFloats), emitUTF8_(emitUTF8),
+ precision_(precision), precisionType_(precisionType) {}
int BuiltStyledStreamWriter::write(Value const& root, OStream* sout) {
sout_ = sout;
addChildValues_ = false;
@@ -942,7 +955,8 @@
char const* end;
bool ok = value.getString(&str, &end);
if (ok)
- pushValue(valueToQuotedStringN(str, static_cast<unsigned>(end - str)));
+ pushValue(valueToQuotedStringN(str, static_cast<unsigned>(end - str),
+ emitUTF8_));
else
pushValue("");
break;
@@ -966,7 +980,7 @@
Value const& childValue = value[name];
writeCommentBeforeValue(childValue);
writeWithIndent(valueToQuotedStringN(
- name.data(), static_cast<unsigned>(name.length())));
+ name.data(), static_cast<unsigned>(name.length()), emitUTF8_));
*sout_ << colonSymbol_;
writeValue(childValue);
if (++it == members.end()) {
@@ -1142,12 +1156,13 @@
StreamWriterBuilder::StreamWriterBuilder() { setDefaults(&settings_); }
StreamWriterBuilder::~StreamWriterBuilder() = default;
StreamWriter* StreamWriterBuilder::newStreamWriter() const {
- String indentation = settings_["indentation"].asString();
- String cs_str = settings_["commentStyle"].asString();
- String pt_str = settings_["precisionType"].asString();
- bool eyc = settings_["enableYAMLCompatibility"].asBool();
- bool dnp = settings_["dropNullPlaceholders"].asBool();
- bool usf = settings_["useSpecialFloats"].asBool();
+ const String indentation = settings_["indentation"].asString();
+ const String cs_str = settings_["commentStyle"].asString();
+ const String pt_str = settings_["precisionType"].asString();
+ const bool eyc = settings_["enableYAMLCompatibility"].asBool();
+ const bool dnp = settings_["dropNullPlaceholders"].asBool();
+ const bool usf = settings_["useSpecialFloats"].asBool();
+ const bool emitUTF8 = settings_["emitUTF8"].asBool();
unsigned int pre = settings_["precision"].asUInt();
CommentStyle::Enum cs = CommentStyle::All;
if (cs_str == "All") {
@@ -1179,7 +1194,7 @@
pre = 17;
String endingLineFeedSymbol;
return new BuiltStyledStreamWriter(indentation, cs, colonSymbol, nullSymbol,
- endingLineFeedSymbol, usf, pre,
+ endingLineFeedSymbol, usf, emitUTF8, pre,
precisionType);
}
static void getValidWriterKeys(std::set<String>* valid_keys) {
@@ -1189,6 +1204,7 @@
valid_keys->insert("enableYAMLCompatibility");
valid_keys->insert("dropNullPlaceholders");
valid_keys->insert("useSpecialFloats");
+ valid_keys->insert("emitUTF8");
valid_keys->insert("precision");
valid_keys->insert("precisionType");
}
@@ -1220,6 +1236,7 @@
(*settings)["enableYAMLCompatibility"] = false;
(*settings)["dropNullPlaceholders"] = false;
(*settings)["useSpecialFloats"] = false;
+ (*settings)["emitUTF8"] = false;
(*settings)["precision"] = 17;
(*settings)["precisionType"] = "significant";
//! [StreamWriterBuilderDefaults]
diff --git a/src/test_lib_json/main.cpp b/src/test_lib_json/main.cpp
index f32a11f..326519f 100644
--- a/src/test_lib_json/main.cpp
+++ b/src/test_lib_json/main.cpp
@@ -2481,6 +2481,35 @@
}
}
+JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) {
+ // Expected document when non-ASCII code points are \u-escaped.
+ const char* const escaped =
+ "{\n\t\"test\" : "
+ "\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}";
+ // Expected document when multi-byte UTF-8 sequences are passed through.
+ const char* const verbatim =
+ "{\n\t\"test\" : "
+ "\"\\t\\n\xF0\x91\xA2\xA1=\xC4\xB3\xF0\x9B\x84\x9B\xEF\xBD\xA7\"\n}";
+
+ // UTF-8 text preceded by a tab and a newline, which need a short escape
+ // in every mode.
+ Json::Value root;
+ root["test"] = "\t\n\xF0\x91\xA2\xA1\x3D\xC4\xB3\xF0\x9B\x84\x9B\xEF\xBD\xA7";
+
+ Json::StreamWriterBuilder b;
+
+ // Default settings - should be unicode escaped.
+ JSONTEST_ASSERT(Json::writeString(b, root) == escaped);
+
+ // Setting switched on - should not be unicode escaped.
+ b.settings_["emitUTF8"] = true;
+ JSONTEST_ASSERT(Json::writeString(b, root) == verbatim);
+
+ // Setting switched back off - should be unicode escaped again.
+ b.settings_["emitUTF8"] = false;
+ JSONTEST_ASSERT(Json::writeString(b, root) == escaped);
+}
+
struct ReaderTest : JsonTest::TestCase {};
JSONTEST_FIXTURE_LOCAL(ReaderTest, parseWithNoErrors) {