Implementing support for reading and writing Unicode escape sequences.
diff --git a/include/json/reader.h b/include/json/reader.h
index f1bc5a2..e113569 100644
--- a/include/json/reader.h
+++ b/include/json/reader.h
@@ -115,6 +115,10 @@
bool decodeString( Token &token );
bool decodeString( Token &token, std::string &decoded );
bool decodeDouble( Token &token );
+ bool decodeUnicodeCodePoint( Token &token,
+ Location &current,
+ Location end,
+ unsigned int &unicode );
bool decodeUnicodeEscapeSequence( Token &token,
Location &current,
Location end,
diff --git a/src/lib_json/json_reader.cpp b/src/lib_json/json_reader.cpp
index 9869686..0e0c2ff 100644
--- a/src/lib_json/json_reader.cpp
+++ b/src/lib_json/json_reader.cpp
@@ -36,6 +36,42 @@
return false;
}
+/// Converts the Unicode code point cp into its UTF-8 byte sequence.
+/// Byte layout follows http://en.wikipedia.org/wiki/UTF-8; any value above
+/// 0x10FFFF yields an empty string.
+static std::string codePointToUTF8(unsigned int cp)
+{
+   // Single byte: the code point is stored as-is.
+   if (cp <= 0x7f)
+      return std::string(1, static_cast<char>(cp));
+
+   // Pick the sequence length plus the marker and payload mask of the lead byte.
+   std::string::size_type length;
+   unsigned int leadMarker;
+   unsigned int leadMask;
+   if (cp <= 0x7FF)
+   {
+      length = 2; leadMarker = 0xC0; leadMask = 0x1f;
+   }
+   else if (cp <= 0xFFFF)
+   {
+      length = 3; leadMarker = 0xE0; leadMask = 0xf;
+   }
+   else if (cp <= 0x10FFFF)
+   {
+      length = 4; leadMarker = 0xF0; leadMask = 0x7;
+   }
+   else
+      return std::string();
+
+   // Fill from the tail: every continuation byte carries six payload bits.
+   std::string utf8(length, '\0');
+   for (std::string::size_type index = length - 1; index > 0; --index, cp >>= 6)
+      utf8[index] = static_cast<char>(0x80 | (0x3f & cp));
+   utf8[0] = static_cast<char>(leadMarker | (leadMask & cp));
+   return utf8;
+}
+
// Class Reader
// //////////////////////////////////////////////////////////////////
@@ -577,10 +613,9 @@
case 'u':
{
unsigned int unicode;
- if ( !decodeUnicodeEscapeSequence( token, current, end, unicode ) )
+ if ( !decodeUnicodeCodePoint( token, current, end, unicode ) )
return false;
- // @todo encode unicode as utf8.
- // @todo remember to alter the writer too.
+ decoded += codePointToUTF8(unicode);
}
break;
default:
@@ -595,6 +630,35 @@
return true;
}
+/// Reads one Unicode code point from an escape sequence, combining a UTF-16
+/// surrogate pair when the first escape is a leading surrogate (0xD800..0xDBFF).
+/// The first escape is decoded by decodeUnicodeEscapeSequence(); a leading
+/// surrogate must be followed by a backslash, a 'u' and a second escape.
+/// On success the code point is stored in unicode and true is returned;
+/// a failed decode returns false, other problems go through addError().
+bool
+Reader::decodeUnicodeCodePoint( Token &token,
+                                Location &current,
+                                Location end,
+                                unsigned int &unicode )
+{
+   if ( !decodeUnicodeEscapeSequence( token, current, end, unicode ) )
+      return false;
+   if (unicode < 0xD800 || unicode > 0xDBFF)
+      return true; // not a leading surrogate: the value already is the code point
+
+   // surrogate pairs: at least six more characters must remain for the second escape
+   if (end - current < 6)
+      return addError( "additional six characters expected to parse unicode surrogate pair.", token, current );
+   if (*(current++) != '\\' || *(current++) != 'u')
+      return addError( "expecting another \\u token to begin the second half of a unicode surrogate pair", token, current );
+   unsigned int surrogatePair;
+   if ( !decodeUnicodeEscapeSequence( token, current, end, surrogatePair ) )
+      return false;
+   // Each half supplies ten bits; the second value's range is not checked here.
+   unicode = 0x10000 + ((unicode & 0x3FF) << 10) + (surrogatePair & 0x3FF);
+   return true;
+}
bool
Reader::decodeUnicodeEscapeSequence( Token &token,
diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp
index 9f2145a..111caac 100644
--- a/src/lib_json/json_writer.cpp
+++ b/src/lib_json/json_writer.cpp
@@ -4,6 +4,8 @@
#include <stdio.h>
#include <string.h>
#include <iostream>
+#include <sstream>
+#include <iomanip>
#if _MSC_VER >= 1400 // VC++ 8.0
#pragma warning( disable : 4996 ) // disable warning about strdup being deprecated.
@@ -11,6 +13,20 @@
namespace Json {
+static bool isControlCharacter(char ch)
+{  // control characters are the codes 0x01..0x1F; NUL (0) and negative values are excluded
+   return !( ch <= 0 || ch > 0x1F );
+}
+
+/// Returns true if the NUL-terminated str (null counts as empty) holds a control character.
+static bool containsControlCharacter( const char* str )
+{
+   // Test the character as well as the pointer: a non-null pointer stays
+   // non-null, so checking it alone would scan past the terminator.
+   for ( ; str && *str; ++str )
+      if ( isControlCharacter( *str ) ) return true;
+   return false;
+}
static void uintToString( unsigned int value,
char *&current )
{
@@ -95,7 +111,7 @@
std::string valueToQuotedString( const char *value )
{
// Not sure how to handle unicode...
- if (strpbrk(value, "\"\\\b\f\n\r\t") == NULL)
+ if (strpbrk(value, "\"\\\b\f\n\r\t") == NULL && !containsControlCharacter( value ))
return std::string("\"") + value + "\"";
// We have to walk value and escape any special characters.
// Appending to std::string is not efficient, but this should be rare.
@@ -132,8 +148,16 @@
// slash is also legal, so I see no reason to escape it.
// (I hope I am not misunderstanding something.)
default:
- result += *c;
+ if ( isControlCharacter( *c ) )
+ {
+ std::ostringstream oss;
+ oss << "\\u" << std::hex << std::uppercase << std::setfill('0') << std::setw(4) << static_cast<int>(*c);
+ result += oss.str();
+ }
+ else
+ result += *c;
}
+ break;
}
result += "\"";
return result;
diff --git a/test/test_string_unicode_01.expected b/test/test_string_unicode_01.expected
new file mode 100644
index 0000000..447f85a
--- /dev/null
+++ b/test/test_string_unicode_01.expected
@@ -0,0 +1 @@
+.="a"
diff --git a/test/test_string_unicode_01.json b/test/test_string_unicode_01.json
new file mode 100644
index 0000000..024114b
--- /dev/null
+++ b/test/test_string_unicode_01.json
@@ -0,0 +1 @@
+"\u0061"
\ No newline at end of file
diff --git a/test/test_string_unicode_02.expected b/test/test_string_unicode_02.expected
new file mode 100644
index 0000000..c0b3b43
--- /dev/null
+++ b/test/test_string_unicode_02.expected
@@ -0,0 +1 @@
+.="¢"
diff --git a/test/test_string_unicode_02.json b/test/test_string_unicode_02.json
new file mode 100644
index 0000000..4961024
--- /dev/null
+++ b/test/test_string_unicode_02.json
@@ -0,0 +1 @@
+"\u00A2"
\ No newline at end of file
diff --git a/test/test_string_unicode_03.expected b/test/test_string_unicode_03.expected
new file mode 100644
index 0000000..7289743
--- /dev/null
+++ b/test/test_string_unicode_03.expected
@@ -0,0 +1 @@
+.="€"
diff --git a/test/test_string_unicode_03.json b/test/test_string_unicode_03.json
new file mode 100644
index 0000000..e7e1a9e
--- /dev/null
+++ b/test/test_string_unicode_03.json
@@ -0,0 +1 @@
+"\u20AC"
\ No newline at end of file
diff --git a/test/test_string_unicode_04.expected b/test/test_string_unicode_04.expected
new file mode 100644
index 0000000..868fbc3
--- /dev/null
+++ b/test/test_string_unicode_04.expected
@@ -0,0 +1 @@
+.="𝄞"
diff --git a/test/test_string_unicode_04.json b/test/test_string_unicode_04.json
new file mode 100644
index 0000000..dae65c5
--- /dev/null
+++ b/test/test_string_unicode_04.json
@@ -0,0 +1 @@
+"\uD834\uDD1E"
\ No newline at end of file