From 369671f028739178d22c53546d30d0ba87746985 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 25 Jul 2016 16:05:58 +0200 Subject: [PATCH] added more Unicode tests --- README.md | 2 +- test/src/unit.cpp | 81 ++++++++++++++++++++++++++++++++--------------- 2 files changed, 56 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 24f6d7ab9..5adcf15ca 100644 --- a/README.md +++ b/README.md @@ -505,7 +505,7 @@ $ make $ ./json_unit "*" =============================================================================== -All tests passed (5568718 assertions in 32 test cases) +All tests passed (8905012 assertions in 32 test cases) ``` For more information, have a look at the file [.travis.yml](https://github.com/nlohmann/json/blob/master/.travis.yml). diff --git a/test/src/unit.cpp b/test/src/unit.cpp index a1589949a..c2738faef 100644 --- a/test/src/unit.cpp +++ b/test/src/unit.cpp @@ -12285,19 +12285,23 @@ TEST_CASE("RFC 7159 examples") TEST_CASE("Unicode", "[hide]") { - SECTION("full enumeration of Unicode codepoints") + SECTION("full enumeration of Unicode code points") { - // create a string from a codepoint - auto codepoint_to_unicode = [](std::size_t cp) + // create an escaped string from a code point + const auto codepoint_to_unicode = [](std::size_t cp) { - char* buffer = new char[10]; - sprintf(buffer, "\\u%04lx", cp); - std::string result(buffer); - delete[] buffer; - return result; + // copd points are represented as a six-character sequence: a + // reverse solidus, followed by the lowercase letter u, followed + // by four hexadecimal digits that encode the character's code + // point + std::stringstream ss; + ss << "\\u" << std::setw(4) << std::setfill('0') << std::hex << cp; + return ss.str(); }; - // generate all codepoints + // generate all UTF8 code points; in total, 1112064 code points are + // generated: 0x1FFFFF code points - 2047 invalid values between + // 0xD800 and 0xDFFF. for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp) { // The Unicode standard permanently reserves these code point @@ -12307,34 +12311,57 @@ TEST_CASE("Unicode", "[hide]") // no UTF forms, including UTF-16, can encode these code points. if (cp >= 0xD800u and cp <= 0xDFFFu) { + // if we would not skip these code points, we would get a + // "missing low surrogate" exception continue; } - std::string res; + // string to store the code point as in \uxxxx format + std::string escaped_string; + // string to store the code point as unescaped character sequence + std::string unescaped_string; if (cp < 0x10000u) { - // codepoint can be represented with 16 bit - res += codepoint_to_unicode(cp); + // code points in the Basic Multilingual Plane can be + // represented with one \\uxxxx sequence + escaped_string = codepoint_to_unicode(cp); + + // All Unicode characters may be placed within the quotation + // marks, except for the characters that must be escaped: + // quotation mark, reverse solidus, and the control characters + // (U+0000 through U+001F); we ignore these code points as + // they are checked with codepoint_to_unicode. + if (cp > 0x1f and cp != 0x22 and cp != 0x5c) + { + unescaped_string = json::lexer::to_unicode(cp); + } } else { - // codepoint can be represented with a pair - res += codepoint_to_unicode(0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu)); - res += codepoint_to_unicode(0xdc00u + ((cp - 0x10000u) & 0x3ffu)); + // To escape an extended character that is not in the Basic + // Multilingual Plane, the character is represented as a + // 12-character sequence, encoding the UTF-16 surrogate pair + const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu); + const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu); + escaped_string = codepoint_to_unicode(codepoint1); + escaped_string += codepoint_to_unicode(codepoint2); + unescaped_string += json::lexer::to_unicode(codepoint1, codepoint2); } - try - { - json j1, j2; - CHECK_NOTHROW(j1 = json::parse("\"" + res + "\"")); - CHECK_NOTHROW(j2 = json::parse(j1.dump())); - CHECK(j1 == j2); - } - catch (std::invalid_argument) - { - // we ignore parsing errors - } + // all other code points are valid and must not yield parse errors + CAPTURE(cp); + CAPTURE(escaped_string); + CAPTURE(unescaped_string); + + json j1, j2, j3, j4; + CHECK_NOTHROW(j1 = json::parse("\"" + escaped_string + "\"")); + CHECK_NOTHROW(j2 = json::parse(j1.dump())); + CHECK(j1 == j2); + + CHECK_NOTHROW(j3 = json::parse("\"" + unescaped_string + "\"")); + CHECK_NOTHROW(j4 = json::parse(j3.dump())); + CHECK(j3 == j4); } } @@ -12347,6 +12374,8 @@ TEST_CASE("Unicode", "[hide]") CHECK_NOTHROW(j << f); // the array has 1112064 + 1 elemnts (a terminating "null" value) + // Note: 1112064 = 0x1FFFFF code points - 2047 invalid values between + // 0xD800 and 0xDFFF. CHECK(j.size() == 1112065); SECTION("check JSON Pointers")