From 369671f028739178d22c53546d30d0ba87746985 Mon Sep 17 00:00:00 2001
From: Niels <niels.lohmann@gmail.com>
Date: Mon, 25 Jul 2016 16:05:58 +0200
Subject: [PATCH] added more Unicode tests

---
 README.md         |  2 +-
 test/src/unit.cpp | 81 ++++++++++++++++++++++++++++++++---------------
 2 files changed, 56 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index 24f6d7ab9..5adcf15ca 100644
--- a/README.md
+++ b/README.md
@@ -505,7 +505,7 @@ $ make
 $ ./json_unit "*"
 
 ===============================================================================
-All tests passed (5568718 assertions in 32 test cases)
+All tests passed (8905012 assertions in 32 test cases)
 ```
 
 For more information, have a look at the file [.travis.yml](https://github.com/nlohmann/json/blob/master/.travis.yml).
diff --git a/test/src/unit.cpp b/test/src/unit.cpp
index a1589949a..c2738faef 100644
--- a/test/src/unit.cpp
+++ b/test/src/unit.cpp
@@ -12285,19 +12285,23 @@ TEST_CASE("RFC 7159 examples")
 
 TEST_CASE("Unicode", "[hide]")
 {
-    SECTION("full enumeration of Unicode codepoints")
+    SECTION("full enumeration of Unicode code points")
     {
-        // create a string from a codepoint
-        auto codepoint_to_unicode = [](std::size_t cp)
+        // create an escaped string from a code point
+        const auto codepoint_to_unicode = [](std::size_t cp)
         {
-            char* buffer = new char[10];
-            sprintf(buffer, "\\u%04lx", cp);
-            std::string result(buffer);
-            delete[] buffer;
-            return result;
+            // code points are represented as a six-character sequence: a
+            // reverse solidus, followed by the lowercase letter u, followed
+            // by four hexadecimal digits that encode the character's code
+            // point
+            std::stringstream ss;
+            ss << "\\u" << std::setw(4) << std::setfill('0') << std::hex << cp;
+            return ss.str();
         };
 
-        // generate all codepoints
+        // generate all Unicode code points; in total, 1112064 code points
+        // are generated: 0x110000 values (U+0000 to U+10FFFF) minus the
+        // 2048 surrogate values between 0xD800 and 0xDFFF.
         for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
         {
             // The Unicode standard permanently reserves these code point
@@ -12307,34 +12311,57 @@ TEST_CASE("Unicode", "[hide]")
             // no UTF forms, including UTF-16, can encode these code points.
             if (cp >= 0xD800u and cp <= 0xDFFFu)
             {
+                // if we did not skip these code points, we would get a
+                // "missing low surrogate" exception
                 continue;
             }
 
-            std::string res;
+            // string to store the code point in \uxxxx format
+            std::string escaped_string;
+            // string to store the code point as unescaped character sequence
+            std::string unescaped_string;
 
             if (cp < 0x10000u)
             {
-                // codepoint can be represented with 16 bit
-                res += codepoint_to_unicode(cp);
+                // code points in the Basic Multilingual Plane can be
+                // represented with one \\uxxxx sequence
+                escaped_string = codepoint_to_unicode(cp);
+
+                // All Unicode characters may be placed within the quotation
+                // marks, except for the characters that must be escaped:
+                // quotation mark, reverse solidus, and the control characters
+                // (U+0000 through U+001F); we ignore these code points as
+                // they are checked with codepoint_to_unicode.
+                if (cp > 0x1f and cp != 0x22 and cp != 0x5c)
+                {
+                    unescaped_string = json::lexer::to_unicode(cp);
+                }
             }
             else
             {
-                // codepoint can be represented with a pair
-                res += codepoint_to_unicode(0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu));
-                res += codepoint_to_unicode(0xdc00u + ((cp - 0x10000u) & 0x3ffu));
+                // To escape an extended character that is not in the Basic
+                // Multilingual Plane, the character is represented as a
+                // 12-character sequence, encoding the UTF-16 surrogate pair
+                const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu);
+                const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu);
+                escaped_string = codepoint_to_unicode(codepoint1);
+                escaped_string += codepoint_to_unicode(codepoint2);
+                unescaped_string += json::lexer::to_unicode(codepoint1, codepoint2);
             }
 
-            try
-            {
-                json j1, j2;
-                CHECK_NOTHROW(j1 = json::parse("\"" + res + "\""));
-                CHECK_NOTHROW(j2 = json::parse(j1.dump()));
-                CHECK(j1 == j2);
-            }
-            catch (std::invalid_argument)
-            {
-                // we ignore parsing errors
-            }
+            // all other code points are valid and must not yield parse errors
+            CAPTURE(cp);
+            CAPTURE(escaped_string);
+            CAPTURE(unescaped_string);
+
+            json j1, j2, j3, j4;
+            CHECK_NOTHROW(j1 = json::parse("\"" + escaped_string + "\""));
+            CHECK_NOTHROW(j2 = json::parse(j1.dump()));
+            CHECK(j1 == j2);
+
+            CHECK_NOTHROW(j3 = json::parse("\"" + unescaped_string + "\""));
+            CHECK_NOTHROW(j4 = json::parse(j3.dump()));
+            CHECK(j3 == j4);
         }
     }
 
@@ -12347,6 +12374,8 @@ TEST_CASE("Unicode", "[hide]")
         CHECK_NOTHROW(j << f);
 
         // the array has 1112064 + 1 elemnts (a terminating "null" value)
+        // Note: 1112064 = 0x110000 values (U+0000 to U+10FFFF) minus the
+        // 2048 surrogate values between 0xD800 and 0xDFFF.
         CHECK(j.size() == 1112065);
 
         SECTION("check JSON Pointers")