Merge branch 'feature/wstring' into develop

2025-08-06 18:01:00 +08:00 · 2018-04-29 17:29:57 +02:00 · 2018-04-29 17:29:57 +02:00 · 5bc4ff9da3
commit 5bc4ff9da3
parent b5d1755dfb fa3e42f826
4 changed files with 471 additions and 1 deletions
--- a/include/nlohmann/detail/input/input_adapters.hpp
+++ b/include/nlohmann/detail/input/input_adapters.hpp
@ -165,6 +165,176 @@ class input_buffer_adapter : public input_adapter_protocol
    const char* start;
 };

+template<typename WideStringType>
+class wide_string_input_adapter : public input_adapter_protocol
+{
+  public:
+    wide_string_input_adapter(const WideStringType& w) : str(w) {}
+
+    std::char_traits<char>::int_type get_character() noexcept override
+    {
+        // unget_character() was called previously: return the last character
+        if (next_unget)
+        {
+            next_unget = false;
+            return last_char;
+        }
+
+        // check if buffer needs to be filled
+        if (utf8_bytes_index == utf8_bytes_filled)
+        {
+            if (sizeof(typename WideStringType::value_type) == 2)
+            {
+                fill_buffer_utf16();
+            }
+            else
+            {
+                fill_buffer_utf32();
+            }
+
+            assert(utf8_bytes_filled > 0);
+            assert(utf8_bytes_index == 0);
+        }
+
+        // use buffer
+        assert(utf8_bytes_filled > 0);
+        assert(utf8_bytes_index < utf8_bytes_filled);
+        return (last_char = utf8_bytes[utf8_bytes_index++]);
+    }
+
+    void unget_character() noexcept override
+    {
+        next_unget = true;
+    }
+
+  private:
+    void fill_buffer_utf16()
+    {
+        utf8_bytes_index = 0;
+
+        if (current_wchar == str.size())
+        {
+            utf8_bytes[0] = std::char_traits<char>::eof();
+            utf8_bytes_filled = 1;
+        }
+        else
+        {
+            // get the current character
+            const int wc = static_cast<int>(str[current_wchar++]);
+
+            // UTF-16 to UTF-8 encoding
+            if (wc < 0x80)
+            {
+                utf8_bytes[0] = wc;
+                utf8_bytes_filled = 1;
+            }
+            else if (wc <= 0x7FF)
+            {
+                utf8_bytes[0] = 0xC0 | ((wc >> 6));
+                utf8_bytes[1] = 0x80 | (wc & 0x3F);
+                utf8_bytes_filled = 2;
+            }
+            else if (0xD800 > wc or wc >= 0xE000)
+            {
+                utf8_bytes[0] = 0xE0 | ((wc >> 12));
+                utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
+                utf8_bytes[2] = 0x80 | (wc & 0x3F);
+                utf8_bytes_filled = 3;
+            }
+            else
+            {
+                if (current_wchar < str.size())
+                {
+                    const int wc2 = static_cast<int>(str[current_wchar++]);
+                    const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF));
+                    utf8_bytes[0] = 0xf0 | (charcode >> 18);
+                    utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F);
+                    utf8_bytes[2] = 0x80 | ((charcode >> 6) & 0x3F);
+                    utf8_bytes[3] = 0x80 | (charcode & 0x3F);
+                    utf8_bytes_filled = 4;
+                }
+                else
+                {
+                    // unknown character
+                    ++current_wchar;
+                    utf8_bytes[0] = wc;
+                    utf8_bytes_filled = 1;
+                }
+            }
+        }
+    }
+
+    void fill_buffer_utf32()
+    {
+        utf8_bytes_index = 0;
+
+        if (current_wchar == str.size())
+        {
+            utf8_bytes[0] = std::char_traits<char>::eof();
+            utf8_bytes_filled = 1;
+        }
+        else
+        {
+            // get the current character
+            const int wc = static_cast<int>(str[current_wchar++]);
+
+            // UTF-32 to UTF-8 encoding
+            if (wc < 0x80)
+            {
+                utf8_bytes[0] = wc;
+                utf8_bytes_filled = 1;
+            }
+            else if (wc <= 0x7FF)
+            {
+                utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F);
+                utf8_bytes[1] = 0x80 | (wc & 0x3F);
+                utf8_bytes_filled = 2;
+            }
+            else if (wc <= 0xFFFF)
+            {
+                utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F);
+                utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
+                utf8_bytes[2] = 0x80 | (wc & 0x3F);
+                utf8_bytes_filled = 3;
+            }
+            else if (wc <= 0x10FFFF)
+            {
+                utf8_bytes[0] = 0xF0 | ((wc >> 18 ) & 0x07);
+                utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F);
+                utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F);
+                utf8_bytes[3] = 0x80 | (wc & 0x3F);
+                utf8_bytes_filled = 4;
+            }
+            else
+            {
+                // unknown character
+                utf8_bytes[0] = wc;
+                utf8_bytes_filled = 1;
+            }
+        }
+    }
+
+  private:
+    /// the wstring to process
+    const WideStringType& str;
+
+    /// index of the current wchar in str
+    std::size_t current_wchar = 0;
+
+    /// a buffer for UTF-8 bytes
+    std::array<std::char_traits<char>::int_type, 4> utf8_bytes = {{0, 0, 0, 0}};
+
+    /// index to the utf8_codes array for the next valid byte
+    std::size_t utf8_bytes_index = 0;
+    /// number of valid bytes in the utf8_codes array
+    std::size_t utf8_bytes_filled = 0;
+
+    /// the last character (returned after unget_character() is called)
+    std::char_traits<char>::int_type last_char = 0;
+    /// whether get_character() should return last_char
+    bool next_unget = false;
+};
+
 class input_adapter
 {
  public:
@ -178,6 +348,15 @@ class input_adapter
    input_adapter(std::istream&& i)
        : ia(std::make_shared<input_stream_adapter>(i)) {}

+    input_adapter(const std::wstring& ws)
+        : ia(std::make_shared<wide_string_input_adapter<std::wstring>>(ws)) {}
+
+    input_adapter(const std::u16string& ws)
+        : ia(std::make_shared<wide_string_input_adapter<std::u16string>>(ws)) {}
+
+    input_adapter(const std::u32string& ws)
+        : ia(std::make_shared<wide_string_input_adapter<std::u32string>>(ws)) {}
+
    /// input adapter for buffer
    template<typename CharT,
             typename std::enable_if<
--- a/single_include/nlohmann/json.hpp
+++ b/single_include/nlohmann/json.hpp
@ -1738,6 +1738,176 @@ class input_buffer_adapter : public input_adapter_protocol
    const char* start;
 };

+template<typename WideStringType>
+class wide_string_input_adapter : public input_adapter_protocol
+{
+  public:
+    wide_string_input_adapter(const WideStringType& w) : str(w) {}
+
+    std::char_traits<char>::int_type get_character() noexcept override
+    {
+        // unget_character() was called previously: return the last character
+        if (next_unget)
+        {
+            next_unget = false;
+            return last_char;
+        }
+
+        // check if buffer needs to be filled
+        if (utf8_bytes_index == utf8_bytes_filled)
+        {
+            if (sizeof(typename WideStringType::value_type) == 2)
+            {
+                fill_buffer_utf16();
+            }
+            else
+            {
+                fill_buffer_utf32();
+            }
+
+            assert(utf8_bytes_filled > 0);
+            assert(utf8_bytes_index == 0);
+        }
+
+        // use buffer
+        assert(utf8_bytes_filled > 0);
+        assert(utf8_bytes_index < utf8_bytes_filled);
+        return (last_char = utf8_bytes[utf8_bytes_index++]);
+    }
+
+    void unget_character() noexcept override
+    {
+        next_unget = true;
+    }
+
+  private:
+    void fill_buffer_utf16()
+    {
+        utf8_bytes_index = 0;
+
+        if (current_wchar == str.size())
+        {
+            utf8_bytes[0] = std::char_traits<char>::eof();
+            utf8_bytes_filled = 1;
+        }
+        else
+        {
+            // get the current character
+            const int wc = static_cast<int>(str[current_wchar++]);
+
+            // UTF-16 to UTF-8 encoding
+            if (wc < 0x80)
+            {
+                utf8_bytes[0] = wc;
+                utf8_bytes_filled = 1;
+            }
+            else if (wc <= 0x7FF)
+            {
+                utf8_bytes[0] = 0xC0 | ((wc >> 6));
+                utf8_bytes[1] = 0x80 | (wc & 0x3F);
+                utf8_bytes_filled = 2;
+            }
+            else if (0xD800 > wc or wc >= 0xE000)
+            {
+                utf8_bytes[0] = 0xE0 | ((wc >> 12));
+                utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
+                utf8_bytes[2] = 0x80 | (wc & 0x3F);
+                utf8_bytes_filled = 3;
+            }
+            else
+            {
+                if (current_wchar < str.size())
+                {
+                    const int wc2 = static_cast<int>(str[current_wchar++]);
+                    const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF));
+                    utf8_bytes[0] = 0xf0 | (charcode >> 18);
+                    utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F);
+                    utf8_bytes[2] = 0x80 | ((charcode >> 6) & 0x3F);
+                    utf8_bytes[3] = 0x80 | (charcode & 0x3F);
+                    utf8_bytes_filled = 4;
+                }
+                else
+                {
+                    // unknown character
+                    ++current_wchar;
+                    utf8_bytes[0] = wc;
+                    utf8_bytes_filled = 1;
+                }
+            }
+        }
+    }
+
+    void fill_buffer_utf32()
+    {
+        utf8_bytes_index = 0;
+
+        if (current_wchar == str.size())
+        {
+            utf8_bytes[0] = std::char_traits<char>::eof();
+            utf8_bytes_filled = 1;
+        }
+        else
+        {
+            // get the current character
+            const int wc = static_cast<int>(str[current_wchar++]);
+
+            // UTF-32 to UTF-8 encoding
+            if (wc < 0x80)
+            {
+                utf8_bytes[0] = wc;
+                utf8_bytes_filled = 1;
+            }
+            else if (wc <= 0x7FF)
+            {
+                utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F);
+                utf8_bytes[1] = 0x80 | (wc & 0x3F);
+                utf8_bytes_filled = 2;
+            }
+            else if (wc <= 0xFFFF)
+            {
+                utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F);
+                utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
+                utf8_bytes[2] = 0x80 | (wc & 0x3F);
+                utf8_bytes_filled = 3;
+            }
+            else if (wc <= 0x10FFFF)
+            {
+                utf8_bytes[0] = 0xF0 | ((wc >> 18 ) & 0x07);
+                utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F);
+                utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F);
+                utf8_bytes[3] = 0x80 | (wc & 0x3F);
+                utf8_bytes_filled = 4;
+            }
+            else
+            {
+                // unknown character
+                utf8_bytes[0] = wc;
+                utf8_bytes_filled = 1;
+            }
+        }
+    }
+
+  private:
+    /// the wstring to process
+    const WideStringType& str;
+
+    /// index of the current wchar in str
+    std::size_t current_wchar = 0;
+
+    /// a buffer for UTF-8 bytes
+    std::array<std::char_traits<char>::int_type, 4> utf8_bytes = {{0, 0, 0, 0}};
+
+    /// index to the utf8_codes array for the next valid byte
+    std::size_t utf8_bytes_index = 0;
+    /// number of valid bytes in the utf8_codes array
+    std::size_t utf8_bytes_filled = 0;
+
+    /// the last character (returned after unget_character() is called)
+    std::char_traits<char>::int_type last_char = 0;
+    /// whether get_character() should return last_char
+    bool next_unget = false;
+};
+
 class input_adapter
 {
  public:
@ -1751,6 +1921,15 @@ class input_adapter
    input_adapter(std::istream&& i)
        : ia(std::make_shared<input_stream_adapter>(i)) {}

+    input_adapter(const std::wstring& ws)
+        : ia(std::make_shared<wide_string_input_adapter<std::wstring>>(ws)) {}
+
+    input_adapter(const std::u16string& ws)
+        : ia(std::make_shared<wide_string_input_adapter<std::u16string>>(ws)) {}
+
+    input_adapter(const std::u32string& ws)
+        : ia(std::make_shared<wide_string_input_adapter<std::u32string>>(ws)) {}
+
    /// input adapter for buffer
    template<typename CharT,
             typename std::enable_if<
--- a/test/Makefile
+++ b/test/Makefile
@ -42,7 +42,8 @@ SOURCES = src/unit.cpp \
          src/unit-serialization.cpp \
          src/unit-testsuites.cpp \
          src/unit-ubjson.cpp \
-          src/unit-unicode.cpp
+          src/unit-unicode.cpp \
+          src/unit-wstring.cpp

 OBJECTS = $(SOURCES:.cpp=.o)

--- a/test/src/unit-wstring.cpp
+++ b/test/src/unit-wstring.cpp
@ -0,0 +1,111 @@
+/*
+    __ _____ _____ _____
+ __|  |   __|     |   | |  JSON for Modern C++ (test suite)
+|  |  |__   |  |  | | | |  version 3.1.2
+|_____|_____|_____|_|___|  https://github.com/nlohmann/json
+
+Licensed under the MIT License <http://opensource.org/licenses/MIT>.
+Copyright (c) 2013-2018 Niels Lohmann <http://nlohmann.me>.
+
+Permission is hereby  granted, free of charge, to any  person obtaining a copy
+of this software and associated  documentation files (the "Software"), to deal
+in the Software  without restriction, including without  limitation the rights
+to  use, copy,  modify, merge,  publish, distribute,  sublicense, and/or  sell
+copies  of  the Software,  and  to  permit persons  to  whom  the Software  is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE  IS PROVIDED "AS  IS", WITHOUT WARRANTY  OF ANY KIND,  EXPRESS OR
+IMPLIED,  INCLUDING BUT  NOT  LIMITED TO  THE  WARRANTIES OF  MERCHANTABILITY,
+FITNESS FOR  A PARTICULAR PURPOSE AND  NONINFRINGEMENT. IN NO EVENT  SHALL THE
+AUTHORS  OR COPYRIGHT  HOLDERS  BE  LIABLE FOR  ANY  CLAIM,  DAMAGES OR  OTHER
+LIABILITY, WHETHER IN AN ACTION OF  CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE  OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "catch.hpp"
+
+#include <nlohmann/json.hpp>
+
+using nlohmann::json;
+
+bool wstring_is_utf16();
+bool wstring_is_utf16()
+{
+    return (std::wstring(L"💩") == std::wstring(L"\U0001F4A9"));
+}
+
+bool u16string_is_utf16();
+bool u16string_is_utf16()
+{
+    return (std::u16string(u"💩") == std::u16string(u"\U0001F4A9"));
+}
+
+bool u32string_is_utf32();
+bool u32string_is_utf32()
+{
+    return (std::u32string(U"💩") == std::u32string(U"\U0001F4A9"));
+}
+
+TEST_CASE("wide strings")
+{
+    SECTION("std::wstring")
+    {
+        if (wstring_is_utf16())
+        {
+            std::wstring w = L"[12.2,\"Ⴥaäö💤🧢\"]";
+            json j = json::parse(w);
+            CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]");
+        }
+    }
+
+    SECTION("invalid std::wstring")
+    {
+        if (wstring_is_utf16())
+        {
+            std::wstring w = L"\"\xDBFF";
+            CHECK_THROWS_AS(json::parse(w), json::parse_error&);
+        }
+    }
+
+    SECTION("std::u16string")
+    {
+        if (u16string_is_utf16())
+        {
+            std::u16string w = u"[12.2,\"Ⴥaäö💤🧢\"]";
+            json j = json::parse(w);
+            CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]");
+        }
+    }
+
+    SECTION("invalid std::u16string")
+    {
+        if (wstring_is_utf16())
+        {
+            std::u16string w = u"\"\xDBFF";
+            CHECK_THROWS_AS(json::parse(w), json::parse_error&);
+        }
+    }
+
+    SECTION("std::u32string")
+    {
+        if (u32string_is_utf32())
+        {
+            std::u32string w = U"[12.2,\"Ⴥaäö💤🧢\"]";
+            json j = json::parse(w);
+            CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]");
+        }
+    }
+
+    SECTION("invalid std::u32string")
+    {
+        if (u32string_is_utf32())
+        {
+            std::u32string w = U"\"\x110000";
+            CHECK_THROWS_AS(json::parse(w), json::parse_error&);
+        }
+    }
+}