diff --git a/docs/mkdocs/docs/api/basic_json/dump.md b/docs/mkdocs/docs/api/basic_json/dump.md index 41adb154d..30388f660 100644 --- a/docs/mkdocs/docs/api/basic_json/dump.md +++ b/docs/mkdocs/docs/api/basic_json/dump.md @@ -25,10 +25,11 @@ and `ensure_ascii` parameters. result consists of ASCII characters only. `error_handler` (in) -: how to react on decoding errors; there are three possible values (see [`error_handler_t`](error_handler_t.md): - `strict` (throws and exception in case a decoding error occurs; default), `replace` (replace invalid UTF-8 sequences - with U+FFFD), and `ignore` (ignore invalid UTF-8 sequences during serialization; all bytes are copied to the output - unchanged)). +: how to react on decoding errors; there are four possible values (see [`error_handler_t`](error_handler_t.md)): +: - `strict`: throws a [`type_error`](../../home/exceptions.md#type-errors) exception in case a decoding error occurs (this is the default), + - `replace`: replace invalid UTF-8 sequences with U+FFFD (� REPLACEMENT CHARACTER), + - `ignore`: ignore invalid UTF-8 sequences during serialization, and + - `keep`: keep invalid UTF-8 sequences during serialization; all bytes are copied to the output unchanged ## Return value @@ -77,3 +78,4 @@ Binary values are serialized as object containing two keys: - Indentation character `indent_char`, option `ensure_ascii` and exceptions added in version 3.0.0. - Error handlers added in version 3.4.0. - Serialization of binary values added in version 3.8.0. +- Added support for error handler value `keep` in version ???. diff --git a/docs/mkdocs/docs/api/basic_json/error_handler_t.md b/docs/mkdocs/docs/api/basic_json/error_handler_t.md index dc32ced9b..1b3c7456a 100644 --- a/docs/mkdocs/docs/api/basic_json/error_handler_t.md +++ b/docs/mkdocs/docs/api/basic_json/error_handler_t.md @@ -12,13 +12,16 @@ This enumeration is used in the [`dump`](dump.md) function to choose how to trea `basic_json` value. 
Three values are differentiated: strict -: throw a `type_error` exception in case of invalid UTF-8 +: throw a [`type_error`](../../home/exceptions.md#type-errors) exception in case of invalid UTF-8 replace : replace invalid UTF-8 sequences with U+FFFD (� REPLACEMENT CHARACTER) ignore -: ignore invalid UTF-8 sequences; all bytes are copied to the output unchanged +: ignore invalid UTF-8 sequences + +keep +: keep invalid UTF-8 sequences; all bytes are copied to the output unchanged ## Examples @@ -40,3 +43,4 @@ ignore ## Version history - Added in version 3.4.0. +- Added value `keep` in version ???. diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp index 3509de5fc..2ced730d2 100644 --- a/include/nlohmann/detail/output/serializer.hpp +++ b/include/nlohmann/detail/output/serializer.hpp @@ -44,7 +44,8 @@ enum class error_handler_t { strict, ///< throw a type_error exception in case of invalid UTF-8 replace, ///< replace invalid UTF-8 sequences with U+FFFD - ignore ///< ignore invalid UTF-8 sequences + ignore, ///< ignore invalid UTF-8 sequences + keep ///< keep invalid UTF-8 sequences }; template @@ -398,6 +399,13 @@ class serializer std::size_t bytes_after_last_accept = 0; std::size_t undumped_chars = 0; + // copy string as-is if error handler is set to keep + if (error_handler == error_handler_t::keep) + { + o->write_characters(s.data(), s.size()); + return; + } + for (std::size_t i = 0; i < s.size(); ++i) { const auto byte = static_cast(s[i]); @@ -529,12 +537,6 @@ class serializer // thus removing/ignoring the invalid characters bytes = bytes_after_last_accept; - // fix for #4552 - if (error_handler == error_handler_t::ignore) - { - bytes += undumped_chars; - } - if (error_handler == error_handler_t::replace) { // add a replacement character diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index df71d15cf..c37bb6bdf 100644 --- a/single_include/nlohmann/json.hpp +++ 
b/single_include/nlohmann/json.hpp @@ -18606,7 +18606,8 @@ enum class error_handler_t { strict, ///< throw a type_error exception in case of invalid UTF-8 replace, ///< replace invalid UTF-8 sequences with U+FFFD - ignore ///< ignore invalid UTF-8 sequences + ignore, ///< ignore invalid UTF-8 sequences + keep ///< keep invalid UTF-8 sequences }; template @@ -18960,6 +18961,13 @@ class serializer std::size_t bytes_after_last_accept = 0; std::size_t undumped_chars = 0; + // copy string as-is if error handler is set to keep + if (error_handler == error_handler_t::keep) + { + o->write_characters(s.data(), s.size()); + return; + } + for (std::size_t i = 0; i < s.size(); ++i) { const auto byte = static_cast(s[i]); @@ -19091,12 +19099,6 @@ class serializer // thus removing/ignoring the invalid characters bytes = bytes_after_last_accept; - // fix for #4552 - if (error_handler == error_handler_t::ignore) - { - bytes += undumped_chars; - } - if (error_handler == error_handler_t::replace) { // add a replacement character diff --git a/tests/src/unit-regression2.cpp b/tests/src/unit-regression2.cpp index dbea75bb7..b8c8b782f 100644 --- a/tests/src/unit-regression2.cpp +++ b/tests/src/unit-regression2.cpp @@ -1000,8 +1000,8 @@ TEST_CASE("regression tests 2") { nlohmann::json node; node["test"] = "test\334\005"; - const auto test_dump = node.dump(-1, ' ', false, nlohmann::json::error_handler_t::ignore); - CHECK(test_dump == "{\"test\":\"test\334\\u0005\"}"); + const auto test_dump = node.dump(-1, ' ', false, nlohmann::json::error_handler_t::keep); + CHECK(test_dump == "{\"test\":\"test\334\005\"}"); } } diff --git a/tests/src/unit-serialization.cpp b/tests/src/unit-serialization.cpp index bb7f1bfaf..5c3a05240 100644 --- a/tests/src/unit-serialization.cpp +++ b/tests/src/unit-serialization.cpp @@ -109,9 +109,8 @@ TEST_CASE("serialization") CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&); // see pending discussion at #4452 - // 
CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123456\""); - CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123\xF1\xB0\x34\x35\x36\""); - + CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123456\""); + CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\""); CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\x34\x35\x36\""); CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd456\""); } diff --git a/tests/src/unit-unicode2.cpp b/tests/src/unit-unicode2.cpp index 606de12e2..eb0a5a3fd 100644 --- a/tests/src/unit-unicode2.cpp +++ b/tests/src/unit-unicode2.cpp @@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 static std::string s_replaced2; static std::string s_replaced_ascii; static std::string s_replaced2_ascii; + static std::string s_kept; // dumping with ignore/replace must not throw in any case s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore); @@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace); s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace); s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace); + s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep); if (success_expected) { @@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos); } + // check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used + CHECK(json_string == s_kept.substr(1, json_string.size())); + // check that prefix and suffix are preserved CHECK(s_ignored2.substr(1, 3) == "abc"); CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz"); diff --git a/tests/src/unit-unicode3.cpp 
b/tests/src/unit-unicode3.cpp index b060f090a..db64a252d 100644 --- a/tests/src/unit-unicode3.cpp +++ b/tests/src/unit-unicode3.cpp @@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 static std::string s_replaced2; static std::string s_replaced_ascii; static std::string s_replaced2_ascii; + static std::string s_kept; // dumping with ignore/replace must not throw in any case s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore); @@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace); s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace); s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace); + s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep); if (success_expected) { @@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos); } + // check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used + CHECK(json_string == s_kept.substr(1, json_string.size())); + // check that prefix and suffix are preserved CHECK(s_ignored2.substr(1, 3) == "abc"); CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz"); diff --git a/tests/src/unit-unicode4.cpp b/tests/src/unit-unicode4.cpp index a6a67a029..c5d0b7034 100644 --- a/tests/src/unit-unicode4.cpp +++ b/tests/src/unit-unicode4.cpp @@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 static std::string s_replaced2; static std::string s_replaced_ascii; static std::string s_replaced2_ascii; + static std::string s_kept; // dumping with ignore/replace must not throw in any case s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore); @@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 
s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace); s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace); s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace); + s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep); if (success_expected) { @@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos); } + // check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used + CHECK(json_string == s_kept.substr(1, json_string.size())); + // check that prefix and suffix are preserved CHECK(s_ignored2.substr(1, 3) == "abc"); CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz"); diff --git a/tests/src/unit-unicode5.cpp b/tests/src/unit-unicode5.cpp index 7cf2fc8c0..d2a70ea0c 100644 --- a/tests/src/unit-unicode5.cpp +++ b/tests/src/unit-unicode5.cpp @@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 static std::string s_replaced2; static std::string s_replaced_ascii; static std::string s_replaced2_ascii; + static std::string s_kept; // dumping with ignore/replace must not throw in any case s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore); @@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace); s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace); s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace); + s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep); if (success_expected) { @@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos); } + // check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used + CHECK(json_string == 
s_kept.substr(1, json_string.size())); + // check that prefix and suffix are preserved CHECK(s_ignored2.substr(1, 3) == "abc"); CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");