From 3cd50255a8f850de60ffd0ea964f6f9642f30811 Mon Sep 17 00:00:00 2001
From: Niels Lohmann
Date: Sun, 22 Dec 2024 13:12:34 +0100
Subject: [PATCH] :construction: add support for ensure_ascii

---
 include/nlohmann/detail/output/serializer.hpp | 52 ++++++++++++++++---
 single_include/nlohmann/json.hpp              | 52 ++++++++++++++++---
 tests/src/unit-serialization.cpp              | 10 +++-
 3 files changed, 105 insertions(+), 9 deletions(-)

Review notes (this free-text area is ignored when the patch is applied):

What the patch does
  * error_handler_t::keep together with ensure_ascii == true no longer takes
    the "copy string as-is" shortcut in dump_escaped; valid code points are
    escaped and invalid or incomplete bytes are copied to the output verbatim.
  * keep together with ensure_ascii == false still copies the whole string
    unchanged.

Corrections made to the new keep branches (identical in both serializer copies)
  * The pending bytes of a rejected/incomplete sequence are now read from the
    input relative to the current position (s[i - undumped_chars + j], and
    s[s.size() - undumped_chars + j] after the loop). The previous index base,
    bytes_after_last_accept, is (per its upstream definition, outside this
    diff) an offset into string_buffer, not into s; once an escape sequence
    lengthens the output or the buffer is flushed it points at the wrong
    input bytes or past the end of s.
  * The in-loop keep branch now resets undumped_chars and updates
    bytes_after_last_accept, so already-copied bytes are not copied again by a
    later reject or by the end-of-string handling.
  * The in-loop keep branch now flushes string_buffer when fewer than 13 bytes
    remain, mirroring the check used after an accepted code point (outside
    this diff); without it a long run of invalid bytes writes past the end of
    the fixed-size buffer.
  * Loop counters are std::size_t to match undumped_chars (no signed/unsigned
    comparison).

Caveat: the "index" blob hashes below were not recomputed for the corrected
content, and whitespace in context lines was reconstructed.

diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp
index 76bfa7a1c..aafa21a12 100644
--- a/include/nlohmann/detail/output/serializer.hpp
+++ b/include/nlohmann/detail/output/serializer.hpp
@@ -399,8 +399,8 @@ class serializer
         std::size_t bytes_after_last_accept = 0;
         std::size_t undumped_chars = 0;
 
-        // copy string as-is if error handler is set to keep
-        if (error_handler == error_handler_t::keep)
+        // copy string as-is if error handler is set to keep, and we don't want to ensure ASCII
+        if (error_handler == error_handler_t::keep && !ensure_ascii)
         {
             o->write_characters(s.data(), s.size());
             return;
@@ -575,7 +575,37 @@ class serializer
                             break;
                         }
 
-                        case error_handler_t::keep: // LCOV_EXCL_LINE
+                        case error_handler_t::keep:
+                        {
+                            // copy the bytes of the rejected, incomplete sequence
+                            // from the input; they are the undumped_chars bytes
+                            // of s directly preceding index i
+                            for (std::size_t j = 0; j < undumped_chars; ++j)
+                            {
+                                string_buffer[bytes++] = s[i - undumped_chars + j];
+                            }
+
+                            // add erroneous byte to string buffer
+                            string_buffer[bytes++] = s[i];
+
+                            // write buffer and reset index; there must be 13 bytes
+                            // left, as this is the maximal number of bytes to be
+                            // written ("\\u0000") for one code point
+                            if (string_buffer.size() - bytes < 13)
+                            {
+                                o->write_characters(string_buffer.data(), bytes);
+                                bytes = 0;
+                            }
+
+                            // all pending bytes have been dumped
+                            bytes_after_last_accept = bytes;
+                            undumped_chars = 0;
+
+                            // continue processing the string
+                            state = UTF8_ACCEPT;
+                            break;
+                        }
+
                         default: // LCOV_EXCL_LINE
                             JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
                     }
@@ -614,6 +644,21 @@ class serializer
                     JSON_THROW(type_error::create(316, concat("incomplete UTF-8 string; last byte: 0x", hex_bytes(static_cast<std::uint8_t>(s.back() | 0))), nullptr));
                 }
 
+                case error_handler_t::keep:
+                {
+                    // copy the trailing bytes of the incomplete sequence from
+                    // the input; they are the last undumped_chars bytes of s
+                    for (std::size_t j = 0; j < undumped_chars; ++j)
+                    {
+                        string_buffer[bytes++] = s[s.size() - undumped_chars + j];
+                    }
+                    undumped_chars = 0;
+
+                    // write all buffered bytes, including the kept incomplete sequence
+                    o->write_characters(string_buffer.data(), bytes);
+                    break;
+                }
+
                 case error_handler_t::ignore:
                 {
                     // write all accepted bytes
@@ -637,7 +682,6 @@ class serializer
                     break;
                 }
 
-                case error_handler_t::keep: // LCOV_EXCL_LINE
                 default: // LCOV_EXCL_LINE
                     JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
             }
diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp
index 53d6b0000..e8b80ad33 100644
--- a/single_include/nlohmann/json.hpp
+++ b/single_include/nlohmann/json.hpp
@@ -18961,8 +18961,8 @@ class serializer
         std::size_t bytes_after_last_accept = 0;
         std::size_t undumped_chars = 0;
 
-        // copy string as-is if error handler is set to keep
-        if (error_handler == error_handler_t::keep)
+        // copy string as-is if error handler is set to keep, and we don't want to ensure ASCII
+        if (error_handler == error_handler_t::keep && !ensure_ascii)
         {
             o->write_characters(s.data(), s.size());
             return;
@@ -19137,7 +19137,37 @@ class serializer
                             break;
                         }
 
-                        case error_handler_t::keep: // LCOV_EXCL_LINE
+                        case error_handler_t::keep:
+                        {
+                            // copy the bytes of the rejected, incomplete sequence
+                            // from the input; they are the undumped_chars bytes
+                            // of s directly preceding index i
+                            for (std::size_t j = 0; j < undumped_chars; ++j)
+                            {
+                                string_buffer[bytes++] = s[i - undumped_chars + j];
+                            }
+
+                            // add erroneous byte to string buffer
+                            string_buffer[bytes++] = s[i];
+
+                            // write buffer and reset index; there must be 13 bytes
+                            // left, as this is the maximal number of bytes to be
+                            // written ("\\u0000") for one code point
+                            if (string_buffer.size() - bytes < 13)
+                            {
+                                o->write_characters(string_buffer.data(), bytes);
+                                bytes = 0;
+                            }
+
+                            // all pending bytes have been dumped
+                            bytes_after_last_accept = bytes;
+                            undumped_chars = 0;
+
+                            // continue processing the string
+                            state = UTF8_ACCEPT;
+                            break;
+                        }
+
                         default: // LCOV_EXCL_LINE
                             JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
                     }
@@ -19176,6 +19206,21 @@ class serializer
                     JSON_THROW(type_error::create(316, concat("incomplete UTF-8 string; last byte: 0x", hex_bytes(static_cast<std::uint8_t>(s.back() | 0))), nullptr));
                 }
 
+                case error_handler_t::keep:
+                {
+                    // copy the trailing bytes of the incomplete sequence from
+                    // the input; they are the last undumped_chars bytes of s
+                    for (std::size_t j = 0; j < undumped_chars; ++j)
+                    {
+                        string_buffer[bytes++] = s[s.size() - undumped_chars + j];
+                    }
+                    undumped_chars = 0;
+
+                    // write all buffered bytes, including the kept incomplete sequence
+                    o->write_characters(string_buffer.data(), bytes);
+                    break;
+                }
+
                 case error_handler_t::ignore:
                 {
                     // write all accepted bytes
@@ -19199,7 +19244,6 @@ class serializer
                     break;
                 }
 
-                case error_handler_t::keep: // LCOV_EXCL_LINE
                 default: // LCOV_EXCL_LINE
                     JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
             }
diff --git a/tests/src/unit-serialization.cpp b/tests/src/unit-serialization.cpp
index 5c3a05240..ee8c7b43e 100644
--- a/tests/src/unit-serialization.cpp
+++ b/tests/src/unit-serialization.cpp
@@ -86,8 +86,11 @@ TEST_CASE("serialization")
             CHECK_THROWS_WITH_AS(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9", json::type_error&);
             CHECK_THROWS_WITH_AS(j.dump(1, ' ', false, json::error_handler_t::strict), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9", json::type_error&);
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"äü\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"\\u00e4\\u00fc\"");
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"ä\xEF\xBF\xBDü\"");
             CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"\\u00e4\\ufffd\\u00fc\"");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"ä\xA9ü\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"\\u00e4\xA9\\u00fc\"");
         }
 
         SECTION("ending with incomplete character")
@@ -97,8 +100,11 @@ TEST_CASE("serialization")
             CHECK_THROWS_WITH_AS(j.dump(), "[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC2", json::type_error&);
             CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&);
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"123\"");
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\"");
             CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd\"");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xC2\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"123\xC2\"");
         }
 
         SECTION("unexpected character")
@@ -110,9 +116,11 @@ TEST_CASE("serialization")
 
             // see pending discussion at #4452
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123456\"");
-            CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"123456\"");
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\x34\x35\x36\"");
             CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd456\"");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\"");
         }
 
         SECTION("U+FFFD Substitution of Maximal Subparts")