🚧 add support for ensure_ascii with error_handler_t::keep

This commit is contained in:
Niels Lohmann 2024-12-22 13:12:34 +01:00
parent a6a06b76e7
commit 3cd50255a8
No known key found for this signature in database
GPG Key ID: 7F3CEA63AE251B69
3 changed files with 73 additions and 9 deletions

View File

@ -399,8 +399,8 @@ class serializer
std::size_t bytes_after_last_accept = 0;
std::size_t undumped_chars = 0;
// copy string as-is if error handler is set to keep
if (error_handler == error_handler_t::keep)
// copy string as-is if error handler is set to keep, and we don't want to ensure ASCII
if (error_handler == error_handler_t::keep && !ensure_ascii)
{
o->write_characters(s.data(), s.size());
return;
@ -575,7 +575,22 @@ class serializer
break;
}
case error_handler_t::keep: // LCOV_EXCL_LINE
case error_handler_t::keep:
{
// copy undumped chars to string buffer
for (int j = 0; j < undumped_chars; ++j)
{
string_buffer[bytes++] = s[bytes_after_last_accept + j];
}
// add erroneous byte to string buffer
string_buffer[bytes++] = s[i];
// continue processing the string
state = UTF8_ACCEPT;
break;
}
default: // LCOV_EXCL_LINE
JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
}
@ -614,6 +629,20 @@ class serializer
JSON_THROW(type_error::create(316, concat("incomplete UTF-8 string; last byte: 0x", hex_bytes(static_cast<std::uint8_t>(s.back() | 0))), nullptr));
}
case error_handler_t::keep:
{
// copy undumped chars to string buffer
for (int j = 0; j < undumped_chars; ++j)
{
string_buffer[bytes++] = s[bytes_after_last_accept + j];
}
undumped_chars = 0;
// write all accepted bytes
o->write_characters(string_buffer.data(), bytes);
break;
}
case error_handler_t::ignore:
{
// write all accepted bytes
@ -637,7 +666,6 @@ class serializer
break;
}
case error_handler_t::keep: // LCOV_EXCL_LINE
default: // LCOV_EXCL_LINE
JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
}

View File

@ -18961,8 +18961,8 @@ class serializer
std::size_t bytes_after_last_accept = 0;
std::size_t undumped_chars = 0;
// copy string as-is if error handler is set to keep
if (error_handler == error_handler_t::keep)
// copy string as-is if error handler is set to keep, and we don't want to ensure ASCII
if (error_handler == error_handler_t::keep && !ensure_ascii)
{
o->write_characters(s.data(), s.size());
return;
@ -19137,7 +19137,22 @@ class serializer
break;
}
case error_handler_t::keep: // LCOV_EXCL_LINE
case error_handler_t::keep:
{
// copy undumped chars to string buffer
for (int j = 0; j < undumped_chars; ++j)
{
string_buffer[bytes++] = s[bytes_after_last_accept + j];
}
// add erroneous byte to string buffer
string_buffer[bytes++] = s[i];
// continue processing the string
state = UTF8_ACCEPT;
break;
}
default: // LCOV_EXCL_LINE
JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
}
@ -19176,6 +19191,20 @@ class serializer
JSON_THROW(type_error::create(316, concat("incomplete UTF-8 string; last byte: 0x", hex_bytes(static_cast<std::uint8_t>(s.back() | 0))), nullptr));
}
case error_handler_t::keep:
{
// copy undumped chars to string buffer
for (int j = 0; j < undumped_chars; ++j)
{
string_buffer[bytes++] = s[bytes_after_last_accept + j];
}
undumped_chars = 0;
// write all accepted bytes
o->write_characters(string_buffer.data(), bytes);
break;
}
case error_handler_t::ignore:
{
// write all accepted bytes
@ -19199,7 +19228,6 @@ class serializer
break;
}
case error_handler_t::keep: // LCOV_EXCL_LINE
default: // LCOV_EXCL_LINE
JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
}

View File

@ -86,8 +86,11 @@ TEST_CASE("serialization")
CHECK_THROWS_WITH_AS(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9", json::type_error&);
CHECK_THROWS_WITH_AS(j.dump(1, ' ', false, json::error_handler_t::strict), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9", json::type_error&);
CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"äü\"");
CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"\\u00e4\\u00fc\"");
CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"ä\xEF\xBF\xBDü\"");
CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"\\u00e4\\ufffd\\u00fc\"");
CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"ä\xA9ü\"");
CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"\\u00e4\xA9\\u00fc\"");
}
SECTION("ending with incomplete character")
@ -97,8 +100,11 @@ TEST_CASE("serialization")
CHECK_THROWS_WITH_AS(j.dump(), "[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC2", json::type_error&);
CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&);
CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123\"");
CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"123\"");
CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\"");
CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd\"");
CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xC2\"");
CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"123\xC2\"");
}
SECTION("unexpected character")
@ -110,9 +116,11 @@ TEST_CASE("serialization")
// see pending discussion at #4452
CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123456\"");
CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\"");
CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"123456\"");
CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\x34\x35\x36\"");
CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd456\"");
CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\"");
CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\"");
}
SECTION("U+FFFD Substitution of Maximal Subparts")