🚧 first implementation for keep

This commit is contained in:
Niels Lohmann 2024-12-20 15:40:20 +01:00
parent a27a5b5442
commit a2d828c204
No known key found for this signature in database
GPG Key ID: 7F3CEA63AE251B69
10 changed files with 54 additions and 25 deletions

View File

@ -25,10 +25,11 @@ and `ensure_ascii` parameters.
result consists of ASCII characters only.
`error_handler` (in)
: how to react on decoding errors; there are three possible values (see [`error_handler_t`](error_handler_t.md):
`strict` (throws an exception in case a decoding error occurs; default), `replace` (replace invalid UTF-8 sequences
with U+FFFD), and `ignore` (ignore invalid UTF-8 sequences during serialization; all bytes are copied to the output
unchanged)).
: how to react to decoding errors; there are four possible values (see [`error_handler_t`](error_handler_t.md)):
: - `strict`: throws a [`type_error`](../../home/exceptions.md#type-errors) exception in case a decoding error occurs (this is the default),
- `replace`: replace invalid UTF-8 sequences with U+FFFD (<28> REPLACEMENT CHARACTER),
- `ignore`: ignore invalid UTF-8 sequences during serialization, and
- `keep`: keep invalid UTF-8 sequences during serialization; all bytes are copied to the output unchanged
## Return value
@ -77,3 +78,4 @@ Binary values are serialized as object containing two keys:
- Indentation character `indent_char`, option `ensure_ascii` and exceptions added in version 3.0.0.
- Error handlers added in version 3.4.0.
- Serialization of binary values added in version 3.8.0.
- Added support for error handler value `keep` in version ???.

View File

@ -12,13 +12,16 @@ This enumeration is used in the [`dump`](dump.md) function to choose how to trea
`basic_json` value. Four values are differentiated:
strict
: throw a `type_error` exception in case of invalid UTF-8
: throw a [`type_error`](../../home/exceptions.md#type-errors) exception in case of invalid UTF-8
replace
: replace invalid UTF-8 sequences with U+FFFD (<28> REPLACEMENT CHARACTER)
ignore
: ignore invalid UTF-8 sequences; all bytes are copied to the output unchanged
: ignore invalid UTF-8 sequences
keep
: keep invalid UTF-8 sequences; all bytes are copied to the output unchanged
## Examples
@ -40,3 +43,4 @@ ignore
## Version history
- Added in version 3.4.0.
- Added value `keep` in version ???.

View File

@ -44,7 +44,8 @@ enum class error_handler_t
{
strict, ///< throw a type_error exception in case of invalid UTF-8
replace, ///< replace invalid UTF-8 sequences with U+FFFD
ignore ///< ignore invalid UTF-8 sequences
ignore, ///< ignore invalid UTF-8 sequences
keep ///< keep invalid UTF-8 sequences
};
template<typename BasicJsonType>
@ -398,6 +399,13 @@ class serializer
std::size_t bytes_after_last_accept = 0;
std::size_t undumped_chars = 0;
// copy string as-is if error handler is set to keep
if (error_handler == error_handler_t::keep)
{
o->write_characters(s.data(), s.size());
return;
}
for (std::size_t i = 0; i < s.size(); ++i)
{
const auto byte = static_cast<std::uint8_t>(s[i]);
@ -529,12 +537,6 @@ class serializer
// thus removing/ignoring the invalid characters
bytes = bytes_after_last_accept;
// fix for #4552
if (error_handler == error_handler_t::ignore)
{
bytes += undumped_chars;
}
if (error_handler == error_handler_t::replace)
{
// add a replacement character

View File

@ -18606,7 +18606,8 @@ enum class error_handler_t
{
strict, ///< throw a type_error exception in case of invalid UTF-8
replace, ///< replace invalid UTF-8 sequences with U+FFFD
ignore ///< ignore invalid UTF-8 sequences
ignore, ///< ignore invalid UTF-8 sequences
keep ///< keep invalid UTF-8 sequences
};
template<typename BasicJsonType>
@ -18960,6 +18961,13 @@ class serializer
std::size_t bytes_after_last_accept = 0;
std::size_t undumped_chars = 0;
// copy string as-is if error handler is set to keep
if (error_handler == error_handler_t::keep)
{
o->write_characters(s.data(), s.size());
return;
}
for (std::size_t i = 0; i < s.size(); ++i)
{
const auto byte = static_cast<std::uint8_t>(s[i]);
@ -19091,12 +19099,6 @@ class serializer
// thus removing/ignoring the invalid characters
bytes = bytes_after_last_accept;
// fix for #4552
if (error_handler == error_handler_t::ignore)
{
bytes += undumped_chars;
}
if (error_handler == error_handler_t::replace)
{
// add a replacement character

View File

@ -1000,8 +1000,8 @@ TEST_CASE("regression tests 2")
{
nlohmann::json node;
node["test"] = "test\334\005";
const auto test_dump = node.dump(-1, ' ', false, nlohmann::json::error_handler_t::ignore);
CHECK(test_dump == "{\"test\":\"test\334\\u0005\"}");
const auto test_dump = node.dump(-1, ' ', false, nlohmann::json::error_handler_t::keep);
CHECK(test_dump == "{\"test\":\"test\334\005\"}");
}
}

View File

@ -109,9 +109,8 @@ TEST_CASE("serialization")
CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&);
// see pending discussion at #4452
// CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123456\"");
CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123\xF1\xB0\x34\x35\x36\"");
CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123456\"");
CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\"");
CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\x34\x35\x36\"");
CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd456\"");
}

View File

@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
static std::string s_replaced2;
static std::string s_replaced_ascii;
static std::string s_replaced2_ascii;
static std::string s_kept;
// dumping with ignore/replace must not throw in any case
s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep);
if (success_expected)
{
@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
}
// check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used
CHECK(json_string == s_kept.substr(1, json_string.size()));
// check that prefix and suffix are preserved
CHECK(s_ignored2.substr(1, 3) == "abc");
CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");

View File

@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
static std::string s_replaced2;
static std::string s_replaced_ascii;
static std::string s_replaced2_ascii;
static std::string s_kept;
// dumping with ignore/replace must not throw in any case
s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep);
if (success_expected)
{
@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
}
// check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used
CHECK(json_string == s_kept.substr(1, json_string.size()));
// check that prefix and suffix are preserved
CHECK(s_ignored2.substr(1, 3) == "abc");
CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");

View File

@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
static std::string s_replaced2;
static std::string s_replaced_ascii;
static std::string s_replaced2_ascii;
static std::string s_kept;
// dumping with ignore/replace must not throw in any case
s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep);
if (success_expected)
{
@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
}
// check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used
CHECK(json_string == s_kept.substr(1, json_string.size()));
// check that prefix and suffix are preserved
CHECK(s_ignored2.substr(1, 3) == "abc");
CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");

View File

@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
static std::string s_replaced2;
static std::string s_replaced_ascii;
static std::string s_replaced2_ascii;
static std::string s_kept;
// dumping with ignore/replace must not throw in any case
s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep);
if (success_expected)
{
@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
}
// check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used
CHECK(json_string == s_kept.substr(1, json_string.size()));
// check that prefix and suffix are preserved
CHECK(s_ignored2.substr(1, 3) == "abc");
CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");