Compare commits
16 Commits
develop
...
issue4552-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4ab98c39c3 | ||
|
|
493d1e4467 | ||
|
|
b167096e82 | ||
|
|
e9876d9e46 | ||
|
|
15ff3701a1 | ||
|
|
3cd50255a8 | ||
|
|
a6a06b76e7 | ||
|
|
7d2a83b735 | ||
|
|
a2d828c204 | ||
|
|
a27a5b5442 | ||
|
|
9f73bc1b37 | ||
|
|
3db5cc4ba8 | ||
|
|
1a76a2c514 | ||
|
|
3665dabb00 | ||
|
|
851584e609 | ||
|
|
4d67e127aa |
@ -25,10 +25,11 @@ and `ensure_ascii` parameters.
|
||||
result consists of ASCII characters only.
|
||||
|
||||
`error_handler` (in)
|
||||
: how to react on decoding errors; there are three possible values (see [`error_handler_t`](error_handler_t.md):
|
||||
`strict` (throws an exception in case a decoding error occurs; default), `replace` (replace invalid UTF-8 sequences
|
||||
with U+FFFD), and `ignore` (ignore invalid UTF-8 sequences during serialization; all bytes are copied to the output
|
||||
unchanged)).
|
||||
: how to react on decoding errors; there are four possible values (see [`error_handler_t`](error_handler_t.md)):
|
||||
: - `strict`: throws a [`type_error`](../../home/exceptions.md#type-errors) exception in case a decoding error occurs (this is the default),
|
||||
- `replace`: replace invalid UTF-8 sequences with U+FFFD (<28> REPLACEMENT CHARACTER),
|
||||
- `ignore`: ignore invalid UTF-8 sequences during serialization (i.e., these bytes are skipped and not copied to the output), and
|
||||
- `keep`: keep invalid UTF-8 sequences during serialization (i.e., all bytes are copied to the output unchanged)
|
||||
|
||||
## Return value
|
||||
|
||||
@ -77,3 +78,4 @@ Binary values are serialized as object containing two keys:
|
||||
- Indentation character `indent_char`, option `ensure_ascii` and exceptions added in version 3.0.0.
|
||||
- Error handlers added in version 3.4.0.
|
||||
- Serialization of binary values added in version 3.8.0.
|
||||
- Added support for error handler value `keep` in version ???.
|
||||
|
||||
@ -12,13 +12,16 @@ This enumeration is used in the [`dump`](dump.md) function to choose how to trea
|
||||
`basic_json` value. Four values are differentiated:
|
||||
|
||||
strict
|
||||
: throw a `type_error` exception in case of invalid UTF-8
|
||||
: throw a [`type_error`](../../home/exceptions.md#type-errors) exception in case of invalid UTF-8
|
||||
|
||||
replace
|
||||
: replace invalid UTF-8 sequences with U+FFFD (<28> REPLACEMENT CHARACTER)
|
||||
|
||||
ignore
|
||||
: ignore invalid UTF-8 sequences; all bytes are copied to the output unchanged
|
||||
: ignore invalid UTF-8 sequences; these bytes are skipped and not copied to the output
|
||||
|
||||
keep
|
||||
: keep invalid UTF-8 sequences; all bytes are copied to the output unchanged
|
||||
|
||||
## Examples
|
||||
|
||||
@ -40,3 +43,4 @@ ignore
|
||||
## Version history
|
||||
|
||||
- Added in version 3.4.0.
|
||||
- Added value `keep` in version ???.
|
||||
|
||||
@ -44,7 +44,8 @@ enum class error_handler_t
|
||||
{
|
||||
strict, ///< throw a type_error exception in case of invalid UTF-8
|
||||
replace, ///< replace invalid UTF-8 sequences with U+FFFD
|
||||
ignore ///< ignore invalid UTF-8 sequences
|
||||
ignore, ///< ignore invalid UTF-8 sequences
|
||||
keep ///< keep invalid UTF-8 sequences
|
||||
};
|
||||
|
||||
template<typename BasicJsonType>
|
||||
@ -398,6 +399,13 @@ class serializer
|
||||
std::size_t bytes_after_last_accept = 0;
|
||||
std::size_t undumped_chars = 0;
|
||||
|
||||
// copy string as-is if error handler is set to keep, and we don't want to ensure ASCII
|
||||
if (error_handler == error_handler_t::keep && !ensure_ascii)
|
||||
{
|
||||
o->write_characters(s.data(), s.size());
|
||||
return;
|
||||
}
|
||||
|
||||
for (std::size_t i = 0; i < s.size(); ++i)
|
||||
{
|
||||
const auto byte = static_cast<std::uint8_t>(s[i]);
|
||||
@ -567,6 +575,22 @@ class serializer
|
||||
break;
|
||||
}
|
||||
|
||||
case error_handler_t::keep:
|
||||
{
|
||||
// copy undumped chars to string buffer
|
||||
for (std::size_t j = 0; j < undumped_chars; ++j)
|
||||
{
|
||||
string_buffer[bytes++] = s[bytes_after_last_accept + j];
|
||||
}
|
||||
|
||||
// add erroneous byte to string buffer
|
||||
string_buffer[bytes++] = s[i];
|
||||
|
||||
// continue processing the string
|
||||
state = UTF8_ACCEPT;
|
||||
break;
|
||||
}
|
||||
|
||||
default: // LCOV_EXCL_LINE
|
||||
JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
|
||||
}
|
||||
@ -605,6 +629,20 @@ class serializer
|
||||
JSON_THROW(type_error::create(316, concat("incomplete UTF-8 string; last byte: 0x", hex_bytes(static_cast<std::uint8_t>(s.back() | 0))), nullptr));
|
||||
}
|
||||
|
||||
case error_handler_t::keep:
|
||||
{
|
||||
// copy undumped chars to string buffer
|
||||
for (std::size_t j = 0; j < undumped_chars; ++j)
|
||||
{
|
||||
string_buffer[bytes++] = s[bytes_after_last_accept + j];
|
||||
}
|
||||
undumped_chars = 0;
|
||||
|
||||
// write all accepted bytes
|
||||
o->write_characters(string_buffer.data(), bytes);
|
||||
break;
|
||||
}
|
||||
|
||||
case error_handler_t::ignore:
|
||||
{
|
||||
// write all accepted bytes
|
||||
|
||||
@ -18606,7 +18606,8 @@ enum class error_handler_t
|
||||
{
|
||||
strict, ///< throw a type_error exception in case of invalid UTF-8
|
||||
replace, ///< replace invalid UTF-8 sequences with U+FFFD
|
||||
ignore ///< ignore invalid UTF-8 sequences
|
||||
ignore, ///< ignore invalid UTF-8 sequences
|
||||
keep ///< keep invalid UTF-8 sequences
|
||||
};
|
||||
|
||||
template<typename BasicJsonType>
|
||||
@ -18960,6 +18961,13 @@ class serializer
|
||||
std::size_t bytes_after_last_accept = 0;
|
||||
std::size_t undumped_chars = 0;
|
||||
|
||||
// copy string as-is if error handler is set to keep, and we don't want to ensure ASCII
|
||||
if (error_handler == error_handler_t::keep && !ensure_ascii)
|
||||
{
|
||||
o->write_characters(s.data(), s.size());
|
||||
return;
|
||||
}
|
||||
|
||||
for (std::size_t i = 0; i < s.size(); ++i)
|
||||
{
|
||||
const auto byte = static_cast<std::uint8_t>(s[i]);
|
||||
@ -19129,6 +19137,22 @@ class serializer
|
||||
break;
|
||||
}
|
||||
|
||||
case error_handler_t::keep:
|
||||
{
|
||||
// copy undumped chars to string buffer
|
||||
for (std::size_t j = 0; j < undumped_chars; ++j)
|
||||
{
|
||||
string_buffer[bytes++] = s[bytes_after_last_accept + j];
|
||||
}
|
||||
|
||||
// add erroneous byte to string buffer
|
||||
string_buffer[bytes++] = s[i];
|
||||
|
||||
// continue processing the string
|
||||
state = UTF8_ACCEPT;
|
||||
break;
|
||||
}
|
||||
|
||||
default: // LCOV_EXCL_LINE
|
||||
JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
|
||||
}
|
||||
@ -19167,6 +19191,20 @@ class serializer
|
||||
JSON_THROW(type_error::create(316, concat("incomplete UTF-8 string; last byte: 0x", hex_bytes(static_cast<std::uint8_t>(s.back() | 0))), nullptr));
|
||||
}
|
||||
|
||||
case error_handler_t::keep:
|
||||
{
|
||||
// copy undumped chars to string buffer
|
||||
for (std::size_t j = 0; j < undumped_chars; ++j)
|
||||
{
|
||||
string_buffer[bytes++] = s[bytes_after_last_accept + j];
|
||||
}
|
||||
undumped_chars = 0;
|
||||
|
||||
// write all accepted bytes
|
||||
o->write_characters(string_buffer.data(), bytes);
|
||||
break;
|
||||
}
|
||||
|
||||
case error_handler_t::ignore:
|
||||
{
|
||||
// write all accepted bytes
|
||||
|
||||
@ -995,6 +995,15 @@ TEST_CASE("regression tests 2")
|
||||
CHECK(p.x == 1);
|
||||
CHECK(p.y == 2);
|
||||
}
|
||||
|
||||
SECTION("issue #4552 - UTF-8 invalid characters are not always ignored when dumping with error_handler_t::ignore")
|
||||
{
|
||||
nlohmann::json node;
|
||||
node["test"] = "test\334\005";
|
||||
CHECK(node.dump(-1, ' ', false, nlohmann::json::error_handler_t::ignore) == "{\"test\":\"test\\u0005\"}");
|
||||
CHECK(node.dump(-1, ' ', false, nlohmann::json::error_handler_t::keep) == "{\"test\":\"test\334\005\"}");
|
||||
CHECK(node.dump(-1, ' ', true, nlohmann::json::error_handler_t::keep) == "{\"test\":\"test\334\005\"}");
|
||||
}
|
||||
}
|
||||
|
||||
DOCTEST_CLANG_SUPPRESS_WARNING_POP
|
||||
|
||||
@ -86,8 +86,11 @@ TEST_CASE("serialization")
|
||||
CHECK_THROWS_WITH_AS(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9", json::type_error&);
|
||||
CHECK_THROWS_WITH_AS(j.dump(1, ' ', false, json::error_handler_t::strict), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9", json::type_error&);
|
||||
CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"äü\"");
|
||||
CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"\\u00e4\\u00fc\"");
|
||||
CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"ä\xEF\xBF\xBDü\"");
|
||||
CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"\\u00e4\\ufffd\\u00fc\"");
|
||||
CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"ä\xA9ü\"");
|
||||
CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"\\u00e4\xA9\\u00fc\"");
|
||||
}
|
||||
|
||||
SECTION("ending with incomplete character")
|
||||
@ -97,8 +100,11 @@ TEST_CASE("serialization")
|
||||
CHECK_THROWS_WITH_AS(j.dump(), "[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC2", json::type_error&);
|
||||
CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&);
|
||||
CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123\"");
|
||||
CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"123\"");
|
||||
CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\"");
|
||||
CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd\"");
|
||||
CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xC2\"");
|
||||
CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"123\xC2\"");
|
||||
}
|
||||
|
||||
SECTION("unexpected character")
|
||||
@ -107,9 +113,14 @@ TEST_CASE("serialization")
|
||||
|
||||
CHECK_THROWS_WITH_AS(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 5: 0x34", json::type_error&);
|
||||
CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&);
|
||||
|
||||
// see pending discussion at #4452
|
||||
CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123456\"");
|
||||
CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"123456\"");
|
||||
CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\x34\x35\x36\"");
|
||||
CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd456\"");
|
||||
CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\"");
|
||||
CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\"");
|
||||
}
|
||||
|
||||
SECTION("U+FFFD Substitution of Maximal Subparts")
|
||||
|
||||
@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
|
||||
static std::string s_replaced2;
|
||||
static std::string s_replaced_ascii;
|
||||
static std::string s_replaced2_ascii;
|
||||
static std::string s_kept;
|
||||
|
||||
// dumping with ignore/replace must not throw in any case
|
||||
s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
|
||||
@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
|
||||
s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
|
||||
s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
|
||||
s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep);
|
||||
|
||||
if (success_expected)
|
||||
{
|
||||
@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
|
||||
CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
|
||||
}
|
||||
|
||||
// check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used
|
||||
CHECK(json_string == s_kept.substr(1, json_string.size()));
|
||||
|
||||
// check that prefix and suffix are preserved
|
||||
CHECK(s_ignored2.substr(1, 3) == "abc");
|
||||
CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");
|
||||
|
||||
@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
|
||||
static std::string s_replaced2;
|
||||
static std::string s_replaced_ascii;
|
||||
static std::string s_replaced2_ascii;
|
||||
static std::string s_kept;
|
||||
|
||||
// dumping with ignore/replace must not throw in any case
|
||||
s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
|
||||
@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
|
||||
s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
|
||||
s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
|
||||
s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep);
|
||||
|
||||
if (success_expected)
|
||||
{
|
||||
@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
|
||||
CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
|
||||
}
|
||||
|
||||
// check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used
|
||||
CHECK(json_string == s_kept.substr(1, json_string.size()));
|
||||
|
||||
// check that prefix and suffix are preserved
|
||||
CHECK(s_ignored2.substr(1, 3) == "abc");
|
||||
CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");
|
||||
|
||||
@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
|
||||
static std::string s_replaced2;
|
||||
static std::string s_replaced_ascii;
|
||||
static std::string s_replaced2_ascii;
|
||||
static std::string s_kept;
|
||||
|
||||
// dumping with ignore/replace must not throw in any case
|
||||
s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
|
||||
@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
|
||||
s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
|
||||
s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
|
||||
s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep);
|
||||
|
||||
if (success_expected)
|
||||
{
|
||||
@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
|
||||
CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
|
||||
}
|
||||
|
||||
// check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used
|
||||
CHECK(json_string == s_kept.substr(1, json_string.size()));
|
||||
|
||||
// check that prefix and suffix are preserved
|
||||
CHECK(s_ignored2.substr(1, 3) == "abc");
|
||||
CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");
|
||||
|
||||
@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
|
||||
static std::string s_replaced2;
|
||||
static std::string s_replaced_ascii;
|
||||
static std::string s_replaced2_ascii;
|
||||
static std::string s_kept;
|
||||
|
||||
// dumping with ignore/replace must not throw in any case
|
||||
s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
|
||||
@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
|
||||
s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
|
||||
s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
|
||||
s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep);
|
||||
|
||||
if (success_expected)
|
||||
{
|
||||
@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
|
||||
CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
|
||||
}
|
||||
|
||||
// check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used
|
||||
CHECK(json_string == s_kept.substr(1, json_string.size()));
|
||||
|
||||
// check that prefix and suffix are preserved
|
||||
CHECK(s_ignored2.substr(1, 3) == "abc");
|
||||
CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");
|
||||
|
||||
Loading…
Reference in New Issue
Block a user