From 3cd50255a8f850de60ffd0ea964f6f9642f30811 Mon Sep 17 00:00:00 2001
From: Niels Lohmann <mail@nlohmann.me>
Date: Sun, 22 Dec 2024 13:12:34 +0100
Subject: [PATCH] :construction: add support for ensure_ascii

---
 include/nlohmann/detail/output/serializer.hpp | 36 ++++++++++++++++---
 single_include/nlohmann/json.hpp              | 36 ++++++++++++++++---
 tests/src/unit-serialization.cpp              | 10 +++++-
 3 files changed, 73 insertions(+), 9 deletions(-)

diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp
index 76bfa7a1c..aafa21a12 100644
--- a/include/nlohmann/detail/output/serializer.hpp
+++ b/include/nlohmann/detail/output/serializer.hpp
@@ -399,8 +399,8 @@ class serializer
         std::size_t bytes_after_last_accept = 0;
         std::size_t undumped_chars = 0;
 
-        // copy string as-is if error handler is set to keep
-        if (error_handler == error_handler_t::keep)
+        // copy string as-is if error handler is set to keep, and we don't want to ensure ASCII
+        if (error_handler == error_handler_t::keep && !ensure_ascii)
         {
             o->write_characters(s.data(), s.size());
             return;
@@ -575,7 +575,22 @@ class serializer
                             break;
                         }
 
-                        case error_handler_t::keep: // LCOV_EXCL_LINE
+                        case error_handler_t::keep:
+                        {
+                            // flush the escaped output collected so far; this path is only
+                            // reached with ensure_ascii set (see the early return at the top)
+                            o->write_characters(string_buffer.data(), bytes);
+                            bytes = 0;
+                            bytes_after_last_accept = 0;
+
+                            // emit pending bytes and the erroneous byte unchanged from s
+                            o->write_characters(s.data() + (i - undumped_chars), undumped_chars + 1);
+                            undumped_chars = 0;
+                            // continue processing the string
+                            state = UTF8_ACCEPT;
+                            break;
+                        }
+
                         default:                    // LCOV_EXCL_LINE
                             JSON_ASSERT(false);     // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
                     }
@@ -614,6 +629,20 @@ class serializer
                     JSON_THROW(type_error::create(316, concat("incomplete UTF-8 string; last byte: 0x", hex_bytes(static_cast<std::uint8_t>(s.back() | 0))), nullptr));
                 }
 
+                case error_handler_t::keep:
+                {
+                    // write everything collected in the buffer so far
+                    o->write_characters(string_buffer.data(), bytes);
+
+                    // the trailing undumped_chars bytes of s form the incomplete
+                    // sequence; emit them unchanged straight from s (an offset into
+                    // string_buffer such as bytes_after_last_accept must not index s)
+                    const std::size_t tail = s.size() - undumped_chars;
+                    o->write_characters(s.data() + tail, undumped_chars);
+                    undumped_chars = 0;
+                    break;
+                }
+
                 case error_handler_t::ignore:
                 {
                     // write all accepted bytes
@@ -637,7 +666,6 @@ class serializer
                     break;
                 }
 
-                case error_handler_t::keep: // LCOV_EXCL_LINE
                 default:                    // LCOV_EXCL_LINE
                     JSON_ASSERT(false);     // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
             }
diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp
index 53d6b0000..e8b80ad33 100644
--- a/single_include/nlohmann/json.hpp
+++ b/single_include/nlohmann/json.hpp
@@ -18961,8 +18961,8 @@ class serializer
         std::size_t bytes_after_last_accept = 0;
         std::size_t undumped_chars = 0;
 
-        // copy string as-is if error handler is set to keep
-        if (error_handler == error_handler_t::keep)
+        // copy string as-is if error handler is set to keep, and we don't want to ensure ASCII
+        if (error_handler == error_handler_t::keep && !ensure_ascii)
         {
             o->write_characters(s.data(), s.size());
             return;
@@ -19137,7 +19137,22 @@ class serializer
                             break;
                         }
 
-                        case error_handler_t::keep: // LCOV_EXCL_LINE
+                        case error_handler_t::keep:
+                        {
+                            // flush the escaped output collected so far; this path is only
+                            // reached with ensure_ascii set (see the early return at the top)
+                            o->write_characters(string_buffer.data(), bytes);
+                            bytes = 0;
+                            bytes_after_last_accept = 0;
+
+                            // emit pending bytes and the erroneous byte unchanged from s
+                            o->write_characters(s.data() + (i - undumped_chars), undumped_chars + 1);
+                            undumped_chars = 0;
+                            // continue processing the string
+                            state = UTF8_ACCEPT;
+                            break;
+                        }
+
                         default:                    // LCOV_EXCL_LINE
                             JSON_ASSERT(false);     // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
                     }
@@ -19176,6 +19191,20 @@ class serializer
                     JSON_THROW(type_error::create(316, concat("incomplete UTF-8 string; last byte: 0x", hex_bytes(static_cast<std::uint8_t>(s.back() | 0))), nullptr));
                 }
 
+                case error_handler_t::keep:
+                {
+                    // write everything collected in the buffer so far
+                    o->write_characters(string_buffer.data(), bytes);
+
+                    // the trailing undumped_chars bytes of s form the incomplete
+                    // sequence; emit them unchanged straight from s (an offset into
+                    // string_buffer such as bytes_after_last_accept must not index s)
+                    const std::size_t tail = s.size() - undumped_chars;
+                    o->write_characters(s.data() + tail, undumped_chars);
+                    undumped_chars = 0;
+                    break;
+                }
+
                 case error_handler_t::ignore:
                 {
                     // write all accepted bytes
@@ -19199,7 +19228,6 @@ class serializer
                     break;
                 }
 
-                case error_handler_t::keep: // LCOV_EXCL_LINE
                 default:                    // LCOV_EXCL_LINE
                     JSON_ASSERT(false);     // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
             }
diff --git a/tests/src/unit-serialization.cpp b/tests/src/unit-serialization.cpp
index 5c3a05240..ee8c7b43e 100644
--- a/tests/src/unit-serialization.cpp
+++ b/tests/src/unit-serialization.cpp
@@ -86,8 +86,11 @@ TEST_CASE("serialization")
             CHECK_THROWS_WITH_AS(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9", json::type_error&);
             CHECK_THROWS_WITH_AS(j.dump(1, ' ', false, json::error_handler_t::strict), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9", json::type_error&);
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"äü\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"\\u00e4\\u00fc\"");
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"ä\xEF\xBF\xBDü\"");
             CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"\\u00e4\\ufffd\\u00fc\"");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"ä\xA9ü\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"\\u00e4\xA9\\u00fc\"");
         }
 
         SECTION("ending with incomplete character")
@@ -97,8 +100,11 @@ TEST_CASE("serialization")
             CHECK_THROWS_WITH_AS(j.dump(), "[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC2", json::type_error&);
             CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&);
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"123\"");
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\"");
             CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd\"");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xC2\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"123\xC2\"");
         }
 
         SECTION("unexpected character")
@@ -110,9 +116,11 @@ TEST_CASE("serialization")
 
             // see pending discussion at #4452
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123456\"");
-            CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"123456\"");
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\x34\x35\x36\"");
             CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd456\"");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\"");
         }
 
         SECTION("U+FFFD Substitution of Maximal Subparts")