Add support for JSON-compatible string escapes
For completeness I've implemented escaping for characters outside the basic multilingual plane, but it doesn't get used (as there's no EscapeAsAsciiJson emitter option implemented).
This commit is contained in:
parent
370aceeaf8
commit
1618b6d255
@ -19,6 +19,7 @@ enum EMITTER_MANIP {
|
||||
// output character set
|
||||
EmitNonAscii,
|
||||
EscapeNonAscii,
|
||||
EscapeAsJson,
|
||||
|
||||
// string manipulators
|
||||
// Auto, // duplicate
|
||||
|
||||
@ -686,14 +686,27 @@ void Emitter::StartedScalar() { m_pState->StartedScalar(); }
|
||||
// *******************************************************************************************
|
||||
// overloads of Write
|
||||
|
||||
// Translates the emitter's output-charset manipulator into the escaping style
// handed to the string/char writers. Only EscapeNonAscii and EscapeAsJson
// select an escaping style; any other manipulator maps to
// StringEscaping::None.
StringEscaping::value GetStringEscapingStyle(const EMITTER_MANIP emitterManip) {
  if (emitterManip == EscapeNonAscii) {
    return StringEscaping::NonAscii;
  }
  if (emitterManip == EscapeAsJson) {
    return StringEscaping::JSON;
  }
  return StringEscaping::None;
}
|
||||
|
||||
Emitter& Emitter::Write(const std::string& str) {
|
||||
if (!good())
|
||||
return *this;
|
||||
|
||||
const bool escapeNonAscii = m_pState->GetOutputCharset() == EscapeNonAscii;
|
||||
StringEscaping::value stringEscaping = GetStringEscapingStyle(m_pState->GetOutputCharset());
|
||||
|
||||
const StringFormat::value strFormat =
|
||||
Utils::ComputeStringFormat(str, m_pState->GetStringFormat(),
|
||||
m_pState->CurGroupFlowType(), escapeNonAscii);
|
||||
m_pState->CurGroupFlowType(), stringEscaping == StringEscaping::NonAscii);
|
||||
|
||||
if (strFormat == StringFormat::Literal)
|
||||
m_pState->SetMapKeyFormat(YAML::LongKey, FmtScope::Local);
|
||||
@ -708,7 +721,7 @@ Emitter& Emitter::Write(const std::string& str) {
|
||||
Utils::WriteSingleQuotedString(m_stream, str);
|
||||
break;
|
||||
case StringFormat::DoubleQuoted:
|
||||
Utils::WriteDoubleQuotedString(m_stream, str, escapeNonAscii);
|
||||
Utils::WriteDoubleQuotedString(m_stream, str, stringEscaping);
|
||||
break;
|
||||
case StringFormat::Literal:
|
||||
Utils::WriteLiteralString(m_stream, str,
|
||||
@ -814,8 +827,10 @@ Emitter& Emitter::Write(char ch) {
|
||||
if (!good())
|
||||
return *this;
|
||||
|
||||
|
||||
|
||||
PrepareNode(EmitterNodeType::Scalar);
|
||||
Utils::WriteChar(m_stream, ch);
|
||||
Utils::WriteChar(m_stream, ch, GetStringEscapingStyle(m_pState->GetOutputCharset()));
|
||||
StartedScalar();
|
||||
|
||||
return *this;
|
||||
|
||||
@ -231,6 +231,7 @@ bool EmitterState::SetOutputCharset(EMITTER_MANIP value,
|
||||
switch (value) {
|
||||
case EmitNonAscii:
|
||||
case EscapeNonAscii:
|
||||
case EscapeAsJson:
|
||||
_Set(m_charset, value, scope);
|
||||
return true;
|
||||
default:
|
||||
|
||||
@ -218,20 +218,34 @@ bool IsValidLiteralScalar(const std::string& str, FlowType::value flowType,
|
||||
});
|
||||
}
|
||||
|
||||
void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint) {
|
||||
// Splits a supplementary-plane code point into its UTF-16 surrogate pair.
//
// codePoint: a Unicode scalar value in the range U+10000..U+10FFFF. Values
//            outside that range are not checked and yield meaningless output.
// Returns {lead, trail}, with lead in 0xD800..0xDBFF and trail in
// 0xDC00..0xDFFF.
std::pair<uint16_t, uint16_t> EncodeUTF16SurrogatePair(int codePoint) {
  // Folds the mandatory "- 0x10000" into the lead base:
  //   lead = 0xD800 + ((cp - 0x10000) >> 10) = leadOffset + (cp >> 10)
  const uint32_t leadOffset = 0xD800 - (0x10000 >> 10);

  // leadOffset (0xD7C0) has low bits set, so it must be ADDED to the shifted
  // code point; OR-ing the two would clobber bits and produce a value that is
  // not even inside the surrogate range (e.g. U+10000 -> 0xD7C0).
  const uint32_t lead = leadOffset + (static_cast<uint32_t>(codePoint) >> 10);

  // 0xDC00 has its low 10 bits clear, so OR is safe for the trail surrogate.
  const uint32_t trail = 0xDC00u | (static_cast<uint32_t>(codePoint) & 0x3FFu);

  return {static_cast<uint16_t>(lead), static_cast<uint16_t>(trail)};
}
|
||||
|
||||
void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint, StringEscaping::value stringEscapingStyle) {
|
||||
static const char hexDigits[] = "0123456789abcdef";
|
||||
|
||||
out << "\\";
|
||||
int digits = 8;
|
||||
if (codePoint < 0xFF) {
|
||||
if (codePoint < 0xFF && stringEscapingStyle != StringEscaping::JSON) {
|
||||
out << "x";
|
||||
digits = 2;
|
||||
} else if (codePoint < 0xFFFF) {
|
||||
out << "u";
|
||||
digits = 4;
|
||||
} else {
|
||||
} else if (stringEscapingStyle != StringEscaping::JSON) {
|
||||
out << "U";
|
||||
digits = 8;
|
||||
} else {
|
||||
auto surrogatePair = EncodeUTF16SurrogatePair(codePoint);
|
||||
WriteDoubleQuoteEscapeSequence(out, surrogatePair.first, stringEscapingStyle);
|
||||
WriteDoubleQuoteEscapeSequence(out, surrogatePair.second, stringEscapingStyle);
|
||||
return;
|
||||
}
|
||||
|
||||
// Write digits into the escape sequence
|
||||
@ -303,7 +317,7 @@ bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str) {
|
||||
}
|
||||
|
||||
bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
|
||||
bool escapeNonAscii) {
|
||||
StringEscaping::value stringEscaping) {
|
||||
out << "\"";
|
||||
int codePoint;
|
||||
for (std::string::const_iterator i = str.begin();
|
||||
@ -327,16 +341,19 @@ bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
|
||||
case '\b':
|
||||
out << "\\b";
|
||||
break;
|
||||
case '\f':
|
||||
out << "\\f";
|
||||
break;
|
||||
default:
|
||||
if (codePoint < 0x20 ||
|
||||
(codePoint >= 0x80 &&
|
||||
codePoint <= 0xA0)) { // Control characters and non-breaking space
|
||||
WriteDoubleQuoteEscapeSequence(out, codePoint);
|
||||
WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
|
||||
} else if (codePoint == 0xFEFF) { // Byte order marks (ZWNS) should be
|
||||
// escaped (YAML 1.2, sec. 5.2)
|
||||
WriteDoubleQuoteEscapeSequence(out, codePoint);
|
||||
} else if (escapeNonAscii && codePoint > 0x7E) {
|
||||
WriteDoubleQuoteEscapeSequence(out, codePoint);
|
||||
WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
|
||||
} else if (stringEscaping == StringEscaping::NonAscii && codePoint > 0x7E) {
|
||||
WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
|
||||
} else {
|
||||
WriteCodePoint(out, codePoint);
|
||||
}
|
||||
@ -362,7 +379,7 @@ bool WriteLiteralString(ostream_wrapper& out, const std::string& str,
|
||||
return true;
|
||||
}
|
||||
|
||||
bool WriteChar(ostream_wrapper& out, char ch) {
|
||||
bool WriteChar(ostream_wrapper& out, char ch, StringEscaping::value stringEscapingStyle) {
|
||||
if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) {
|
||||
out << ch;
|
||||
} else if (ch == '\"') {
|
||||
@ -373,13 +390,17 @@ bool WriteChar(ostream_wrapper& out, char ch) {
|
||||
out << R"("\n")";
|
||||
} else if (ch == '\b') {
|
||||
out << R"("\b")";
|
||||
} else if (ch == '\r') {
|
||||
out << R"("\r")";
|
||||
} else if (ch == '\f') {
|
||||
out << R"("\f")";
|
||||
} else if (ch == '\\') {
|
||||
out << R"("\\")";
|
||||
} else if (0x20 <= ch && ch <= 0x7e) {
|
||||
out << "\"" << ch << "\"";
|
||||
} else {
|
||||
out << "\"";
|
||||
WriteDoubleQuoteEscapeSequence(out, ch);
|
||||
WriteDoubleQuoteEscapeSequence(out, ch, stringEscapingStyle);
|
||||
out << "\"";
|
||||
}
|
||||
return true;
|
||||
@ -469,7 +490,7 @@ bool WriteTagWithPrefix(ostream_wrapper& out, const std::string& prefix,
|
||||
|
||||
bool WriteBinary(ostream_wrapper& out, const Binary& binary) {
|
||||
WriteDoubleQuotedString(out, EncodeBase64(binary.data(), binary.size()),
|
||||
false);
|
||||
StringEscaping::None);
|
||||
return true;
|
||||
}
|
||||
} // namespace Utils
|
||||
|
||||
@ -24,6 +24,10 @@ struct StringFormat {
|
||||
enum value { Plain, SingleQuoted, DoubleQuoted, Literal };
|
||||
};
|
||||
|
||||
// Escaping style passed to the double-quoted string and char writers.
//   None     - no charset-driven escaping is requested.
//   NonAscii - code points above 0x7E are written as escape sequences.
//   JSON     - escape sequences avoid the YAML-only \x and \U forms: \u with
//              four hex digits is used, and code points that do not fit are
//              split into a UTF-16 surrogate pair.
struct StringEscaping {
|
||||
enum value { None, NonAscii, JSON };
|
||||
};
|
||||
|
||||
namespace Utils {
|
||||
StringFormat::value ComputeStringFormat(const std::string& str,
|
||||
EMITTER_MANIP strFormat,
|
||||
@ -32,10 +36,11 @@ StringFormat::value ComputeStringFormat(const std::string& str,
|
||||
|
||||
bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str);
|
||||
bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
|
||||
bool escapeNonAscii);
|
||||
StringEscaping::value stringEscaping);
|
||||
bool WriteLiteralString(ostream_wrapper& out, const std::string& str,
|
||||
std::size_t indent);
|
||||
bool WriteChar(ostream_wrapper& out, char ch);
|
||||
bool WriteChar(ostream_wrapper& out, char ch,
|
||||
StringEscaping::value stringEscapingStyle);
|
||||
bool WriteComment(ostream_wrapper& out, const std::string& str,
|
||||
std::size_t postCommentIndent);
|
||||
bool WriteAlias(ostream_wrapper& out, const std::string& str);
|
||||
|
||||
@ -813,7 +813,43 @@ TEST_F(EmitterTest, Unicode) {
|
||||
|
||||
TEST_F(EmitterTest, DoubleQuotedUnicode) {
|
||||
out << DoubleQuoted << "\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2";
|
||||
ExpectEmit("\"\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\"");
|
||||
ExpectEmit("\"\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\"");
|
||||
}
|
||||
|
||||
// With DoubleQuoted format and the EscapeAsJson charset, control characters
// 0x01-0x1F must be emitted as \uXXXX (lower-case hex), except the ones with a
// short escape (\b \t \n \f \r). Quote and backslash are backslash-escaped,
// and the non-ASCII UTF-8 sequences are expected to pass through unescaped.
TEST_F(EmitterTest, EscapedJsonString) {
|
||||
out.SetStringFormat(DoubleQuoted);
|
||||
out.SetOutputCharset(EscapeAsJson);
|
||||
out << "\" \\ "
|
||||
"\x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0A \x0B \x0C \x0D \x0E \x0F "
|
||||
"\x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1A \x1B \x1C \x1D \x1E \x1F "
|
||||
"\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2";
|
||||
|
||||
ExpectEmit(R"("\" \\ \u0001 \u0002 \u0003 \u0004 \u0005 \u0006 \u0007 \b \t )"
|
||||
R"(\n \u000b \f \r \u000e \u000f \u0010 \u0011 \u0012 \u0013 )"
|
||||
R"(\u0014 \u0015 \u0016 \u0017 \u0018 \u0019 \u001a \u001b )"
|
||||
R"(\u001c \u001d \u001e \u001f )"
|
||||
"$ \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\"");
|
||||
}
|
||||
|
||||
// With the default output charset, single chars written into a sequence are
// emitted double-quoted: NUL is expected as the \x00 escape, while form feed
// and carriage return use their short escapes \f and \r.
TEST_F(EmitterTest, EscapedCharacters) {
|
||||
out << BeginSeq
|
||||
<< '\x00'
|
||||
<< '\x0C'
|
||||
<< '\x0D'
|
||||
<< EndSeq;
|
||||
|
||||
ExpectEmit("- \"\\x00\"\n- \"\\f\"\n- \"\\r\"");
|
||||
}
|
||||
|
||||
// Same inputs as the default-charset char test, but with EscapeAsJson set:
// NUL is expected as \u0000 instead of \x00, while \f and \r keep their short
// escapes.
TEST_F(EmitterTest, CharactersEscapedAsJson) {
|
||||
out.SetOutputCharset(EscapeAsJson);
|
||||
out << BeginSeq
|
||||
<< '\x00'
|
||||
<< '\x0C'
|
||||
<< '\x0D'
|
||||
<< EndSeq;
|
||||
|
||||
ExpectEmit("- \"\\u0000\"\n- \"\\f\"\n- \"\\r\"");
|
||||
}
|
||||
|
||||
TEST_F(EmitterTest, DoubleQuotedString) {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user