add support for some built-in string format specifiers: date-time, date, time, ipv4, ipv6

This commit is contained in:
mxmlnkn 2019-05-02 17:05:24 +02:00 committed by Patrick Boettcher
parent 64461f5b79
commit ad8f158284
10 changed files with 433 additions and 36 deletions

View File

@ -37,7 +37,8 @@ endif()
add_library(json-schema-validator
src/json-schema-draft7.json.cpp
src/json-uri.cpp
src/json-validator.cpp)
src/json-validator.cpp
src/string-format-check.cpp)
set_target_properties(json-schema-validator
PROPERTIES

View File

@ -212,9 +212,26 @@ cmake-variable `JSON_SCHEMA_TEST_SUITE_PATH` will enable the test-target(s).
All required tests are **OK**.
# Format
Optionally, JSON-schema-validator can validate predefined or user-defined formats.
To enable this, the user provides a format-checker function, which the validator
calls whenever a format check is required.
The library contains a default-checker, which does some checks. It needs to be
provided manually to the constructor of the validator:
```C++
json_validator validator(loader,
nlohmann::json_schema::default_string_format_check);
```
# Contributing
Before opening a pull request, please apply the coding style given in the `.clang-format` by running clang-format from the git top-level for all touched files:
Before opening a pull request, please apply the coding style given in the
`.clang-format` by running clang-format from the git top-level for all touched
files:
```shell
git diff master --name-only | grep '\.[ch]pp$' | xargs -P 3 -I{} clang-format -i {}
```

View File

@ -64,7 +64,8 @@ int main(int argc, char *argv[])
}
// 2) create the validator and
json_validator validator(loader, [](const std::string &, const std::string &) {});
json_validator validator(loader,
nlohmann::json_schema::default_string_format_check);
try {
// insert this schema as the root to the validator

View File

@ -156,6 +156,11 @@ public:
operator bool() const { return error_; }
};
/**
 * Checks validity of JSON schema built-in string format specifiers like 'date-time', 'ipv4', ...
 *
 * Intended to be passed as the format-check callback when constructing a json_validator.
 * Returns normally when @p value is acceptable for @p format and reports every failure
 * by throwing.
 *
 * @param format name of the format specifier taken from the schema, e.g. "date-time"
 * @param value  string instance to be checked against that format
 * @throws std::invalid_argument if @p value does not conform to a supported format
 * @throws std::logic_error if @p format is not handled by this checker
 */
void default_string_format_check(const std::string &format, const std::string &value);
class root_schema;
class JSON_SCHEMA_VALIDATOR_API json_validator

View File

@ -1043,6 +1043,7 @@ std::shared_ptr<schema> type_schema::make(json &schema,
return nullptr;
}
} // namespace
namespace
{

311
src/string-format-check.cpp Normal file
View File

@ -0,0 +1,311 @@
#include <json-schema.hpp>
#include <algorithm>
#include <exception>
#include <iostream>
#include <regex>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
/**
* Many of the RegExes are from @see http://jmrware.com/articles/2009/uri_regexp/URI_regex.html
*/
namespace
{
/**
 * Verifies that @p value lies inside the closed interval [@p min, @p max].
 *
 * @tparam T any type that is comparable with >= / <= and streamable to std::ostream
 * @throws std::invalid_argument naming the value and the interval if it lies outside
 */
template <typename T>
void range_check(const T value, const T min, const T max)
{
	const bool withinBounds = (value >= min) && (value <= max);
	if (withinBounds) {
		return;
	}

	// Only build the diagnostic on the failure path.
	std::stringstream message;
	message << "Value " << value << " should be in interval [" << min << "," << max << "] but is not!";
	throw std::invalid_argument(message.str());
}
/**
 * Validates an RFC 3339 "full-date" of the form YYYY-MM-DD.
 *
 * Besides the textual shape, the month must be 01-12 and the day must exist in that
 * month; February is limited to 28 days, or 29 in Gregorian leap years.
 *
 * @param value candidate date string
 * @throws std::invalid_argument if the string is not shaped like YYYY-MM-DD, or if the
 *         month or the day of month is out of range
 * @see rfc3339_date_time_check
 */
void rfc3339_date_check(const std::string &value)
{
	const static std::regex dateRegex{R"(^([0-9]{4})\-([0-9]{2})\-([0-9]{2})$)"};

	std::smatch parts;
	const bool wellFormed = std::regex_match(value, parts, dateRegex);
	if (!wellFormed) {
		throw std::invalid_argument(value + " is not a date string according to RFC 3339.");
	}

	const int year = std::stoi(parts[1].str());
	const int month = std::stoi(parts[2].str());
	const int mday = std::stoi(parts[3].str());

	range_check(month, 1, 12);

	// Pick the last valid day for the (already validated) month.
	int lastDayOfMonth = 31;
	switch (month) {
	case 2: {
		const bool leapYear = (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0));
		lastDayOfMonth = leapYear ? 29 : 28;
		break;
	}
	case 4:
	case 6:
	case 9:
	case 11:
		lastDayOfMonth = 30;
		break;
	default:
		break;
	}

	range_check(mday, 1, lastDayOfMonth);
}
/**
 * Validates an RFC 3339 "full-time": hh:mm:ss, an optional fractional part, and a
 * mandatory offset that is either 'Z'/'z' or a numeric +hh:mm / -hh:mm.
 *
 * @param value candidate time string
 * @throws std::invalid_argument if the string does not have that shape or if any of
 *         the numeric fields is out of range
 * @see rfc3339_date_time_check
 */
void rfc3339_time_check(const std::string &value)
{
	const static std::regex timeRegex{R"(^([0-9]{2})\:([0-9]{2})\:([0-9]{2})(\.[0-9]+)?(?:[Zz]|((?:\+|\-)[0-9]{2})\:([0-9]{2}))$)"};

	std::smatch fields;
	const bool wellFormed = std::regex_match(value, fields, timeRegex);
	if (!wellFormed) {
		throw std::invalid_argument(value + " is not a time string according to RFC 3339.");
	}

	// fields[4] (the fractional seconds) needs no numeric validation.
	const int hour = std::stoi(fields[1].str());
	const int minute = std::stoi(fields[2].str());
	const int second = std::stoi(fields[3].str());

	range_check(hour, 0, 23);
	range_check(minute, 0, 59);
	/**
	 * @todo A leap-second table would allow choosing the exact maximum out of {58,59,60},
	 * but that needs the full date. Accepting 60 unconditionally may let some invalid
	 * times through, yet it never rejects a valid one.
	 */
	range_check(second, 0, 60);

	// The numeric-offset groups only participate when the zone is not given as 'Z'.
	if (fields[5].length() == 0) {
		return;
	}

	const int offsetHour = std::stoi(fields[5].str()); // the sign is part of the captured text
	const int offsetMinute = std::stoi(fields[6].str());
	range_check(offsetHour, -23, 23);
	range_check(offsetMinute, 0, 59);
}
/**
 * Validates an RFC 3339 "date-time" by splitting it at the 'T' separator and handing
 * the two halves to rfc3339_date_check and rfc3339_time_check.
 *
 * Grammar, from https://tools.ietf.org/html/rfc3339#section-5.6 :
 *
 * @verbatim
 * date-fullyear   = 4DIGIT
 * date-month      = 2DIGIT  ; 01-12
 * date-mday       = 2DIGIT  ; 01-28, 01-29, 01-30, 01-31 based on
 *                           ; month/year
 * time-hour       = 2DIGIT  ; 00-23
 * time-minute     = 2DIGIT  ; 00-59
 * time-second     = 2DIGIT  ; 00-58, 00-59, 00-60 based on leap second
 *                           ; rules
 * time-secfrac    = "." 1*DIGIT
 * time-numoffset  = ("+" / "-") time-hour ":" time-minute
 * time-offset     = "Z" / time-numoffset
 *
 * partial-time    = time-hour ":" time-minute ":" time-second
 *                   [time-secfrac]
 * full-date       = date-fullyear "-" date-month "-" date-mday
 * full-time       = partial-time time-offset
 *
 * date-time       = full-date "T" full-time
 * @endverbatim
 * NOTE: Per [ABNF] and ISO8601, the "T" and "Z" characters in this
 *       syntax may alternatively be lower case "t" or "z" respectively.
 *
 * @param value candidate date-time string
 * @throws std::invalid_argument if the overall shape is wrong, or if the date part or
 *         the time part fails its own check
 */
void rfc3339_date_time_check(const std::string &value)
{
	const static std::regex dateTimeRegex{R"(^([0-9]{4}\-[0-9]{2}\-[0-9]{2})[Tt]([0-9]{2}\:[0-9]{2}\:[0-9]{2}(?:\.[0-9]+)?(?:[Zz]|(?:\+|\-)[0-9]{2}\:[0-9]{2}))$)"};

	std::smatch halves;
	if (std::regex_match(value, halves, dateTimeRegex)) {
		// Group 1 is the full-date, group 2 the full-time; each gets its own range checks.
		rfc3339_date_check(halves[1].str());
		rfc3339_time_check(halves[2].str());
		return;
	}

	throw std::invalid_argument(value + " is not a date-time string according to RFC 3339.");
}
// Regex fragments used to assemble the host/IP patterns below. They are plain strings
// (not std::regex objects) so that they can be concatenated into larger expressions.

// One IPv4 octet: 250-255, 200-249, or one to three digits with an optional leading 0/1.
// NOTE(review): the last alternative also accepts zero-padded forms such as "007".
const std::string decOctet{R"((?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))"}; // matches numbers 0-255
// Dotted quad: three "octet." groups followed by a final octet (no anchors included).
const std::string ipv4Address{"(?:" + decOctet + R"(\.){3})" + decOctet};
// One IPv6 group of 1-4 hexadecimal digits.
const std::string h16{R"([0-9A-Fa-f]{1,4})"};
// An h16 group together with its trailing ':' separator.
const std::string h16Left{"(?:" + h16 + ":)"};
// IPv6 address. The first inner group lists the prefixes that must be followed by a
// 32-bit tail (either "h16:h16" or an embedded IPv4 address); the last two alternatives
// are the forms ending in "::" h16 and in a bare "::".
// Presumably mirrors the IPv6address ABNF of RFC 3986 via the jmrware page cited at the
// top of this file - verify against the RFC before modifying.
const std::string ipv6Address{
"(?:"
"(?:" +
h16Left + "{6}"
"|::" +
h16Left + "{5}"
"|(?:" +
h16 + ")?::" + h16Left + "{4}"
"|(?:" +
h16Left + "{0,1}" + h16 + ")?::" + h16Left + "{3}"
"|(?:" +
h16Left + "{0,2}" + h16 + ")?::" + h16Left + "{2}"
"|(?:" +
h16Left + "{0,3}" + h16 + ")?::" + h16Left +
"|(?:" + h16Left + "{0,4}" + h16 + ")?::"
")(?:" +
h16Left + h16 + "|" + ipv4Address + ")"
"|(?:" +
h16Left + "{0,5}" + h16 + ")?::" + h16 +
"|(?:" + h16Left + "{0,6}" + h16 + ")?::"
")"};
// "v<hex>.<chars>" literal for future IP versions.
// NOTE(review): ipvFuture, regName and host are not referenced by the checker code
// visible in this file section.
const std::string ipvFuture{R"([Vv][0-9A-Fa-f]+\.[A-Za-z0-9\-._~!$&'()*+,;=:]+)"};
// Registered name: any number (including zero) of unreserved/sub-delimiter characters
// or %XX percent-encoded bytes.
const std::string regName{R"((?:[A-Za-z0-9\-._~!$&'()*+,;=]|%[0-9A-Fa-f]{2})*)"};
// Host: a bracketed IPv6/IPvFuture literal, an IPv4 address, or a registered name.
const std::string host{
"(?:"
R"(\[(?:)" +
ipv6Address + "|" + ipvFuture + R"()\])" +
"|" + ipv4Address +
"|" + regName +
")"};
// Hostname: dot-separated labels of 1-63 alphanumeric/hyphen characters that neither
// start nor end with a hyphen. This pattern carries its own ^...$ anchors.
// from http://stackoverflow.com/questions/106179/regular-expression-to-match-dns-hostname-or-ip-address
const std::string hostname{R"(^([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])(\.([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]{0,61}[a-zA-Z0-9]))*$)"};
/**
* @see https://tools.ietf.org/html/rfc5322#section-4.1
*
* @verbatim
* atom = [CFWS] 1*atext [CFWS]
* word = atom / quoted-string
* phrase = 1*word / obs-phrase
* obs-FWS = 1*WSP *(CRLF 1*WSP)
* FWS = ([*WSP CRLF] 1*WSP) / obs-FWS
* ; Folding white space
* ctext = %d33-39 / ; Printable US-ASCII
* %d42-91 / ; characters not including
* %d93-126 / ; "(", ")", or "\"
* obs-ctext
* ccontent = ctext / quoted-pair / comment
* comment = "(" *([FWS] ccontent) [FWS] ")"
* CFWS = (1*([FWS] comment) [FWS]) / FWS
* obs-local-part = word *("." word)
* obs-domain = atom *("." atom)
* obs-dtext = obs-NO-WS-CTL / quoted-pair
* quoted-pair = ("\" (VCHAR / WSP)) / obs-qp
* obs-NO-WS-CTL = %d1-8 / ; US-ASCII control
* %d11 / ; characters that do not
* %d12 / ; include the carriage
* %d14-31 / ; return, line feed, and
* %d127 ; white space characters
* obs-ctext = obs-NO-WS-CTL
* obs-qtext = obs-NO-WS-CTL
* obs-utext = %d0 / obs-NO-WS-CTL / VCHAR
* obs-qp = "\" (%d0 / obs-NO-WS-CTL / LF / CR)
* obs-body = *((*LF *CR *((%d0 / text) *LF *CR)) / CRLF)
* obs-unstruct = *((*LF *CR *(obs-utext *LF *CR)) / FWS)
* obs-phrase = word *(word / "." / CFWS)
* obs-phrase-list = [phrase / CFWS] *("," [phrase / CFWS])
* qtext = %d33 / ; Printable US-ASCII
* %d35-91 / ; characters not including
* %d93-126 / ; "\" or the quote character
* obs-qtext
* qcontent = qtext / quoted-pair
* quoted-string = [CFWS]
* DQUOTE *([FWS] qcontent) [FWS] DQUOTE
* [CFWS]
* atext = ALPHA / DIGIT / ; Printable US-ASCII
* "!" / "#" / ; characters not including
* "$" / "%" / ; specials. Used for atoms.
* "&" / "'" /
* "*" / "+" /
* "-" / "/" /
* "=" / "?" /
* "^" / "_" /
* "`" / "{" /
* "|" / "}" /
* "~"
* dot-atom-text = 1*atext *("." 1*atext)
* dot-atom = [CFWS] dot-atom-text [CFWS]
* addr-spec = local-part "@" domain
* local-part = dot-atom / quoted-string / obs-local-part
* domain = dot-atom / domain-literal / obs-domain
* domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]
* dtext = %d33-90 / ; Printable US-ASCII
* %d94-126 / ; characters not including
* obs-dtext ; "[", "]", or "\"
* @endverbatim
* @todo Currently don't have a working tool for this larger ABNF to generate a regex.
* Other options:
* - https://github.com/ldthomas/apg-6.3
* - https://github.com/akr/abnf
*
* The main difficulty is the whitespace (even newlines) that RFC 5322 allows inside an address.
* Ignoring that whitespace, starting from
* @see https://stackoverflow.com/questions/13992403/regex-validation-of-email-addresses-according-to-rfc5321-rfc5322
* and splitting that complicated regex into pieces named after the ABNF rules of rfc5322 yields:
*/
// obs-NO-WS-CTL: the control characters 1-8, 11, 12, 14-31 and 127.
const std::string obsnowsctl{R"([\x01-\x08\x0b\x0c\x0e-\x1f\x7f])"};
// Quoted pair as used here: a backslash followed by any character in 1-9, 11, 12 or
// 14-127 (i.e. everything up to 0x7f except NUL, LF and CR).
const std::string obsqp{R"(\\[\x01-\x09\x0b\x0c\x0e-\x7f])"};
// qtext: 0x21, 0x23-0x5b, 0x5d-0x7e (printable characters without '"' and '\'),
// or one of the obs-NO-WS-CTL control characters.
const std::string qtext{R"((?:[\x21\x23-\x5b\x5d-\x7e]|)" + obsnowsctl + ")"};
// dtext (RFC 5322): %d33-90 / %d94-126 plus the obs-NO-WS-CTL control characters, i.e.
// everything a domain literal may contain unescaped - which excludes "[", "\" and "]".
// The second printable range has to start at \x5e (94); starting it at \x53 re-admitted
// \x5b-\x5d, exactly the three characters dtext must not match.
const std::string dtext{R"([\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x5e-\x7f])"};
// Quoted string: a double-quoted run of qtext characters and/or quoted pairs
// (no folding whitespace or comments are accepted here).
const std::string quotedString{R"("(?:)" + qtext + "|" + obsqp + R"()*")"};
// atext: a single character that is allowed inside an atom.
const std::string atext{R"([A-Za-z0-9!#$%&'*+/=?^_`{|}~-])"};
// Domain literal in square brackets: three "octet." groups followed by either a fourth
// octet (an IPv4 literal) or a "<tag>:" prefix and one or more dtext/quoted-pair characters.
const std::string domainLiteral{R"(\[(?:(?:)" + decOctet + R"()\.){3}(?:)" + decOctet + R"(|[A-Za-z0-9-]*[A-Za-z0-9]:(?:)" + dtext + "|" + obsqp + R"()+)\])"};
// dot-atom-text: one or more atext runs separated by single dots.
const std::string dotAtom{"(?:" + atext + R"(+(?:\.)" + atext + "+)*)"};
// Domain name as written in the StackOverflow regex: at least two dot-separated labels,
// each alphanumeric with optional inner hyphens.
const std::string stackoverflowMagicPart{R"((?:[[:alnum:]](?:[[:alnum:]-]*[[:alnum:]])?\.)+)"
R"([[:alnum:]](?:[[:alnum:]-]*[[:alnum:]])?)"};
// addr-spec: (dot-atom | quoted-string) "@" (domain name | domain literal).
// The pattern has no anchors of its own.
const std::string email{"(?:" + dotAtom + "|" + quotedString + ")@(?:" + stackoverflowMagicPart + "|" + domainLiteral + ")"};
} // namespace
namespace nlohmann
{
namespace json_schema
{
/**
 * Checks validity for built-ins by converting the definitions given as ABNF in the linked RFC from
 * @see https://json-schema.org/understanding-json-schema/reference/string.html#built-in-formats
 * into regular expressions using @see https://www.msweet.org/abnf/ and some manual editing.
 *
 * @see https://json-schema.org/latest/json-schema-validation.html
 *
 * Handled formats: "date-time", "date", "time", "email", "hostname", "ipv4", "ipv6" and
 * "regex". The function returns normally when @p value is acceptable.
 *
 * @param format name of the format specifier as written in the schema
 * @param value  string instance to validate
 * @throws std::invalid_argument if @p value does not conform to a handled format
 * @throws std::regex_error if @p format is "regex" and @p value is not a valid
 *         ECMAScript regular expression
 * @throws std::logic_error if @p format is a draft-7 built-in that is not implemented
 *         here, or is not known at all
 */
void default_string_format_check(const std::string &format, const std::string &value)
{
	if (format == "date-time") {
		rfc3339_date_time_check(value);
	} else if (format == "date") {
		rfc3339_date_check(value);
	} else if (format == "time") {
		rfc3339_time_check(value);
	} else if (format == "email") {
		// std::regex_match requires the whole string to match, so no anchors are needed.
		static const std::regex emailRegex{email};
		if (!std::regex_match(value, emailRegex)) {
			throw std::invalid_argument(value + " is not a valid email according to RFC 5322.");
		}
	} else if (format == "hostname") {
		static const std::regex hostRegex{hostname};
		if (!std::regex_match(value, hostRegex)) {
			throw std::invalid_argument(value + " is not a valid hostname according to RFC 3986 Appendix A.");
		}
	} else if (format == "ipv4") {
		const static std::regex ipv4Regex{"^" + ipv4Address + "$"};
		if (!std::regex_match(value, ipv4Regex)) {
			throw std::invalid_argument(value + " is not an IPv4 string according to RFC 2673.");
		}
	} else if (format == "ipv6") {
		static const std::regex ipv6Regex{ipv6Address};
		if (!std::regex_match(value, ipv6Regex)) {
			throw std::invalid_argument(value + " is not an IPv6 string according to RFC 5954.");
		}
	} else if (format == "regex") {
		// Compiling the pattern is the whole check. The std::regex_error is deliberately
		// left to propagate untouched: catching it as std::exception& and re-throwing that
		// reference by value would copy-slice it down to a bare std::exception, discarding
		// both the dynamic type and the diagnostic text.
		const std::regex compiled(value, std::regex::ECMAScript);
		static_cast<void>(compiled);
	} else {
		/* yet unsupported JSON schema draft 7 built-ins */
		static const std::vector<std::string> jsonSchemaStringFormatBuiltIns{
		    "date-time", "time", "date", "email", "idn-email", "hostname", "idn-hostname", "ipv4", "ipv6", "uri",
		    "uri-reference", "iri", "iri-reference", "uri-template", "json-pointer", "relative-json-pointer", "regex"};
		// Distinguish "known built-in, not implemented" from "completely unknown format".
		if (std::find(jsonSchemaStringFormatBuiltIns.begin(), jsonSchemaStringFormatBuiltIns.end(), format) != jsonSchemaStringFormatBuiltIns.end()) {
			throw std::logic_error("JSON schema string format built-in " + format + " not yet supported. " +
			                       "Please open an issue or use a custom format checker.");
		}
		throw std::logic_error("Don't know how to validate " + format);
	}
}
} // namespace json_schema
} // namespace nlohmann

View File

@ -34,3 +34,10 @@ add_test(NAME issue-70 COMMAND issue-70)
add_executable(issue-70-root-schema-constructor issue-70-root-schema-constructor.cpp)
target_link_libraries(issue-70-root-schema-constructor json-schema-validator)
add_test(NAME issue-70-root-schema-constructor COMMAND issue-70-root-schema-constructor)
# Unit test for string format checks
add_executable("string-format-check-test" "string-format-check-test.cpp")
target_include_directories("string-format-check-test" PRIVATE "${PROJECT_SOURCE_DIR}/src/")
target_link_libraries("string-format-check-test" json-schema-validator)
add_test(NAME "string-format-check-test" COMMAND "string-format-check-test")

View File

@ -54,16 +54,12 @@ if(JSON_SCHEMA_TEST_SUITE_PATH)
JSON-Suite::Optional::zeroTerminatedFloats
JSON-Suite::Optional::ecmascript-regex
JSON-Suite::Optional::Format::date-time
JSON-Suite::Optional::Format::date
JSON-Suite::Optional::Format::email
JSON-Suite::Optional::Format::idn-email
JSON-Suite::Optional::Format::idn-hostname
JSON-Suite::Optional::Format::iri-reference
JSON-Suite::Optional::Format::iri
JSON-Suite::Optional::Format::json-pointer
JSON-Suite::Optional::Format::relative-json-pointer
JSON-Suite::Optional::Format::time
JSON-Suite::Optional::Format::uri-reference
JSON-Suite::Optional::Format::uri-template
JSON-Suite::Optional::Format::uri

View File

@ -16,34 +16,6 @@ using nlohmann::json;
using nlohmann::json_uri;
using nlohmann::json_schema::json_validator;
/**
 * Format-check callback for the test runner.
 *
 * Supports "hostname", "ipv4", "ipv6" and "regex"; returns normally when @p value is
 * acceptable for @p format.
 *
 * @param format name of the format specifier from the schema
 * @param value  string instance to validate
 * @throws std::invalid_argument if @p value does not match the pattern of its format
 * @throws std::regex_error if @p format is "regex" and @p value does not compile as an
 *         ECMAScript regular expression
 * @throws std::logic_error if @p format is none of the supported names
 */
static void format_check(const std::string &format, const std::string &value)
{
	// The patterns are function-local statics so each is compiled once instead of on
	// every call.
	if (format == "hostname") {
		// from http://stackoverflow.com/questions/106179/regular-expression-to-match-dns-hostname-or-ip-address
		static const std::regex re(R"(^([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])(\.([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]{0,61}[a-zA-Z0-9]))*$)");
		if (!std::regex_match(value, re))
			throw std::invalid_argument(value + " is not a valid hostname.");
	} else if (format == "ipv4") {
		static const std::regex re(R"(^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$)");
		if (!std::regex_match(value, re))
			throw std::invalid_argument(value + " is not a IPv4-address.");
	} else if (format == "ipv6") {
		static const std::regex re(R"((([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])))");
		if (!std::regex_match(value, re))
			throw std::invalid_argument(value + " is not a IPv6-address.");
	} else if (format == "regex") {
		// Compiling the pattern is the check itself. The std::regex_error is left to
		// propagate: re-throwing a caught std::exception& by value would copy-slice it
		// and lose both its dynamic type and its message.
		const std::regex re(value, std::regex::ECMAScript);
		static_cast<void>(re);
	} else
		throw std::logic_error("don't know how to validate " + format);
}
static void loader(const json_uri &uri, json &schema)
{
if (uri.location() == "http://json-schema.org/draft-07/schema") {
@ -89,7 +61,8 @@ int main(void)
const auto &schema = test_group["schema"];
json_validator validator(loader, format_check);
json_validator validator(loader,
nlohmann::json_schema::default_string_format_check);
validator.set_root_schema(schema);

View File

@ -0,0 +1,85 @@
#include <iostream>
#include <json-schema.hpp>
/**
 * Runs the default string format checker over a table of (input, expectedValid) pairs
 * and logs every case to stdout, every mismatch to stderr.
 *
 * @param format           format specifier handed to the checker, e.g. "date-time"
 * @param stringValidTests inputs paired with whether they are expected to validate
 * @return number of failed tests
 */
size_t
testStringFormat(const std::string &format,
                 const std::vector<std::pair<std::string, bool>> &stringValidTests)
{
	size_t failures = 0;

	for (const auto &testCase : stringValidTests) {
		const std::string &candidate = testCase.first;
		const bool expectedValid = testCase.second;

		std::cout << "[INFO] Testing " << format << ": " << candidate << "\n";

		// The checker signals rejection by throwing; translate that into a flag first.
		bool accepted = true;
		try {
			nlohmann::json_schema::default_string_format_check(format, candidate);
		} catch (const std::exception &exception) {
			accepted = false;
			std::cout << "[INFO] Validation failed with: " << exception.what() << "\n";
		}

		if (accepted && !expectedValid) {
			++failures;
			std::cerr << "[ERROR] String with " << format << " format '" << candidate
			          << "' validated even though it should NOT!\n";
		} else if (!accepted && expectedValid) {
			++failures;
			std::cerr << "[ERROR] String with " << format << " format '" << candidate
			          << "' did NOT validate even though it should!\n";
		}
	}

	return failures;
}
/**
 * Unit test driver for default_string_format_check: runs the "date-time" and "ipv4"
 * tables below through testStringFormat.
 *
 * @return 0 if every case behaved as expected, 1 otherwise
 */
int main()
{
	size_t numberOfErrors = 0;

	const std::vector<std::pair<std::string, bool>> dateTimeChecks{
	    {"1985-04-12T23:20:50.52Z", true},
	    {"1996-12-19T16:39:57-08:00", true},
	    {"1990-12-31T23:59:60Z", true},
	    {"1990-12-31T15:59:60-08:00", true},
	    {"1937-01-01T12:00:27.87+00:20", true},
	    {"1985-4-12T23:20:50.52Z", false},
	    {"1985-04-12T23:20:50.52", false},
	    {"1985-04-12T24:00:00", false},
	    {"", false},
	    {"2019-04-30T11:11:11+01:00", true},
	    {"2019-04-31T11:11:11+01:00", false},
	    {"2019-02-28T11:11:11+01:00", true},
	    {"2019-02-29T11:11:11+01:00", false},
	    {"2020-02-29T11:11:11+01:00", true},
	    {"2020-02-30T11:11:11+01:00", false},
	    {"2020-02-29T23:59:59+01:00", true},
	    {"2020-02-29T23:59:60+01:00", true},
	    {"2020-02-29T23:60:59+01:00", false},
	    {"2019-09-30T11:11:11+01:00", true},
	    {"2019-09-31T11:11:11+01:00", false},
	    {"2019-09-30T11:11:11+23:59", true},
	    {"2019-09-30T11:11:11+24:00", false}};
	numberOfErrors += testStringFormat("date-time", dateTimeChecks);

	const std::vector<std::pair<std::string, bool>> ipv4Checks{
	    {"", false},
	    {"x99.99.99.99", false},
	    {"99.99.99.99x", false},
	    {"192.168.0.1", true},
	    {"127.0.0", false},
	    {"127.0.0.1", true},
	    {"127.0.0.0.1", false},
	    {"255.255.255.255", true},
	    {"255.255.255.256", false},
	    {"255.255.256.255", false},
	    {"255.256.255.255", false},
	    {"256.255.255.255", false},
	    {"256.256.256.256", false},
	    {"0x7f000001", false}};
	numberOfErrors += testStringFormat("ipv4", ipv4Checks);

	// Do not return the raw count: it would be narrowed from size_t to int, and exit
	// statuses are commonly truncated to 8 bits, so a multiple of 256 failures would be
	// reported as success.
	return numberOfErrors == 0 ? 0 : 1;
}