urlapi: add CURLU_GET_EMPTY for empty queries and fragments

By default, the API does not return empty queries and fragments when
extracting them, unless this new flag is set.

This also makes the behavior more consistent: without it set, zero
length queries and fragments are considered not present in the URL. With
the flag set, they are returned as zero length strings if they were in
fact present in the URL.

This applies when extracting the individual query and fragment
components and for the full URL.

Closes #13396
This commit is contained in:
Daniel Stenberg 2024-04-17 11:39:25 +02:00
parent 5379dbc248
commit 3eac21d86b
No known key found for this signature in database
GPG Key ID: 5CC908FDB71E12C2
6 changed files with 77 additions and 10 deletions

View File

@ -113,6 +113,18 @@ punycode.
(Added in curl 8.3.0)
## CURLU_GET_EMPTY
When this flag is used in curl_url_get(), it makes the function return empty
query and fragment parts, both when they are extracted individually and when
they are included in the full URL. By default, libcurl otherwise considers
empty parts non-existing.
An empty query part is one where there is nothing following the question mark
(before the possible fragment). An empty fragment part is one where there is
nothing following the hash sign.
(Added in curl 8.8.0)
# PARTS
## CURLUPART_URL

View File

@ -1064,6 +1064,7 @@ CURLU_APPENDQUERY 7.62.0
CURLU_DEFAULT_PORT 7.62.0
CURLU_DEFAULT_SCHEME 7.62.0
CURLU_DISALLOW_USER 7.62.0
CURLU_GET_EMPTY 8.8.0
CURLU_GUESS_SCHEME 7.62.0
CURLU_NO_AUTHORITY 7.67.0
CURLU_NO_DEFAULT_PORT 7.62.0

View File

@ -99,6 +99,9 @@ typedef enum {
#define CURLU_ALLOW_SPACE (1<<11) /* Allow spaces in the URL */
#define CURLU_PUNYCODE (1<<12) /* get the host name in punycode */
#define CURLU_PUNY2IDN (1<<13) /* punycode => IDN conversion */
#define CURLU_GET_EMPTY (1<<14) /* allow empty queries and fragments
when extracting the URL or the
components */
typedef struct Curl_URL CURLU;

View File

@ -79,7 +79,9 @@ struct Curl_URL {
char *path;
char *query;
char *fragment;
long portnum; /* the numerical version */
unsigned short portnum; /* the numerical version */
BIT(query_present); /* to support blank */
BIT(fragment_present); /* to support blank */
};
#define DEFAULT_SCHEME "https"
@ -561,7 +563,7 @@ UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, struct dynbuf *host,
if(rest[0])
return CURLUE_BAD_PORT_NUMBER;
u->portnum = port;
u->portnum = (unsigned short) port;
/* generate a new port number string to get rid of leading zeroes etc */
free(u->port);
u->port = aprintf("%ld", port);
@ -1245,6 +1247,7 @@ static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
fragment = strchr(path, '#');
if(fragment) {
fraglen = pathlen - (fragment - path);
u->fragment_present = TRUE;
if(fraglen > 1) {
/* skip the leading '#' in the copy but include the terminating null */
if(flags & CURLU_URLENCODE) {
@ -1272,6 +1275,7 @@ static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
size_t qlen = fragment ? (size_t)(fragment - query) :
pathlen - (query - path);
pathlen -= qlen;
u->query_present = TRUE;
if(qlen > 1) {
if(flags & CURLU_URLENCODE) {
struct dynbuf enc;
@ -1407,6 +1411,8 @@ CURLU *curl_url_dup(const CURLU *in)
DUP(u, in, fragment);
DUP(u, in, zoneid);
u->portnum = in->portnum;
u->fragment_present = in->fragment_present;
u->query_present = in->query_present;
}
return u;
fail:
@ -1491,10 +1497,16 @@ CURLUcode curl_url_get(const CURLU *u, CURLUPart what,
ptr = u->query;
ifmissing = CURLUE_NO_QUERY;
plusdecode = urldecode;
if(ptr && !ptr[0] && !(flags & CURLU_GET_EMPTY))
/* there was a blank query and the user does not ask for it */
ptr = NULL;
break;
case CURLUPART_FRAGMENT:
ptr = u->fragment;
ifmissing = CURLUE_NO_FRAGMENT;
if(!ptr && u->fragment_present && flags & CURLU_GET_EMPTY)
/* there was a blank fragment and the user asks for it */
ptr = "";
break;
case CURLUPART_URL: {
char *url;
@ -1502,13 +1514,18 @@ CURLUcode curl_url_get(const CURLU *u, CURLUPart what,
char *options = u->options;
char *port = u->port;
char *allochost = NULL;
bool show_fragment =
u->fragment || (u->fragment_present && flags & CURLU_GET_EMPTY);
bool show_query =
(u->query && u->query[0]) ||
(u->query_present && flags & CURLU_GET_EMPTY);
punycode = (flags & CURLU_PUNYCODE)?1:0;
depunyfy = (flags & CURLU_PUNY2IDN)?1:0;
if(u->scheme && strcasecompare("file", u->scheme)) {
url = aprintf("file://%s%s%s",
u->path,
u->fragment? "#": "",
u->fragment? u->fragment : "");
show_fragment ? "#": "",
u->fragment ? u->fragment : "");
}
else if(!u->host)
return CURLUE_NO_HOST;
@ -1596,9 +1613,9 @@ CURLUcode curl_url_get(const CURLU *u, CURLUPart what,
port ? ":": "",
port ? port : "",
u->path ? u->path : "/",
(u->query && u->query[0]) ? "?": "",
(u->query && u->query[0]) ? u->query : "",
u->fragment? "#": "",
show_query ? "?": "",
u->query ? u->query : "",
show_fragment ? "#": "",
u->fragment? u->fragment : "");
free(allochost);
}
@ -1733,9 +1750,11 @@ CURLUcode curl_url_set(CURLU *u, CURLUPart what,
break;
case CURLUPART_QUERY:
storep = &u->query;
u->query_present = FALSE;
break;
case CURLUPART_FRAGMENT:
storep = &u->fragment;
u->fragment_present = FALSE;
break;
default:
return CURLUE_UNKNOWN_PART;
@ -1819,9 +1838,11 @@ CURLUcode curl_url_set(CURLU *u, CURLUPart what,
appendquery = (flags & CURLU_APPENDQUERY)?1:0;
equalsencode = appendquery;
storep = &u->query;
u->query_present = TRUE;
break;
case CURLUPART_FRAGMENT:
storep = &u->fragment;
u->fragment_present = TRUE;
break;
case CURLUPART_URL: {
/*
@ -1972,6 +1993,6 @@ nomem:
/* set after the string, to make it not assigned if the allocation above
fails */
if(port)
u->portnum = port;
u->portnum = (unsigned short)port;
return CURLUE_OK;
}

View File

@ -25,7 +25,7 @@ gopher
Gopher selector
</name>
<command>
gopher://%HOSTIP:%GOPHERPORT/1/selector/SELECTOR/%TESTNUMBER?
gopher://%HOSTIP:%GOPHERPORT/1/selector/SELECTOR/%TESTNUMBER
</command>
</client>
@ -33,7 +33,7 @@ gopher://%HOSTIP:%GOPHERPORT/1/selector/SELECTOR/%TESTNUMBER?
# Verify data after the test has been "shot"
<verify>
<protocol>
/selector/SELECTOR/%TESTNUMBER?
/selector/SELECTOR/%TESTNUMBER
</protocol>
</verify>
</testcase>

View File

@ -151,6 +151,21 @@ struct clearurlcase {
};
static const struct testcase get_parts_list[] ={
{"https://curl.se/#",
"https | [11] | [12] | [13] | curl.se | [15] | / | [16] | ",
0, CURLU_GET_EMPTY, CURLUE_OK},
{"https://curl.se/?#",
"https | [11] | [12] | [13] | curl.se | [15] | / | | ",
0, CURLU_GET_EMPTY, CURLUE_OK},
{"https://curl.se/?",
"https | [11] | [12] | [13] | curl.se | [15] | / | | [17]",
0, CURLU_GET_EMPTY, CURLUE_OK},
{"https://curl.se/?",
"https | [11] | [12] | [13] | curl.se | [15] | / | [16] | [17]",
0, 0, CURLUE_OK},
{"https://curl.se/?#",
"https | [11] | [12] | [13] | curl.se | [15] | / | [16] | [17]",
0, 0, CURLUE_OK},
{"https://curl.se/# ",
"https | [11] | [12] | [13] | curl.se | [15] | / | [16] | %20%20",
CURLU_URLENCODE|CURLU_ALLOW_SPACE, 0, CURLUE_OK},
@ -508,6 +523,9 @@ static const struct testcase get_parts_list[] ={
};
static const struct urltestcase get_url_list[] = {
{"http://user@example.com?#",
"http://user@example.com/?#",
0, CURLU_GET_EMPTY, CURLUE_OK},
/* WHATWG disagrees, it wants "https:/0.0.0.0/" */
{"https://0x.0x.0", "https://0x.0x.0/", 0, 0, CURLUE_OK},
@ -781,6 +799,18 @@ static int checkurl(const char *org, const char *url, const char *out)
3. Extract all components (not URL)
*/
static const struct setgetcase setget_parts_list[] = {
{"https://example.com/",
"query=\"\",",
"https | [11] | [12] | [13] | example.com | [15] | / | | [17]",
0, 0, CURLU_GET_EMPTY, CURLUE_OK},
{"https://example.com/",
"fragment=\"\",",
"https | [11] | [12] | [13] | example.com | [15] | / | [16] | ",
0, 0, CURLU_GET_EMPTY, CURLUE_OK},
{"https://example.com/",
"query=\"\",",
"https | [11] | [12] | [13] | example.com | [15] | / | [16] | [17]",
0, 0, 0, CURLUE_OK},
{"https://example.com",
"path=get,",
"https | [11] | [12] | [13] | example.com | [15] | /get | [16] | [17]",