urlapi: add CURLU_PUNYCODE

Allows curl_url_get() to get the punycode version of host names for the
host name and URL parts.

Extend test 1560 to verify.

Closes #10109
This commit is contained in:
Daniel Stenberg 2022-12-26 10:58:37 +01:00
parent cf174810db
commit 901392cbb7
No known key found for this signature in database
GPG Key ID: 5CC908FDB71E12C2
10 changed files with 90 additions and 14 deletions

View File

@ -573,6 +573,7 @@ PSL
pthreads pthreads
PTR PTR
ptr ptr
punycode
py py
pycurl pycurl
QNX QNX

View File

@ -76,6 +76,17 @@ typically using non-ASCII bytes that otherwise will be percent-encoded.
Note that even when not asking for URL encoding, the '%' (byte 37) will be URL Note that even when not asking for URL encoding, the '%' (byte 37) will be URL
encoded to make sure the host name remains valid. encoded to make sure the host name remains valid.
.IP CURLU_PUNYCODE
If set and \fICURLU_URLENCODE\fP is not set, and asked to retrieve the
\fBCURLUPART_HOST\fP or \fBCURLUPART_URL\fP parts, libcurl returns the host
name in its punycode version if it contains any non-ASCII octets (and is an
IDN name).
If libcurl is built without IDN capabilities, using this bit will make
\fIcurl_url_get(3)\fP return \fICURLUE_LACKS_IDN\fP if the host name contains
anything outside the ASCII range.
(Added in curl 7.88.0)
.SH PARTS .SH PARTS
.IP CURLUPART_URL .IP CURLUPART_URL
When asked to return the full URL, \fIcurl_url_get(3)\fP will return a When asked to return the full URL, \fIcurl_url_get(3)\fP will return a

View File

@ -1055,6 +1055,7 @@ CURLU_NO_AUTHORITY 7.67.0
CURLU_NO_DEFAULT_PORT 7.62.0 CURLU_NO_DEFAULT_PORT 7.62.0
CURLU_NON_SUPPORT_SCHEME 7.62.0 CURLU_NON_SUPPORT_SCHEME 7.62.0
CURLU_PATH_AS_IS 7.62.0 CURLU_PATH_AS_IS 7.62.0
CURLU_PUNYCODE 7.88.0
CURLU_URLDECODE 7.62.0 CURLU_URLDECODE 7.62.0
CURLU_URLENCODE 7.62.0 CURLU_URLENCODE 7.62.0
CURLUE_BAD_FILE_URL 7.81.0 CURLUE_BAD_FILE_URL 7.81.0
@ -1071,6 +1072,7 @@ CURLUE_BAD_QUERY 7.81.0
CURLUE_BAD_SCHEME 7.81.0 CURLUE_BAD_SCHEME 7.81.0
CURLUE_BAD_SLASHES 7.81.0 CURLUE_BAD_SLASHES 7.81.0
CURLUE_BAD_USER 7.81.0 CURLUE_BAD_USER 7.81.0
CURLUE_LACKS_IDN 7.88.0
CURLUE_MALFORMED_INPUT 7.62.0 CURLUE_MALFORMED_INPUT 7.62.0
CURLUE_NO_FRAGMENT 7.62.0 CURLUE_NO_FRAGMENT 7.62.0
CURLUE_NO_HOST 7.62.0 CURLUE_NO_HOST 7.62.0

View File

@ -62,6 +62,7 @@ typedef enum {
CURLUE_BAD_SCHEME, /* 27 */ CURLUE_BAD_SCHEME, /* 27 */
CURLUE_BAD_SLASHES, /* 28 */ CURLUE_BAD_SLASHES, /* 28 */
CURLUE_BAD_USER, /* 29 */ CURLUE_BAD_USER, /* 29 */
CURLUE_LACKS_IDN, /* 30 */
CURLUE_LAST CURLUE_LAST
} CURLUcode; } CURLUcode;
@ -95,6 +96,7 @@ typedef enum {
#define CURLU_NO_AUTHORITY (1<<10) /* Allow empty authority when the #define CURLU_NO_AUTHORITY (1<<10) /* Allow empty authority when the
scheme is unknown. */ scheme is unknown. */
#define CURLU_ALLOW_SPACE (1<<11) /* Allow spaces in the URL */ #define CURLU_ALLOW_SPACE (1<<11) /* Allow spaces in the URL */
#define CURLU_PUNYCODE (1<<12) /* get the host name in punycode */
typedef struct Curl_URL CURLU; typedef struct Curl_URL CURLU;

View File

@ -116,7 +116,7 @@ bool Curl_is_ASCII_name(const char *hostname)
* Curl_idn_decode() returns an allocated IDN decoded string if it was * Curl_idn_decode() returns an allocated IDN decoded string if it was
* possible. NULL on error. * possible. NULL on error.
*/ */
static char *Curl_idn_decode(const char *input) static char *idn_decode(const char *input)
{ {
char *decoded = NULL; char *decoded = NULL;
#ifdef USE_LIBIDN2 #ifdef USE_LIBIDN2
@ -144,24 +144,29 @@ static char *Curl_idn_decode(const char *input)
return decoded; return decoded;
} }
/*
 * Curl_idn_decode() wraps idn_decode() and returns its result, or NULL if
 * the conversion failed.
 *
 * In USE_LIBIDN2 builds the string produced by idn_decode() is copied with
 * strdup() and the original is released with idn2_free(), so what is handed
 * back is the strdup() copy. If that strdup() fails, NULL is returned.
 */
char *Curl_idn_decode(const char *input)
{
char *d = idn_decode(input);
#ifdef USE_LIBIDN2
if(d) {
/* swap the libidn2 buffer for a strdup() copy; presumably so callers can
   release the result with the regular free() - confirm against callers */
char *c = strdup(d);
idn2_free(d);
d = c;
}
#endif
return d;
}
/* /*
* Frees data allocated by idnconvert_hostname() * Frees data allocated by idnconvert_hostname()
*/ */
void Curl_free_idnconverted_hostname(struct hostname *host) void Curl_free_idnconverted_hostname(struct hostname *host)
{ {
#if defined(USE_LIBIDN2)
if(host->encalloc) { if(host->encalloc) {
idn2_free(host->encalloc); /* must be freed with idn2_free() since this was /* must be freed with idn2_free() if allocated by libidn */
allocated by libidn */ Curl_idn_free(host->encalloc);
host->encalloc = NULL; host->encalloc = NULL;
} }
#elif defined(USE_WIN32_IDN)
free(host->encalloc); /* must be freed with free() since this was
allocated by Curl_win32_idn_to_ascii */
host->encalloc = NULL;
#else
(void)host;
#endif
} }
#endif /* USE_IDN */ #endif /* USE_IDN */
@ -177,7 +182,7 @@ CURLcode Curl_idnconvert_hostname(struct hostname *host)
#ifdef USE_IDN #ifdef USE_IDN
/* Check name for non-ASCII and convert hostname if we can */ /* Check name for non-ASCII and convert hostname if we can */
if(!Curl_is_ASCII_name(host->name)) { if(!Curl_is_ASCII_name(host->name)) {
char *decoded = Curl_idn_decode(host->name); char *decoded = idn_decode(host->name);
if(decoded) { if(decoded) {
/* successful */ /* successful */
host->encalloc = decoded; host->encalloc = decoded;
@ -190,4 +195,3 @@ CURLcode Curl_idnconvert_hostname(struct hostname *host)
#endif #endif
return CURLE_OK; return CURLE_OK;
} }

View File

@ -32,7 +32,15 @@ CURLcode Curl_idnconvert_hostname(struct hostname *host);
#if defined(USE_LIBIDN2) || defined(USE_WIN32_IDN) #if defined(USE_LIBIDN2) || defined(USE_WIN32_IDN)
#define USE_IDN #define USE_IDN
void Curl_free_idnconverted_hostname(struct hostname *host); void Curl_free_idnconverted_hostname(struct hostname *host);
char *Curl_idn_decode(const char *input);
#ifdef USE_LIBIDN2
#define Curl_idn_free(x) idn2_free(x)
#else
#define Curl_idn_free(x) free(x)
#endif
#else #else
#define Curl_free_idnconverted_hostname(x) #define Curl_free_idnconverted_hostname(x)
#define Curl_idn_decode(x) NULL
#endif #endif
#endif /* HEADER_CURL_IDN_H */ #endif /* HEADER_CURL_IDN_H */

View File

@ -550,6 +550,9 @@ curl_url_strerror(CURLUcode error)
case CURLUE_BAD_USER: case CURLUE_BAD_USER:
return "Bad user"; return "Bad user";
case CURLUE_LACKS_IDN:
return "libcurl lacks IDN support";
case CURLUE_LAST: case CURLUE_LAST:
break; break;
} }

View File

@ -33,6 +33,7 @@
#include "inet_pton.h" #include "inet_pton.h"
#include "inet_ntop.h" #include "inet_ntop.h"
#include "strdup.h" #include "strdup.h"
#include "idn.h"
/* The last 3 #include files should be in this order */ /* The last 3 #include files should be in this order */
#include "curl_printf.h" #include "curl_printf.h"
@ -1379,6 +1380,7 @@ CURLUcode curl_url_get(CURLU *u, CURLUPart what,
char portbuf[7]; char portbuf[7];
bool urldecode = (flags & CURLU_URLDECODE)?1:0; bool urldecode = (flags & CURLU_URLDECODE)?1:0;
bool urlencode = (flags & CURLU_URLENCODE)?1:0; bool urlencode = (flags & CURLU_URLENCODE)?1:0;
bool punycode = FALSE;
bool plusdecode = FALSE; bool plusdecode = FALSE;
(void)flags; (void)flags;
if(!u) if(!u)
@ -1408,6 +1410,7 @@ CURLUcode curl_url_get(CURLU *u, CURLUPart what,
case CURLUPART_HOST: case CURLUPART_HOST:
ptr = u->host; ptr = u->host;
ifmissing = CURLUE_NO_HOST; ifmissing = CURLUE_NO_HOST;
punycode = (flags & CURLU_PUNYCODE)?1:0;
break; break;
case CURLUPART_ZONEID: case CURLUPART_ZONEID:
ptr = u->zoneid; ptr = u->zoneid;
@ -1460,6 +1463,7 @@ CURLUcode curl_url_get(CURLU *u, CURLUPart what,
char *options = u->options; char *options = u->options;
char *port = u->port; char *port = u->port;
char *allochost = NULL; char *allochost = NULL;
punycode = (flags & CURLU_PUNYCODE)?1:0;
if(u->scheme && strcasecompare("file", u->scheme)) { if(u->scheme && strcasecompare("file", u->scheme)) {
url = aprintf("file://%s%s%s", url = aprintf("file://%s%s%s",
u->path, u->path,
@ -1514,6 +1518,17 @@ CURLUcode curl_url_get(CURLU *u, CURLUPart what,
if(!allochost) if(!allochost)
return CURLUE_OUT_OF_MEMORY; return CURLUE_OUT_OF_MEMORY;
} }
else if(punycode) {
if(!Curl_is_ASCII_name(u->host)) {
#ifndef USE_IDN
return CURLUE_LACKS_IDN;
#else
allochost = Curl_idn_decode(u->host);
if(!allochost)
return CURLUE_OUT_OF_MEMORY;
#endif
}
}
else { else {
/* only encode '%' in output host name */ /* only encode '%' in output host name */
char *host = u->host; char *host = u->host;
@ -1611,6 +1626,19 @@ CURLUcode curl_url_get(CURLU *u, CURLUPart what,
free(*part); free(*part);
*part = Curl_dyn_ptr(&enc); *part = Curl_dyn_ptr(&enc);
} }
else if(punycode) {
if(!Curl_is_ASCII_name(u->host)) {
#ifndef USE_IDN
return CURLUE_LACKS_IDN;
#else
char *allochost = Curl_idn_decode(*part);
if(!allochost)
return CURLUE_OUT_OF_MEMORY;
free(*part);
*part = allochost;
#endif
}
}
return CURLUE_OK; return CURLUE_OK;
} }

View File

@ -185,7 +185,8 @@ u26: Bad query
u27: Bad scheme u27: Bad scheme
u28: Unsupported number of slashes following scheme u28: Unsupported number of slashes following scheme
u29: Bad user u29: Bad user
u30: CURLUcode unknown u30: libcurl lacks IDN support
u31: CURLUcode unknown
</stdout> </stdout>
</verify> </verify>

View File

@ -31,6 +31,9 @@
*/ */
#include "test.h" #include "test.h"
#if defined(USE_LIBIDN2) || defined(USE_WIN32_IDN)
#define USE_IDN
#endif
#include "testutil.h" #include "testutil.h"
#include "warnless.h" #include "warnless.h"
@ -138,6 +141,15 @@ struct clearurlcase {
}; };
static const struct testcase get_parts_list[] ={ static const struct testcase get_parts_list[] ={
#ifdef USE_IDN
{"https://räksmörgås.se",
"https | [11] | [12] | [13] | xn--rksmrgs-5wao1o.se | "
"[15] | / | [16] | [17]", 0, CURLU_PUNYCODE, CURLUE_OK},
#else
{"https://räksmörgås.se",
"https | [11] | [12] | [13] | [30] | [15] | / | [16] | [17]",
0, CURLU_PUNYCODE, CURLUE_OK},
#endif
/* https://ℂᵤⓇℒ。𝐒🄴 */ /* https://ℂᵤⓇℒ。𝐒🄴 */
{"https://" {"https://"
"%e2%84%82%e1%b5%a4%e2%93%87%e2%84%92%e3%80%82%f0%9d%90%92%f0%9f%84%b4", "%e2%84%82%e1%b5%a4%e2%93%87%e2%84%92%e3%80%82%f0%9d%90%92%f0%9f%84%b4",
@ -454,6 +466,10 @@ static const struct testcase get_parts_list[] ={
}; };
static const struct urltestcase get_url_list[] = { static const struct urltestcase get_url_list[] = {
#ifdef USE_IDN
{"https://räksmörgås.se/path?q#frag",
"https://xn--rksmrgs-5wao1o.se/path?q#frag", 0, CURLU_PUNYCODE, CURLUE_OK},
#endif
/* unsupported schemes with no guessing enabled */ /* unsupported schemes with no guessing enabled */
{"data:text/html;charset=utf-8;base64,PCFET0NUWVBFIEhUTUw+PG1ldGEgY", {"data:text/html;charset=utf-8;base64,PCFET0NUWVBFIEhUTUw+PG1ldGEgY",
"", 0, 0, CURLUE_UNSUPPORTED_SCHEME}, "", 0, 0, CURLUE_UNSUPPORTED_SCHEME},