tool_cb_wrt: fix invalid unicode for windows console
- Suppress an incomplete UTF-8 sequence at the end of the buffer.
- Attempt to reconstruct an incomplete UTF-8 sequence from prior call(s) in the current call.

Prior to this change, in the Windows console, UTF-8 sequences split between two or more calls to the write callback would cause invalid "replacement characters" (U+FFFD) to be printed instead of the actual Unicode character. This is because in Windows only UTF-16 encoded characters are printed to the console; therefore we convert the UTF-8 contents to UTF-16, which cannot be done with partial UTF-8 sequences.

Reported-by: Maksim Arhipov
Fixes https://github.com/curl/curl/issues/9841
Closes https://github.com/curl/curl/pull/10890
This commit is contained in:
parent
0b947e8ca2
commit
af3f4e419b
@ -87,6 +87,12 @@ size_t tool_header_cb(char *ptr, size_t size, size_t nmemb, void *userdata)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef WIN32
|
||||||
|
/* Discard incomplete UTF-8 sequence buffered from body */
|
||||||
|
if(outs->utf8seq[0])
|
||||||
|
memset(outs->utf8seq, 0, sizeof(outs->utf8seq));
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Write header data when curl option --dump-header (-D) is given.
|
* Write header data when curl option --dump-header (-D) is given.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@ -233,35 +233,132 @@ size_t tool_write_cb(char *buffer, size_t sz, size_t nmemb, void *userdata)
|
|||||||
|
|
||||||
#ifdef WIN32
|
#ifdef WIN32
|
||||||
fhnd = _get_osfhandle(fileno(outs->stream));
|
fhnd = _get_osfhandle(fileno(outs->stream));
|
||||||
|
/* if windows console then UTF-8 must be converted to UTF-16 */
|
||||||
if(isatty(fileno(outs->stream)) &&
|
if(isatty(fileno(outs->stream)) &&
|
||||||
GetConsoleScreenBufferInfo((HANDLE)fhnd, &console_info)) {
|
GetConsoleScreenBufferInfo((HANDLE)fhnd, &console_info)) {
|
||||||
DWORD in_len = (DWORD)(sz * nmemb);
|
wchar_t *wc_buf;
|
||||||
wchar_t* wc_buf;
|
|
||||||
DWORD wc_len;
|
DWORD wc_len;
|
||||||
|
unsigned char *rbuf = (unsigned char *)buffer;
|
||||||
|
DWORD rlen = (DWORD)bytes;
|
||||||
|
|
||||||
/* calculate buffer size for wide characters */
|
#define IS_TRAILING_BYTE(x) (0x80 <= (x) && (x) < 0xC0)
|
||||||
wc_len = MultiByteToWideChar(CP_UTF8, 0, buffer, in_len, NULL, 0);
|
|
||||||
wc_buf = (wchar_t*) malloc(wc_len * sizeof(wchar_t));
|
|
||||||
if(!wc_buf)
|
|
||||||
return CURL_WRITEFUNC_ERROR;
|
|
||||||
|
|
||||||
/* calculate buffer size for multi-byte characters */
|
/* attempt to complete an incomplete UTF-8 sequence from previous call.
|
||||||
wc_len = MultiByteToWideChar(CP_UTF8, 0, buffer, in_len, wc_buf, wc_len);
|
the sequence does not have to be well-formed. */
|
||||||
if(!wc_len) {
|
if(outs->utf8seq[0] && rlen) {
|
||||||
free(wc_buf);
|
bool complete = false;
|
||||||
return CURL_WRITEFUNC_ERROR;
|
/* two byte sequence (lead byte 110yyyyy) */
|
||||||
|
if(0xC0 <= outs->utf8seq[0] && outs->utf8seq[0] < 0xE0) {
|
||||||
|
outs->utf8seq[1] = *rbuf++;
|
||||||
|
--rlen;
|
||||||
|
complete = true;
|
||||||
|
}
|
||||||
|
/* three byte sequence (lead byte 1110zzzz) */
|
||||||
|
else if(0xE0 <= outs->utf8seq[0] && outs->utf8seq[0] < 0xF0) {
|
||||||
|
if(!outs->utf8seq[1]) {
|
||||||
|
outs->utf8seq[1] = *rbuf++;
|
||||||
|
--rlen;
|
||||||
|
}
|
||||||
|
if(rlen && !outs->utf8seq[2]) {
|
||||||
|
outs->utf8seq[2] = *rbuf++;
|
||||||
|
--rlen;
|
||||||
|
complete = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* four byte sequence (lead byte 11110uuu) */
|
||||||
|
else if(0xF0 <= outs->utf8seq[0] && outs->utf8seq[0] < 0xF8) {
|
||||||
|
if(!outs->utf8seq[1]) {
|
||||||
|
outs->utf8seq[1] = *rbuf++;
|
||||||
|
--rlen;
|
||||||
|
}
|
||||||
|
if(rlen && !outs->utf8seq[2]) {
|
||||||
|
outs->utf8seq[2] = *rbuf++;
|
||||||
|
--rlen;
|
||||||
|
}
|
||||||
|
if(rlen && !outs->utf8seq[3]) {
|
||||||
|
outs->utf8seq[3] = *rbuf++;
|
||||||
|
--rlen;
|
||||||
|
complete = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(complete) {
|
||||||
|
WCHAR prefix[3] = {0}; /* UTF-16 (1-2 WCHARs) + NUL */
|
||||||
|
|
||||||
|
if(MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)outs->utf8seq, -1,
|
||||||
|
prefix, sizeof(prefix)/sizeof(prefix[0]))) {
|
||||||
|
DEBUGASSERT(prefix[3] == L'\0');
|
||||||
|
if(!WriteConsoleW(
|
||||||
|
(HANDLE) fhnd,
|
||||||
|
prefix,
|
||||||
|
prefix[1] ? 2 : 1,
|
||||||
|
NULL,
|
||||||
|
NULL)) {
|
||||||
|
return CURL_WRITEFUNC_ERROR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* else: UTF-8 input was not well formed and OS is pre-Vista which
|
||||||
|
drops invalid characters instead of writing U+FFFD to output. */
|
||||||
|
|
||||||
|
memset(outs->utf8seq, 0, sizeof(outs->utf8seq));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(!WriteConsoleW(
|
/* suppress an incomplete utf-8 sequence at end of rbuf */
|
||||||
(HANDLE) fhnd,
|
if(!outs->utf8seq[0] && rlen && (rbuf[rlen - 1] & 0x80)) {
|
||||||
wc_buf,
|
/* check for lead byte from a two, three or four byte sequence */
|
||||||
wc_len,
|
if(0xC0 <= rbuf[rlen - 1] && rbuf[rlen - 1] < 0xF8) {
|
||||||
&wc_len,
|
outs->utf8seq[0] = rbuf[rlen - 1];
|
||||||
NULL)) {
|
rlen -= 1;
|
||||||
free(wc_buf);
|
}
|
||||||
return CURL_WRITEFUNC_ERROR;
|
else if(rlen >= 2 && IS_TRAILING_BYTE(rbuf[rlen - 1])) {
|
||||||
|
/* check for lead byte from a three or four byte sequence */
|
||||||
|
if(0xE0 <= rbuf[rlen - 2] && rbuf[rlen - 2] < 0xF8) {
|
||||||
|
outs->utf8seq[0] = rbuf[rlen - 2];
|
||||||
|
outs->utf8seq[1] = rbuf[rlen - 1];
|
||||||
|
rlen -= 2;
|
||||||
|
}
|
||||||
|
else if(rlen >= 3 && IS_TRAILING_BYTE(rbuf[rlen - 2])) {
|
||||||
|
/* check for lead byte from a four byte sequence */
|
||||||
|
if(0xF0 <= rbuf[rlen - 3] && rbuf[rlen - 3] < 0xF8) {
|
||||||
|
outs->utf8seq[0] = rbuf[rlen - 3];
|
||||||
|
outs->utf8seq[1] = rbuf[rlen - 2];
|
||||||
|
outs->utf8seq[2] = rbuf[rlen - 1];
|
||||||
|
rlen -= 3;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
free(wc_buf);
|
|
||||||
|
if(rlen) {
|
||||||
|
/* calculate buffer size for wide characters */
|
||||||
|
wc_len = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)rbuf, rlen, NULL, 0);
|
||||||
|
if(!wc_len)
|
||||||
|
return CURL_WRITEFUNC_ERROR;
|
||||||
|
|
||||||
|
wc_buf = (wchar_t*) malloc(wc_len * sizeof(wchar_t));
|
||||||
|
if(!wc_buf)
|
||||||
|
return CURL_WRITEFUNC_ERROR;
|
||||||
|
|
||||||
|
wc_len = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)rbuf, rlen, wc_buf,
|
||||||
|
wc_len);
|
||||||
|
if(!wc_len) {
|
||||||
|
free(wc_buf);
|
||||||
|
return CURL_WRITEFUNC_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!WriteConsoleW(
|
||||||
|
(HANDLE) fhnd,
|
||||||
|
wc_buf,
|
||||||
|
wc_len,
|
||||||
|
NULL,
|
||||||
|
NULL)) {
|
||||||
|
free(wc_buf);
|
||||||
|
return CURL_WRITEFUNC_ERROR;
|
||||||
|
}
|
||||||
|
free(wc_buf);
|
||||||
|
}
|
||||||
|
|
||||||
rc = bytes;
|
rc = bytes;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
|||||||
@ -464,6 +464,12 @@ static CURLcode post_per_transfer(struct GlobalConfig *global,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef WIN32
|
||||||
|
/* Discard incomplete UTF-8 sequence buffered from body */
|
||||||
|
if(outs->utf8seq[0])
|
||||||
|
memset(outs->utf8seq, 0, sizeof(outs->utf8seq));
|
||||||
|
#endif
|
||||||
|
|
||||||
/* if retry-max-time is non-zero, make sure we haven't exceeded the
|
/* if retry-max-time is non-zero, make sure we haven't exceeded the
|
||||||
time */
|
time */
|
||||||
if(per->retry_numretries &&
|
if(per->retry_numretries &&
|
||||||
|
|||||||
@ -57,6 +57,9 @@
|
|||||||
* 'init' member holds original file size or offset at which truncation is
|
* 'init' member holds original file size or offset at which truncation is
|
||||||
* taking place. Always zero unless appending to a non-empty regular file.
|
* taking place. Always zero unless appending to a non-empty regular file.
|
||||||
*
|
*
|
||||||
|
* [Windows]
|
||||||
|
* 'utf8seq' member holds an incomplete UTF-8 sequence destined for the console
|
||||||
|
* until it can be completed (1-4 bytes) + NUL.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
struct OutStruct {
|
struct OutStruct {
|
||||||
@ -68,6 +71,9 @@ struct OutStruct {
|
|||||||
FILE *stream;
|
FILE *stream;
|
||||||
curl_off_t bytes;
|
curl_off_t bytes;
|
||||||
curl_off_t init;
|
curl_off_t init;
|
||||||
|
#ifdef WIN32
|
||||||
|
unsigned char utf8seq[5];
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user