fs: use WTF-8 on Windows (#2970)

This allows working with valid filenames that are not well-formed
UTF-16. This is a superset of UTF-8, which does not error when it
encounters an unpaired surrogate but simply allows it.

Fixes: https://github.com/libuv/libuv/issues/2048
Refs: https://simonsapin.github.io/wtf-8/
Replaces: https://github.com/libuv/libuv/pull/2192 by Nikolai Vavilov <vvnicholas@gmail.com>
Co-authored-by: Jameson Nash <vtjnash@gmail.com>
This commit is contained in:
Stefan Karpinski 2023-05-23 10:25:38 -04:00 committed by GitHub
parent 6d0d4a3e4d
commit 8f32a14afa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 264 additions and 104 deletions

View File

@ -144,26 +144,97 @@ void uv__fs_init(void) {
}
static int32_t fs__decode_wtf8_char(const char** input) {
uint32_t code_point;
uint8_t b1;
uint8_t b2;
uint8_t b3;
uint8_t b4;
b1 = **input;
if (b1 <= 0x7F)
return b1; /* ASCII code point */
if (b1 < 0xC2)
return -1; /* invalid: continuation byte */
code_point = b1;
b2 = *++*input;
if ((b2 & 0xC0) != 0x80)
return -1; /* invalid: not a continuation byte */
code_point = (code_point << 6) | (b2 & 0x3F);
if (b1 <= 0xDF)
return 0x7FF & code_point; /* two-byte character */
b3 = *++*input;
if ((b3 & 0xC0) != 0x80)
return -1; /* invalid: not a continuation byte */
code_point = (code_point << 6) | (b3 & 0x3F);
if (b1 <= 0xEF)
return 0xFFFF & code_point; /* three-byte character */
b4 = *++*input;
if ((b4 & 0xC0) != 0x80)
return -1; /* invalid: not a continuation byte */
code_point = (code_point << 6) | (b4 & 0x3F);
if (b1 <= 0xF4)
if (code_point <= 0x10FFFF)
return code_point; /* four-byte character */
/* code point too large */
return -1;
}
static ssize_t fs__get_length_wtf8(const char* source_ptr) {
size_t w_target_len = 0;
int32_t code_point;
do {
code_point = fs__decode_wtf8_char(&source_ptr);
if (code_point < 0)
return -1;
if (code_point > 0xFFFF)
w_target_len++;
w_target_len++;
} while (*source_ptr++);
return w_target_len;
}
static void fs__wtf8_to_wide(const char* source_ptr, WCHAR* w_target) {
int32_t code_point;
do {
code_point = fs__decode_wtf8_char(&source_ptr);
/* fs__get_length_wtf8 should have been called and checked first. */
assert(code_point >= 0);
if (code_point > 0x10000) {
assert(code_point < 0x10FFFF);
*w_target++ = (((code_point - 0x10000) >> 10) + 0xD800);
*w_target++ = ((code_point - 0x10000) & 0x3FF) + 0xDC00;
} else {
*w_target++ = code_point;
}
} while (*source_ptr++);
}
INLINE static int fs__capture_path(uv_fs_t* req, const char* path,
const char* new_path, const int copy_path) {
char* buf;
char* pos;
ssize_t buf_sz = 0, path_len = 0, pathw_len = 0, new_pathw_len = 0;
WCHAR* buf;
WCHAR* pos;
size_t buf_sz = 0;
size_t path_len = 0;
ssize_t pathw_len = 0;
ssize_t new_pathw_len = 0;
/* new_path can only be set if path is also set. */
assert(new_path == NULL || path != NULL);
if (path != NULL) {
pathw_len = MultiByteToWideChar(CP_UTF8,
0,
path,
-1,
NULL,
0);
if (pathw_len == 0) {
return GetLastError();
}
pathw_len = fs__get_length_wtf8(path);
if (pathw_len < 0)
return ERROR_INVALID_NAME;
buf_sz += pathw_len * sizeof(WCHAR);
}
@ -173,16 +244,9 @@ INLINE static int fs__capture_path(uv_fs_t* req, const char* path,
}
if (new_path != NULL) {
new_pathw_len = MultiByteToWideChar(CP_UTF8,
0,
new_path,
-1,
NULL,
0);
if (new_pathw_len == 0) {
return GetLastError();
}
new_pathw_len = fs__get_length_wtf8(new_path);
if (new_pathw_len < 0)
return ERROR_INVALID_NAME;
buf_sz += new_pathw_len * sizeof(WCHAR);
}
@ -194,7 +258,7 @@ INLINE static int fs__capture_path(uv_fs_t* req, const char* path,
return 0;
}
buf = (char*) uv__malloc(buf_sz);
buf = uv__malloc(buf_sz);
if (buf == NULL) {
return ERROR_OUTOFMEMORY;
}
@ -202,29 +266,17 @@ INLINE static int fs__capture_path(uv_fs_t* req, const char* path,
pos = buf;
if (path != NULL) {
DWORD r = MultiByteToWideChar(CP_UTF8,
0,
path,
-1,
(WCHAR*) pos,
pathw_len);
assert(r == (DWORD) pathw_len);
req->file.pathw = (WCHAR*) pos;
pos += r * sizeof(WCHAR);
fs__wtf8_to_wide(path, pos);
req->file.pathw = pos;
pos += pathw_len;
} else {
req->file.pathw = NULL;
}
if (new_path != NULL) {
DWORD r = MultiByteToWideChar(CP_UTF8,
0,
new_path,
-1,
(WCHAR*) pos,
new_pathw_len);
assert(r == (DWORD) new_pathw_len);
req->fs.info.new_pathw = (WCHAR*) pos;
pos += r * sizeof(WCHAR);
fs__wtf8_to_wide(new_path, pos);
req->fs.info.new_pathw = pos;
pos += new_pathw_len;
} else {
req->fs.info.new_pathw = NULL;
}
@ -232,8 +284,8 @@ INLINE static int fs__capture_path(uv_fs_t* req, const char* path,
req->path = path;
if (path != NULL && copy_path) {
memcpy(pos, path, path_len);
assert(path_len == buf_sz - (pos - buf));
req->path = pos;
assert(path_len == buf_sz - (pos - buf) * sizeof(WCHAR));
req->path = (char*) pos;
}
req->flags |= UV_FS_FREE_PATHS;
@ -259,57 +311,115 @@ INLINE static void uv__fs_req_init(uv_loop_t* loop, uv_fs_t* req,
}
static int fs__wide_to_utf8(WCHAR* w_source_ptr,
DWORD w_source_len,
char** target_ptr,
uint64_t* target_len_ptr) {
int r;
int target_len;
static int32_t fs__get_surrogate_value(const WCHAR* w_source_ptr,
size_t w_source_len) {
WCHAR u;
WCHAR next;
u = w_source_ptr[0];
if (u >= 0xD800 && u <= 0xDBFF && w_source_len > 1) {
next = w_source_ptr[1];
if (next >= 0xDC00 && next <= 0xDFFF)
return 0x10000 + ((u - 0xD800) << 10) + (next - 0xDC00);
}
return u;
}
static size_t fs__get_length_wide(const WCHAR* w_source_ptr,
size_t w_source_len) {
size_t target_len;
int32_t code_point;
target_len = 0;
for (; w_source_len; w_source_len--, w_source_ptr++) {
code_point = fs__get_surrogate_value(w_source_ptr, w_source_len);
/* Can be invalid UTF-8 but must be valid WTF-8. */
assert(code_point >= 0);
if (code_point < 0x80)
target_len += 1;
else if (code_point < 0x800)
target_len += 2;
else if (code_point < 0x10000)
target_len += 3;
else {
target_len += 4;
w_source_ptr++;
w_source_len--;
}
}
return target_len;
}
static int fs__wide_to_wtf8(WCHAR* w_source_ptr,
size_t w_source_len,
char** target_ptr,
size_t* target_len_ptr) {
size_t target_len;
char* target;
target_len = WideCharToMultiByte(CP_UTF8,
0,
w_source_ptr,
w_source_len,
NULL,
0,
NULL,
NULL);
int32_t code_point;
if (target_len == 0) {
return -1;
/* If *target_ptr is provided, then *target_len_ptr must be its length
* (excluding space for null), otherwise we will compute the target_len_ptr
* length and may return a new allocation in *target_ptr if target_ptr is
* provided. */
if (target_ptr == NULL || *target_ptr == NULL) {
target_len = fs__get_length_wide(w_source_ptr, w_source_len);
if (target_len_ptr != NULL)
*target_len_ptr = target_len;
} else {
target_len = *target_len_ptr;
}
if (target_len_ptr != NULL) {
*target_len_ptr = target_len;
}
if (target_ptr == NULL) {
if (target_ptr == NULL)
return 0;
if (*target_ptr == NULL) {
target = uv__malloc(target_len + 1);
if (target == NULL) {
SetLastError(ERROR_OUTOFMEMORY);
return -1;
}
*target_ptr = target;
} else {
target = *target_ptr;
}
target = uv__malloc(target_len + 1);
if (target == NULL) {
SetLastError(ERROR_OUTOFMEMORY);
return -1;
}
for (; w_source_len; w_source_len--, w_source_ptr++) {
code_point = fs__get_surrogate_value(w_source_ptr, w_source_len);
/* Can be invalid UTF-8 but must be valid WTF-8. */
assert(code_point >= 0);
if (code_point < 0x80) {
*target++ = code_point;
} else if (code_point < 0x800) {
*target++ = 0xC0 | (code_point >> 6);
*target++ = 0x80 | (code_point & 0x3F);
} else if (code_point < 0x10000) {
*target++ = 0xE0 | (code_point >> 12);
*target++ = 0x80 | ((code_point >> 6) & 0x3F);
*target++ = 0x80 | (code_point & 0x3F);
} else {
*target++ = 0xF0 | (code_point >> 18);
*target++ = 0x80 | ((code_point >> 12) & 0x3F);
*target++ = 0x80 | ((code_point >> 6) & 0x3F);
*target++ = 0x80 | (code_point & 0x3F);
w_source_ptr++;
w_source_len--;
}
}
assert((size_t) (target - *target_ptr) == target_len);
*target++ = '\0';
r = WideCharToMultiByte(CP_UTF8,
0,
w_source_ptr,
w_source_len,
target,
target_len,
NULL,
NULL);
assert(r == target_len);
target[target_len] = '\0';
*target_ptr = target;
return 0;
}
INLINE static int fs__readlink_handle(HANDLE handle, char** target_ptr,
uint64_t* target_len_ptr) {
INLINE static int fs__readlink_handle(HANDLE handle,
char** target_ptr,
size_t* target_len_ptr) {
char buffer[MAXIMUM_REPARSE_DATA_BUFFER_SIZE];
REPARSE_DATA_BUFFER* reparse_data = (REPARSE_DATA_BUFFER*) buffer;
WCHAR* w_target;
@ -439,7 +549,8 @@ INLINE static int fs__readlink_handle(HANDLE handle, char** target_ptr,
return -1;
}
return fs__wide_to_utf8(w_target, w_target_len, target_ptr, target_len_ptr);
assert(target_ptr == NULL || *target_ptr == NULL);
return fs__wide_to_wtf8(w_target, w_target_len, target_ptr, target_len_ptr);
}
@ -1429,7 +1540,8 @@ void fs__scandir(uv_fs_t* req) {
uv__dirent_t* dirent;
size_t wchar_len;
size_t utf8_len;
size_t wtf8_len;
char* wtf8;
/* Obtain a pointer to the current directory entry. */
position += next_entry_offset;
@ -1456,11 +1568,8 @@ void fs__scandir(uv_fs_t* req) {
info->FileName[1] == L'.')
continue;
/* Compute the space required to store the filename as UTF-8. */
utf8_len = WideCharToMultiByte(
CP_UTF8, 0, &info->FileName[0], wchar_len, NULL, 0, NULL, NULL);
if (utf8_len == 0)
goto win32_error;
/* Compute the space required to store the filename as WTF-8. */
wtf8_len = fs__get_length_wide(&info->FileName[0], wchar_len);
/* Resize the dirent array if needed. */
if (dirents_used >= dirents_size) {
@ -1480,26 +1589,17 @@ void fs__scandir(uv_fs_t* req) {
* includes room for the first character of the filename, but `utf8_len`
* doesn't count the NULL terminator at this point.
*/
dirent = uv__malloc(sizeof *dirent + utf8_len);
dirent = uv__malloc(sizeof *dirent + wtf8_len);
if (dirent == NULL)
goto out_of_memory_error;
dirents[dirents_used++] = dirent;
/* Convert file name to UTF-8. */
if (WideCharToMultiByte(CP_UTF8,
0,
&info->FileName[0],
wchar_len,
&dirent->d_name[0],
utf8_len,
NULL,
NULL) == 0)
wtf8 = &dirent->d_name[0];
if (fs__wide_to_wtf8(&info->FileName[0], wchar_len, &wtf8, &wtf8_len) == -1)
goto win32_error;
/* Add a null terminator to the filename. */
dirent->d_name[utf8_len] = '\0';
/* Fill out the type field. */
if (info->FileAttributes & FILE_ATTRIBUTE_DEVICE)
dirent->d_type = UV__DT_CHAR;
@ -1708,6 +1808,7 @@ void fs__closedir(uv_fs_t* req) {
INLINE static int fs__stat_handle(HANDLE handle, uv_stat_t* statbuf,
int do_lstat) {
size_t target_length = 0;
FILE_FS_DEVICE_INFORMATION device_info;
FILE_ALL_INFORMATION file_info;
FILE_FS_VOLUME_INFORMATION volume_info;
@ -1803,9 +1904,10 @@ INLINE static int fs__stat_handle(HANDLE handle, uv_stat_t* statbuf,
* to be treated as a regular file. The higher level lstat function will
* detect this failure and retry without do_lstat if appropriate.
*/
if (fs__readlink_handle(handle, NULL, &statbuf->st_size) != 0)
if (fs__readlink_handle(handle, NULL, &target_length) != 0)
return -1;
statbuf->st_mode |= S_IFLNK;
statbuf->st_size = target_length;
}
if (statbuf->st_mode == 0) {
@ -2661,6 +2763,7 @@ static void fs__readlink(uv_fs_t* req) {
return;
}
assert(req->ptr == NULL);
if (fs__readlink_handle(handle, (char**) &req->ptr, NULL) != 0) {
DWORD error = GetLastError();
SET_REQ_WIN32_ERROR(req, error);
@ -2720,7 +2823,8 @@ static ssize_t fs__realpath_handle(HANDLE handle, char** realpath_ptr) {
return -1;
}
r = fs__wide_to_utf8(w_realpath_ptr, w_realpath_len, realpath_ptr, NULL);
assert(*realpath_ptr == NULL);
r = fs__wide_to_wtf8(w_realpath_ptr, w_realpath_len, realpath_ptr, NULL);
uv__free(w_realpath_buf);
return r;
}
@ -2740,6 +2844,7 @@ static void fs__realpath(uv_fs_t* req) {
return;
}
assert(req->ptr == NULL);
if (fs__realpath_handle(handle, (char**) &req->ptr) == -1) {
CloseHandle(handle);
SET_REQ_WIN32_ERROR(req, GetLastError());

View File

@ -4550,6 +4550,7 @@ TEST_IMPL(fs_get_system_error) {
return 0;
}
TEST_IMPL(fs_stat_batch_multiple) {
uv_fs_t req[300];
int r;
@ -4573,3 +4574,55 @@ TEST_IMPL(fs_stat_batch_multiple) {
MAKE_VALGRIND_HAPPY(loop);
return 0;
}
#ifdef _WIN32
TEST_IMPL(fs_wtf) {
int r;
HANDLE file_handle;
uv_dirent_t dent;
static char test_file_buf[PATHMAX];
/* set-up */
_wunlink(L"test_dir/hi\xD801\x0037");
rmdir("test_dir");
loop = uv_default_loop();
r = uv_fs_mkdir(NULL, &mkdir_req, "test_dir", 0777, NULL);
ASSERT_EQ(r, 0);
uv_fs_req_cleanup(&mkdir_req);
file_handle = CreateFileW(L"test_dir/hi\xD801\x0037",
GENERIC_WRITE | FILE_WRITE_ATTRIBUTES,
0,
NULL,
CREATE_ALWAYS,
FILE_FLAG_OPEN_REPARSE_POINT |
FILE_FLAG_BACKUP_SEMANTICS,
NULL);
ASSERT(file_handle != INVALID_HANDLE_VALUE);
CloseHandle(file_handle);
r = uv_fs_scandir(NULL, &scandir_req, "test_dir", 0, NULL);
ASSERT_EQ(r, 1);
ASSERT_EQ(scandir_req.result, 1);
ASSERT_NOT_NULL(scandir_req.ptr);
while (UV_EOF != uv_fs_scandir_next(&scandir_req, &dent)) {
snprintf(test_file_buf, sizeof(test_file_buf), "test_dir\\%s", dent.name);
printf("stat %s\n", test_file_buf);
r = uv_fs_stat(NULL, &stat_req, test_file_buf, NULL);
ASSERT_EQ(r, 0);
}
uv_fs_req_cleanup(&scandir_req);
ASSERT_NULL(scandir_req.ptr);
/* clean-up */
_wunlink(L"test_dir/hi\xD801\x0037");
rmdir("test_dir");
MAKE_VALGRIND_HAPPY(loop);
return 0;
}
#endif

View File

@ -437,6 +437,7 @@ TEST_DECLARE (fs_file_flag_no_buffering)
TEST_DECLARE (fs_open_readonly_acl)
TEST_DECLARE (fs_fchmod_archive_readonly)
TEST_DECLARE (fs_invalid_mkdir_name)
TEST_DECLARE (fs_wtf)
#endif
TEST_DECLARE (fs_get_system_error)
TEST_DECLARE (strscpy)
@ -1120,6 +1121,7 @@ TASK_LIST_START
TEST_ENTRY (fs_open_readonly_acl)
TEST_ENTRY (fs_fchmod_archive_readonly)
TEST_ENTRY (fs_invalid_mkdir_name)
TEST_ENTRY (fs_wtf)
#endif
TEST_ENTRY (fs_get_system_error)
TEST_ENTRY (get_osfhandle_valid_handle)