fs: use WTF-8 on Windows (#2970)
This allows working with valid filenames that are not well-formed UTF-16. This is a superset of UTF-8, which does not error when it encounters an unpaired surrogate but simply allows it. Fixes: https://github.com/libuv/libuv/issues/2048 Refs: https://simonsapin.github.io/wtf-8/ Replaces: https://github.com/libuv/libuv/pull/2192 by Nikolai Vavilov <vvnicholas@gmail.com> Co-authored-by: Jameson Nash <vtjnash@gmail.com>
This commit is contained in:
parent
6d0d4a3e4d
commit
8f32a14afa
313
src/win/fs.c
313
src/win/fs.c
@ -144,26 +144,97 @@ void uv__fs_init(void) {
|
||||
}
|
||||
|
||||
|
||||
static int32_t fs__decode_wtf8_char(const char** input) {
|
||||
uint32_t code_point;
|
||||
uint8_t b1;
|
||||
uint8_t b2;
|
||||
uint8_t b3;
|
||||
uint8_t b4;
|
||||
|
||||
b1 = **input;
|
||||
if (b1 <= 0x7F)
|
||||
return b1; /* ASCII code point */
|
||||
if (b1 < 0xC2)
|
||||
return -1; /* invalid: continuation byte */
|
||||
code_point = b1;
|
||||
|
||||
b2 = *++*input;
|
||||
if ((b2 & 0xC0) != 0x80)
|
||||
return -1; /* invalid: not a continuation byte */
|
||||
code_point = (code_point << 6) | (b2 & 0x3F);
|
||||
if (b1 <= 0xDF)
|
||||
return 0x7FF & code_point; /* two-byte character */
|
||||
|
||||
b3 = *++*input;
|
||||
if ((b3 & 0xC0) != 0x80)
|
||||
return -1; /* invalid: not a continuation byte */
|
||||
code_point = (code_point << 6) | (b3 & 0x3F);
|
||||
if (b1 <= 0xEF)
|
||||
return 0xFFFF & code_point; /* three-byte character */
|
||||
|
||||
b4 = *++*input;
|
||||
if ((b4 & 0xC0) != 0x80)
|
||||
return -1; /* invalid: not a continuation byte */
|
||||
code_point = (code_point << 6) | (b4 & 0x3F);
|
||||
if (b1 <= 0xF4)
|
||||
if (code_point <= 0x10FFFF)
|
||||
return code_point; /* four-byte character */
|
||||
|
||||
/* code point too large */
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
static ssize_t fs__get_length_wtf8(const char* source_ptr) {
|
||||
size_t w_target_len = 0;
|
||||
int32_t code_point;
|
||||
|
||||
do {
|
||||
code_point = fs__decode_wtf8_char(&source_ptr);
|
||||
if (code_point < 0)
|
||||
return -1;
|
||||
if (code_point > 0xFFFF)
|
||||
w_target_len++;
|
||||
w_target_len++;
|
||||
} while (*source_ptr++);
|
||||
return w_target_len;
|
||||
}
|
||||
|
||||
|
||||
static void fs__wtf8_to_wide(const char* source_ptr, WCHAR* w_target) {
|
||||
int32_t code_point;
|
||||
|
||||
do {
|
||||
code_point = fs__decode_wtf8_char(&source_ptr);
|
||||
/* fs__get_length_wtf8 should have been called and checked first. */
|
||||
assert(code_point >= 0);
|
||||
if (code_point > 0x10000) {
|
||||
assert(code_point < 0x10FFFF);
|
||||
*w_target++ = (((code_point - 0x10000) >> 10) + 0xD800);
|
||||
*w_target++ = ((code_point - 0x10000) & 0x3FF) + 0xDC00;
|
||||
} else {
|
||||
*w_target++ = code_point;
|
||||
}
|
||||
} while (*source_ptr++);
|
||||
}
|
||||
|
||||
|
||||
INLINE static int fs__capture_path(uv_fs_t* req, const char* path,
|
||||
const char* new_path, const int copy_path) {
|
||||
char* buf;
|
||||
char* pos;
|
||||
ssize_t buf_sz = 0, path_len = 0, pathw_len = 0, new_pathw_len = 0;
|
||||
WCHAR* buf;
|
||||
WCHAR* pos;
|
||||
size_t buf_sz = 0;
|
||||
size_t path_len = 0;
|
||||
ssize_t pathw_len = 0;
|
||||
ssize_t new_pathw_len = 0;
|
||||
|
||||
/* new_path can only be set if path is also set. */
|
||||
assert(new_path == NULL || path != NULL);
|
||||
|
||||
if (path != NULL) {
|
||||
pathw_len = MultiByteToWideChar(CP_UTF8,
|
||||
0,
|
||||
path,
|
||||
-1,
|
||||
NULL,
|
||||
0);
|
||||
if (pathw_len == 0) {
|
||||
return GetLastError();
|
||||
}
|
||||
|
||||
pathw_len = fs__get_length_wtf8(path);
|
||||
if (pathw_len < 0)
|
||||
return ERROR_INVALID_NAME;
|
||||
buf_sz += pathw_len * sizeof(WCHAR);
|
||||
}
|
||||
|
||||
@ -173,16 +244,9 @@ INLINE static int fs__capture_path(uv_fs_t* req, const char* path,
|
||||
}
|
||||
|
||||
if (new_path != NULL) {
|
||||
new_pathw_len = MultiByteToWideChar(CP_UTF8,
|
||||
0,
|
||||
new_path,
|
||||
-1,
|
||||
NULL,
|
||||
0);
|
||||
if (new_pathw_len == 0) {
|
||||
return GetLastError();
|
||||
}
|
||||
|
||||
new_pathw_len = fs__get_length_wtf8(new_path);
|
||||
if (new_pathw_len < 0)
|
||||
return ERROR_INVALID_NAME;
|
||||
buf_sz += new_pathw_len * sizeof(WCHAR);
|
||||
}
|
||||
|
||||
@ -194,7 +258,7 @@ INLINE static int fs__capture_path(uv_fs_t* req, const char* path,
|
||||
return 0;
|
||||
}
|
||||
|
||||
buf = (char*) uv__malloc(buf_sz);
|
||||
buf = uv__malloc(buf_sz);
|
||||
if (buf == NULL) {
|
||||
return ERROR_OUTOFMEMORY;
|
||||
}
|
||||
@ -202,29 +266,17 @@ INLINE static int fs__capture_path(uv_fs_t* req, const char* path,
|
||||
pos = buf;
|
||||
|
||||
if (path != NULL) {
|
||||
DWORD r = MultiByteToWideChar(CP_UTF8,
|
||||
0,
|
||||
path,
|
||||
-1,
|
||||
(WCHAR*) pos,
|
||||
pathw_len);
|
||||
assert(r == (DWORD) pathw_len);
|
||||
req->file.pathw = (WCHAR*) pos;
|
||||
pos += r * sizeof(WCHAR);
|
||||
fs__wtf8_to_wide(path, pos);
|
||||
req->file.pathw = pos;
|
||||
pos += pathw_len;
|
||||
} else {
|
||||
req->file.pathw = NULL;
|
||||
}
|
||||
|
||||
if (new_path != NULL) {
|
||||
DWORD r = MultiByteToWideChar(CP_UTF8,
|
||||
0,
|
||||
new_path,
|
||||
-1,
|
||||
(WCHAR*) pos,
|
||||
new_pathw_len);
|
||||
assert(r == (DWORD) new_pathw_len);
|
||||
req->fs.info.new_pathw = (WCHAR*) pos;
|
||||
pos += r * sizeof(WCHAR);
|
||||
fs__wtf8_to_wide(new_path, pos);
|
||||
req->fs.info.new_pathw = pos;
|
||||
pos += new_pathw_len;
|
||||
} else {
|
||||
req->fs.info.new_pathw = NULL;
|
||||
}
|
||||
@ -232,8 +284,8 @@ INLINE static int fs__capture_path(uv_fs_t* req, const char* path,
|
||||
req->path = path;
|
||||
if (path != NULL && copy_path) {
|
||||
memcpy(pos, path, path_len);
|
||||
assert(path_len == buf_sz - (pos - buf));
|
||||
req->path = pos;
|
||||
assert(path_len == buf_sz - (pos - buf) * sizeof(WCHAR));
|
||||
req->path = (char*) pos;
|
||||
}
|
||||
|
||||
req->flags |= UV_FS_FREE_PATHS;
|
||||
@ -259,57 +311,115 @@ INLINE static void uv__fs_req_init(uv_loop_t* loop, uv_fs_t* req,
|
||||
}
|
||||
|
||||
|
||||
static int fs__wide_to_utf8(WCHAR* w_source_ptr,
|
||||
DWORD w_source_len,
|
||||
char** target_ptr,
|
||||
uint64_t* target_len_ptr) {
|
||||
int r;
|
||||
int target_len;
|
||||
static int32_t fs__get_surrogate_value(const WCHAR* w_source_ptr,
|
||||
size_t w_source_len) {
|
||||
WCHAR u;
|
||||
WCHAR next;
|
||||
|
||||
u = w_source_ptr[0];
|
||||
if (u >= 0xD800 && u <= 0xDBFF && w_source_len > 1) {
|
||||
next = w_source_ptr[1];
|
||||
if (next >= 0xDC00 && next <= 0xDFFF)
|
||||
return 0x10000 + ((u - 0xD800) << 10) + (next - 0xDC00);
|
||||
}
|
||||
return u;
|
||||
}
|
||||
|
||||
|
||||
static size_t fs__get_length_wide(const WCHAR* w_source_ptr,
|
||||
size_t w_source_len) {
|
||||
size_t target_len;
|
||||
int32_t code_point;
|
||||
|
||||
target_len = 0;
|
||||
for (; w_source_len; w_source_len--, w_source_ptr++) {
|
||||
code_point = fs__get_surrogate_value(w_source_ptr, w_source_len);
|
||||
/* Can be invalid UTF-8 but must be valid WTF-8. */
|
||||
assert(code_point >= 0);
|
||||
if (code_point < 0x80)
|
||||
target_len += 1;
|
||||
else if (code_point < 0x800)
|
||||
target_len += 2;
|
||||
else if (code_point < 0x10000)
|
||||
target_len += 3;
|
||||
else {
|
||||
target_len += 4;
|
||||
w_source_ptr++;
|
||||
w_source_len--;
|
||||
}
|
||||
}
|
||||
return target_len;
|
||||
}
|
||||
|
||||
|
||||
static int fs__wide_to_wtf8(WCHAR* w_source_ptr,
|
||||
size_t w_source_len,
|
||||
char** target_ptr,
|
||||
size_t* target_len_ptr) {
|
||||
size_t target_len;
|
||||
char* target;
|
||||
target_len = WideCharToMultiByte(CP_UTF8,
|
||||
0,
|
||||
w_source_ptr,
|
||||
w_source_len,
|
||||
NULL,
|
||||
0,
|
||||
NULL,
|
||||
NULL);
|
||||
int32_t code_point;
|
||||
|
||||
if (target_len == 0) {
|
||||
return -1;
|
||||
/* If *target_ptr is provided, then *target_len_ptr must be its length
|
||||
* (excluding space for null), otherwise we will compute the target_len_ptr
|
||||
* length and may return a new allocation in *target_ptr if target_ptr is
|
||||
* provided. */
|
||||
if (target_ptr == NULL || *target_ptr == NULL) {
|
||||
target_len = fs__get_length_wide(w_source_ptr, w_source_len);
|
||||
if (target_len_ptr != NULL)
|
||||
*target_len_ptr = target_len;
|
||||
} else {
|
||||
target_len = *target_len_ptr;
|
||||
}
|
||||
|
||||
if (target_len_ptr != NULL) {
|
||||
*target_len_ptr = target_len;
|
||||
}
|
||||
|
||||
if (target_ptr == NULL) {
|
||||
if (target_ptr == NULL)
|
||||
return 0;
|
||||
|
||||
if (*target_ptr == NULL) {
|
||||
target = uv__malloc(target_len + 1);
|
||||
if (target == NULL) {
|
||||
SetLastError(ERROR_OUTOFMEMORY);
|
||||
return -1;
|
||||
}
|
||||
*target_ptr = target;
|
||||
} else {
|
||||
target = *target_ptr;
|
||||
}
|
||||
|
||||
target = uv__malloc(target_len + 1);
|
||||
if (target == NULL) {
|
||||
SetLastError(ERROR_OUTOFMEMORY);
|
||||
return -1;
|
||||
}
|
||||
for (; w_source_len; w_source_len--, w_source_ptr++) {
|
||||
code_point = fs__get_surrogate_value(w_source_ptr, w_source_len);
|
||||
/* Can be invalid UTF-8 but must be valid WTF-8. */
|
||||
assert(code_point >= 0);
|
||||
|
||||
if (code_point < 0x80) {
|
||||
*target++ = code_point;
|
||||
} else if (code_point < 0x800) {
|
||||
*target++ = 0xC0 | (code_point >> 6);
|
||||
*target++ = 0x80 | (code_point & 0x3F);
|
||||
} else if (code_point < 0x10000) {
|
||||
*target++ = 0xE0 | (code_point >> 12);
|
||||
*target++ = 0x80 | ((code_point >> 6) & 0x3F);
|
||||
*target++ = 0x80 | (code_point & 0x3F);
|
||||
} else {
|
||||
*target++ = 0xF0 | (code_point >> 18);
|
||||
*target++ = 0x80 | ((code_point >> 12) & 0x3F);
|
||||
*target++ = 0x80 | ((code_point >> 6) & 0x3F);
|
||||
*target++ = 0x80 | (code_point & 0x3F);
|
||||
w_source_ptr++;
|
||||
w_source_len--;
|
||||
}
|
||||
}
|
||||
assert((size_t) (target - *target_ptr) == target_len);
|
||||
|
||||
*target++ = '\0';
|
||||
|
||||
r = WideCharToMultiByte(CP_UTF8,
|
||||
0,
|
||||
w_source_ptr,
|
||||
w_source_len,
|
||||
target,
|
||||
target_len,
|
||||
NULL,
|
||||
NULL);
|
||||
assert(r == target_len);
|
||||
target[target_len] = '\0';
|
||||
*target_ptr = target;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
INLINE static int fs__readlink_handle(HANDLE handle, char** target_ptr,
|
||||
uint64_t* target_len_ptr) {
|
||||
INLINE static int fs__readlink_handle(HANDLE handle,
|
||||
char** target_ptr,
|
||||
size_t* target_len_ptr) {
|
||||
char buffer[MAXIMUM_REPARSE_DATA_BUFFER_SIZE];
|
||||
REPARSE_DATA_BUFFER* reparse_data = (REPARSE_DATA_BUFFER*) buffer;
|
||||
WCHAR* w_target;
|
||||
@ -439,7 +549,8 @@ INLINE static int fs__readlink_handle(HANDLE handle, char** target_ptr,
|
||||
return -1;
|
||||
}
|
||||
|
||||
return fs__wide_to_utf8(w_target, w_target_len, target_ptr, target_len_ptr);
|
||||
assert(target_ptr == NULL || *target_ptr == NULL);
|
||||
return fs__wide_to_wtf8(w_target, w_target_len, target_ptr, target_len_ptr);
|
||||
}
|
||||
|
||||
|
||||
@ -1429,7 +1540,8 @@ void fs__scandir(uv_fs_t* req) {
|
||||
uv__dirent_t* dirent;
|
||||
|
||||
size_t wchar_len;
|
||||
size_t utf8_len;
|
||||
size_t wtf8_len;
|
||||
char* wtf8;
|
||||
|
||||
/* Obtain a pointer to the current directory entry. */
|
||||
position += next_entry_offset;
|
||||
@ -1456,11 +1568,8 @@ void fs__scandir(uv_fs_t* req) {
|
||||
info->FileName[1] == L'.')
|
||||
continue;
|
||||
|
||||
/* Compute the space required to store the filename as UTF-8. */
|
||||
utf8_len = WideCharToMultiByte(
|
||||
CP_UTF8, 0, &info->FileName[0], wchar_len, NULL, 0, NULL, NULL);
|
||||
if (utf8_len == 0)
|
||||
goto win32_error;
|
||||
/* Compute the space required to store the filename as WTF-8. */
|
||||
wtf8_len = fs__get_length_wide(&info->FileName[0], wchar_len);
|
||||
|
||||
/* Resize the dirent array if needed. */
|
||||
if (dirents_used >= dirents_size) {
|
||||
@ -1480,26 +1589,17 @@ void fs__scandir(uv_fs_t* req) {
|
||||
* includes room for the first character of the filename, but `utf8_len`
|
||||
* doesn't count the NULL terminator at this point.
|
||||
*/
|
||||
dirent = uv__malloc(sizeof *dirent + utf8_len);
|
||||
dirent = uv__malloc(sizeof *dirent + wtf8_len);
|
||||
if (dirent == NULL)
|
||||
goto out_of_memory_error;
|
||||
|
||||
dirents[dirents_used++] = dirent;
|
||||
|
||||
/* Convert file name to UTF-8. */
|
||||
if (WideCharToMultiByte(CP_UTF8,
|
||||
0,
|
||||
&info->FileName[0],
|
||||
wchar_len,
|
||||
&dirent->d_name[0],
|
||||
utf8_len,
|
||||
NULL,
|
||||
NULL) == 0)
|
||||
wtf8 = &dirent->d_name[0];
|
||||
if (fs__wide_to_wtf8(&info->FileName[0], wchar_len, &wtf8, &wtf8_len) == -1)
|
||||
goto win32_error;
|
||||
|
||||
/* Add a null terminator to the filename. */
|
||||
dirent->d_name[utf8_len] = '\0';
|
||||
|
||||
/* Fill out the type field. */
|
||||
if (info->FileAttributes & FILE_ATTRIBUTE_DEVICE)
|
||||
dirent->d_type = UV__DT_CHAR;
|
||||
@ -1708,6 +1808,7 @@ void fs__closedir(uv_fs_t* req) {
|
||||
|
||||
INLINE static int fs__stat_handle(HANDLE handle, uv_stat_t* statbuf,
|
||||
int do_lstat) {
|
||||
size_t target_length = 0;
|
||||
FILE_FS_DEVICE_INFORMATION device_info;
|
||||
FILE_ALL_INFORMATION file_info;
|
||||
FILE_FS_VOLUME_INFORMATION volume_info;
|
||||
@ -1803,9 +1904,10 @@ INLINE static int fs__stat_handle(HANDLE handle, uv_stat_t* statbuf,
|
||||
* to be treated as a regular file. The higher level lstat function will
|
||||
* detect this failure and retry without do_lstat if appropriate.
|
||||
*/
|
||||
if (fs__readlink_handle(handle, NULL, &statbuf->st_size) != 0)
|
||||
if (fs__readlink_handle(handle, NULL, &target_length) != 0)
|
||||
return -1;
|
||||
statbuf->st_mode |= S_IFLNK;
|
||||
statbuf->st_size = target_length;
|
||||
}
|
||||
|
||||
if (statbuf->st_mode == 0) {
|
||||
@ -2661,6 +2763,7 @@ static void fs__readlink(uv_fs_t* req) {
|
||||
return;
|
||||
}
|
||||
|
||||
assert(req->ptr == NULL);
|
||||
if (fs__readlink_handle(handle, (char**) &req->ptr, NULL) != 0) {
|
||||
DWORD error = GetLastError();
|
||||
SET_REQ_WIN32_ERROR(req, error);
|
||||
@ -2720,7 +2823,8 @@ static ssize_t fs__realpath_handle(HANDLE handle, char** realpath_ptr) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
r = fs__wide_to_utf8(w_realpath_ptr, w_realpath_len, realpath_ptr, NULL);
|
||||
assert(*realpath_ptr == NULL);
|
||||
r = fs__wide_to_wtf8(w_realpath_ptr, w_realpath_len, realpath_ptr, NULL);
|
||||
uv__free(w_realpath_buf);
|
||||
return r;
|
||||
}
|
||||
@ -2740,6 +2844,7 @@ static void fs__realpath(uv_fs_t* req) {
|
||||
return;
|
||||
}
|
||||
|
||||
assert(req->ptr == NULL);
|
||||
if (fs__realpath_handle(handle, (char**) &req->ptr) == -1) {
|
||||
CloseHandle(handle);
|
||||
SET_REQ_WIN32_ERROR(req, GetLastError());
|
||||
|
||||
@ -4550,6 +4550,7 @@ TEST_IMPL(fs_get_system_error) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
TEST_IMPL(fs_stat_batch_multiple) {
|
||||
uv_fs_t req[300];
|
||||
int r;
|
||||
@ -4573,3 +4574,55 @@ TEST_IMPL(fs_stat_batch_multiple) {
|
||||
MAKE_VALGRIND_HAPPY(loop);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#ifdef _WIN32
|
||||
TEST_IMPL(fs_wtf) {
|
||||
int r;
|
||||
HANDLE file_handle;
|
||||
uv_dirent_t dent;
|
||||
static char test_file_buf[PATHMAX];
|
||||
|
||||
/* set-up */
|
||||
_wunlink(L"test_dir/hi\xD801\x0037");
|
||||
rmdir("test_dir");
|
||||
|
||||
loop = uv_default_loop();
|
||||
|
||||
r = uv_fs_mkdir(NULL, &mkdir_req, "test_dir", 0777, NULL);
|
||||
ASSERT_EQ(r, 0);
|
||||
uv_fs_req_cleanup(&mkdir_req);
|
||||
|
||||
file_handle = CreateFileW(L"test_dir/hi\xD801\x0037",
|
||||
GENERIC_WRITE | FILE_WRITE_ATTRIBUTES,
|
||||
0,
|
||||
NULL,
|
||||
CREATE_ALWAYS,
|
||||
FILE_FLAG_OPEN_REPARSE_POINT |
|
||||
FILE_FLAG_BACKUP_SEMANTICS,
|
||||
NULL);
|
||||
ASSERT(file_handle != INVALID_HANDLE_VALUE);
|
||||
|
||||
CloseHandle(file_handle);
|
||||
|
||||
r = uv_fs_scandir(NULL, &scandir_req, "test_dir", 0, NULL);
|
||||
ASSERT_EQ(r, 1);
|
||||
ASSERT_EQ(scandir_req.result, 1);
|
||||
ASSERT_NOT_NULL(scandir_req.ptr);
|
||||
while (UV_EOF != uv_fs_scandir_next(&scandir_req, &dent)) {
|
||||
snprintf(test_file_buf, sizeof(test_file_buf), "test_dir\\%s", dent.name);
|
||||
printf("stat %s\n", test_file_buf);
|
||||
r = uv_fs_stat(NULL, &stat_req, test_file_buf, NULL);
|
||||
ASSERT_EQ(r, 0);
|
||||
}
|
||||
uv_fs_req_cleanup(&scandir_req);
|
||||
ASSERT_NULL(scandir_req.ptr);
|
||||
|
||||
/* clean-up */
|
||||
_wunlink(L"test_dir/hi\xD801\x0037");
|
||||
rmdir("test_dir");
|
||||
|
||||
MAKE_VALGRIND_HAPPY(loop);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -437,6 +437,7 @@ TEST_DECLARE (fs_file_flag_no_buffering)
|
||||
TEST_DECLARE (fs_open_readonly_acl)
|
||||
TEST_DECLARE (fs_fchmod_archive_readonly)
|
||||
TEST_DECLARE (fs_invalid_mkdir_name)
|
||||
TEST_DECLARE (fs_wtf)
|
||||
#endif
|
||||
TEST_DECLARE (fs_get_system_error)
|
||||
TEST_DECLARE (strscpy)
|
||||
@ -1120,6 +1121,7 @@ TASK_LIST_START
|
||||
TEST_ENTRY (fs_open_readonly_acl)
|
||||
TEST_ENTRY (fs_fchmod_archive_readonly)
|
||||
TEST_ENTRY (fs_invalid_mkdir_name)
|
||||
TEST_ENTRY (fs_wtf)
|
||||
#endif
|
||||
TEST_ENTRY (fs_get_system_error)
|
||||
TEST_ENTRY (get_osfhandle_valid_handle)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user