From 8f32a14afaaa47514a7d28e1e069a8329e2dd939 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski
Date: Tue, 23 May 2023 10:25:38 -0400
Subject: [PATCH] fs: use WTF-8 on Windows (#2970)

This allows working with valid filenames that are not well-formed UTF-16.
WTF-8 is a superset of UTF-8 that does not error when it encounters an
unpaired surrogate but simply allows it.

Fixes: https://github.com/libuv/libuv/issues/2048
Refs: https://simonsapin.github.io/wtf-8/
Replaces: https://github.com/libuv/libuv/pull/2192 by Nikolai Vavilov

Co-authored-by: Jameson Nash
---
Review notes (below the "---" marker, so not part of the commit message):

 * fs__decode_wtf8_char: mask off the lead byte's marker bits of a four-byte
   sequence before the range check. Without the mask every code point above
   U+FFFF compared greater than 0x10FFFF and was rejected.
 * fs__wtf8_to_wide: emit a surrogate pair for every code point above U+FFFF.
   The previous bounds stored U+10000 as a single truncated WCHAR and made
   the assert reject U+10FFFF.
 * Header comments were added to the new conversion helpers; the hunk headers
   and the diffstat account for the extra lines. The blob ids on the "index"
   lines are those of the original patch.
 * Leading whitespace follows libuv's two-space style. If a hunk does not
   apply to your tree, retry with --ignore-whitespace.

 src/win/fs.c     | 332 ++++++++++++++++++++++++++++++++---------------
 test/test-fs.c   |  53 ++++++++
 test/test-list.h |   2 +
 3 files changed, 283 insertions(+), 104 deletions(-)

diff --git a/src/win/fs.c b/src/win/fs.c
index 2fc7481f..9d0614f2 100644
--- a/src/win/fs.c
+++ b/src/win/fs.c
@@ -144,26 +144,111 @@ void uv__fs_init(void) {
 }
 
 
+/* Decode the WTF-8 sequence that starts at **input and return its code point,
+ * or -1 if the sequence is malformed. On return *input points at the last
+ * byte that was examined, so callers step one byte further to reach the next
+ * sequence. */
+static int32_t fs__decode_wtf8_char(const char** input) {
+  uint32_t code_point;
+  uint8_t b1;
+  uint8_t b2;
+  uint8_t b3;
+  uint8_t b4;
+
+  b1 = **input;
+  if (b1 <= 0x7F)
+    return b1; /* ASCII code point */
+  if (b1 < 0xC2)
+    return -1; /* invalid: continuation byte or overlong lead */
+  code_point = b1;
+
+  b2 = *++*input;
+  if ((b2 & 0xC0) != 0x80)
+    return -1; /* invalid: not a continuation byte */
+  code_point = (code_point << 6) | (b2 & 0x3F);
+  if (b1 <= 0xDF)
+    return 0x7FF & code_point; /* two-byte character */
+
+  b3 = *++*input;
+  if ((b3 & 0xC0) != 0x80)
+    return -1; /* invalid: not a continuation byte */
+  code_point = (code_point << 6) | (b3 & 0x3F);
+  if (b1 <= 0xEF)
+    return 0xFFFF & code_point; /* three-byte character */
+
+  b4 = *++*input;
+  if ((b4 & 0xC0) != 0x80)
+    return -1; /* invalid: not a continuation byte */
+  code_point = (code_point << 6) | (b4 & 0x3F);
+  if (b1 <= 0xF4) {
+    /* Keep only the 21 payload bits; the lead byte's marker bits are still
+     * set above them and would make every value look out of range. */
+    code_point &= 0x1FFFFF;
+    if (code_point <= 0x10FFFF)
+      return code_point; /* four-byte character */
+  }
+
+  /* code point too large */
+  return -1;
+}
+
+
+/* Return the number of WCHARs needed to hold the null-terminated WTF-8 string
+ * source_ptr as UTF-16, terminating null included, or -1 if a sequence cannot
+ * be decoded. */
+static ssize_t fs__get_length_wtf8(const char* source_ptr) {
+  size_t w_target_len = 0;
+  int32_t code_point;
+
+  do {
+    code_point = fs__decode_wtf8_char(&source_ptr);
+    if (code_point < 0)
+      return -1;
+    if (code_point > 0xFFFF)
+      w_target_len++;
+    w_target_len++;
+  } while (*source_ptr++);
+  return w_target_len;
+}
+
+
+/* Convert the null-terminated WTF-8 string source_ptr to UTF-16 in w_target,
+ * terminating null included. w_target must have room for the number of
+ * WCHARs that fs__get_length_wtf8 reports for the same string. */
+static void fs__wtf8_to_wide(const char* source_ptr, WCHAR* w_target) {
+  int32_t code_point;
+
+  do {
+    code_point = fs__decode_wtf8_char(&source_ptr);
+    /* fs__get_length_wtf8 should have been called and checked first. */
+    assert(code_point >= 0);
+    if (code_point > 0xFFFF) {
+      assert(code_point <= 0x10FFFF);
+      *w_target++ = (((code_point - 0x10000) >> 10) + 0xD800);
+      *w_target++ = ((code_point - 0x10000) & 0x3FF) + 0xDC00;
+    } else {
+      *w_target++ = code_point;
+    }
+  } while (*source_ptr++);
+}
+
+
 INLINE static int fs__capture_path(uv_fs_t* req, const char* path,
     const char* new_path, const int copy_path) {
-  char* buf;
-  char* pos;
-  ssize_t buf_sz = 0, path_len = 0, pathw_len = 0, new_pathw_len = 0;
+  WCHAR* buf;
+  WCHAR* pos;
+  size_t buf_sz = 0;
+  size_t path_len = 0;
+  ssize_t pathw_len = 0;
+  ssize_t new_pathw_len = 0;
 
   /* new_path can only be set if path is also set. */
   assert(new_path == NULL || path != NULL);
 
   if (path != NULL) {
-    pathw_len = MultiByteToWideChar(CP_UTF8,
-                                    0,
-                                    path,
-                                    -1,
-                                    NULL,
-                                    0);
-    if (pathw_len == 0) {
-      return GetLastError();
-    }
-
+    pathw_len = fs__get_length_wtf8(path);
+    if (pathw_len < 0)
+      return ERROR_INVALID_NAME;
     buf_sz += pathw_len * sizeof(WCHAR);
   }
 
@@ -173,16 +258,9 @@ INLINE static int fs__capture_path(uv_fs_t* req, const char* path,
   }
 
   if (new_path != NULL) {
-    new_pathw_len = MultiByteToWideChar(CP_UTF8,
-                                        0,
-                                        new_path,
-                                        -1,
-                                        NULL,
-                                        0);
-    if (new_pathw_len == 0) {
-      return GetLastError();
-    }
-
+    new_pathw_len = fs__get_length_wtf8(new_path);
+    if (new_pathw_len < 0)
+      return ERROR_INVALID_NAME;
     buf_sz += new_pathw_len * sizeof(WCHAR);
   }
 
@@ -194,7 +272,7 @@ INLINE static int fs__capture_path(uv_fs_t* req, const char* path,
     return 0;
   }
 
-  buf = (char*) uv__malloc(buf_sz);
+  buf = uv__malloc(buf_sz);
   if (buf == NULL) {
     return ERROR_OUTOFMEMORY;
   }
@@ -202,29 +280,17 @@ INLINE static int fs__capture_path(uv_fs_t* req, const char* path,
   pos = buf;
 
   if (path != NULL) {
-    DWORD r = MultiByteToWideChar(CP_UTF8,
-                                  0,
-                                  path,
-                                  -1,
-                                  (WCHAR*) pos,
-                                  pathw_len);
-    assert(r == (DWORD) pathw_len);
-    req->file.pathw = (WCHAR*) pos;
-    pos += r * sizeof(WCHAR);
+    fs__wtf8_to_wide(path, pos);
+    req->file.pathw = pos;
+    pos += pathw_len;
   } else {
     req->file.pathw = NULL;
   }
 
   if (new_path != NULL) {
-    DWORD r = MultiByteToWideChar(CP_UTF8,
-                                  0,
-                                  new_path,
-                                  -1,
-                                  (WCHAR*) pos,
-                                  new_pathw_len);
-    assert(r == (DWORD) new_pathw_len);
-    req->fs.info.new_pathw = (WCHAR*) pos;
-    pos += r * sizeof(WCHAR);
+    fs__wtf8_to_wide(new_path, pos);
+    req->fs.info.new_pathw = pos;
+    pos += new_pathw_len;
   } else {
     req->fs.info.new_pathw = NULL;
   }
@@ -232,8 +298,8 @@ INLINE static int fs__capture_path(uv_fs_t* req, const char* path,
   req->path = path;
   if (path != NULL && copy_path) {
     memcpy(pos, path, path_len);
-    assert(path_len == buf_sz - (pos - buf));
-    req->path = pos;
+    assert(path_len == buf_sz - (pos - buf) * sizeof(WCHAR));
+    req->path = (char*) pos;
   }
 
   req->flags |= UV_FS_FREE_PATHS;
@@ -259,57 +325,120 @@ INLINE static void uv__fs_req_init(uv_loop_t* loop, uv_fs_t* req,
 }
 
 
-static int fs__wide_to_utf8(WCHAR* w_source_ptr,
-                               DWORD w_source_len,
-                               char** target_ptr,
-                               uint64_t* target_len_ptr) {
-  int r;
-  int target_len;
+/* Return the code point that starts at w_source_ptr[0]. A lead surrogate
+ * followed by a trail surrogate is combined into one supplementary code
+ * point; any other unit (even an unpaired surrogate) is returned as-is. */
+static int32_t fs__get_surrogate_value(const WCHAR* w_source_ptr,
+                                       size_t w_source_len) {
+  WCHAR u;
+  WCHAR next;
+
+  u = w_source_ptr[0];
+  if (u >= 0xD800 && u <= 0xDBFF && w_source_len > 1) {
+    next = w_source_ptr[1];
+    if (next >= 0xDC00 && next <= 0xDFFF)
+      return 0x10000 + ((u - 0xD800) << 10) + (next - 0xDC00);
+  }
+  return u;
+}
+
+
+/* Return the number of bytes needed to store the first w_source_len WCHARs
+ * of w_source_ptr as WTF-8; no terminating null byte is accounted for. */
+static size_t fs__get_length_wide(const WCHAR* w_source_ptr,
+                                  size_t w_source_len) {
+  size_t target_len;
+  int32_t code_point;
+
+  target_len = 0;
+  for (; w_source_len; w_source_len--, w_source_ptr++) {
+    code_point = fs__get_surrogate_value(w_source_ptr, w_source_len);
+    /* Can be invalid UTF-8 but must be valid WTF-8. */
+    assert(code_point >= 0);
+    if (code_point < 0x80)
+      target_len += 1;
+    else if (code_point < 0x800)
+      target_len += 2;
+    else if (code_point < 0x10000)
+      target_len += 3;
+    else {
+      target_len += 4;
+      w_source_ptr++;
+      w_source_len--;
+    }
+  }
+  return target_len;
+}
+
+
+static int fs__wide_to_wtf8(WCHAR* w_source_ptr,
+                            size_t w_source_len,
+                            char** target_ptr,
+                            size_t* target_len_ptr) {
+  size_t target_len;
   char* target;
-  target_len = WideCharToMultiByte(CP_UTF8,
-                                   0,
-                                   w_source_ptr,
-                                   w_source_len,
-                                   NULL,
-                                   0,
-                                   NULL,
-                                   NULL);
+  int32_t code_point;
 
-  if (target_len == 0) {
-    return -1;
+  /* If *target_ptr is provided, then *target_len_ptr must be its length
+   * (excluding space for null), otherwise we will compute the target_len_ptr
+   * length and may return a new allocation in *target_ptr if target_ptr is
+   * provided. */
+  if (target_ptr == NULL || *target_ptr == NULL) {
+    target_len = fs__get_length_wide(w_source_ptr, w_source_len);
+    if (target_len_ptr != NULL)
+      *target_len_ptr = target_len;
+  } else {
+    target_len = *target_len_ptr;
   }
 
-  if (target_len_ptr != NULL) {
-    *target_len_ptr = target_len;
-  }
-
-  if (target_ptr == NULL) {
+  if (target_ptr == NULL)
     return 0;
+
+  if (*target_ptr == NULL) {
+    target = uv__malloc(target_len + 1);
+    if (target == NULL) {
+      SetLastError(ERROR_OUTOFMEMORY);
+      return -1;
+    }
+    *target_ptr = target;
+  } else {
+    target = *target_ptr;
   }
 
-  target = uv__malloc(target_len + 1);
-  if (target == NULL) {
-    SetLastError(ERROR_OUTOFMEMORY);
-    return -1;
-  }
+  for (; w_source_len; w_source_len--, w_source_ptr++) {
+    code_point = fs__get_surrogate_value(w_source_ptr, w_source_len);
+    /* Can be invalid UTF-8 but must be valid WTF-8. */
+    assert(code_point >= 0);
+
+    if (code_point < 0x80) {
+      *target++ = code_point;
+    } else if (code_point < 0x800) {
+      *target++ = 0xC0 | (code_point >> 6);
+      *target++ = 0x80 | (code_point & 0x3F);
+    } else if (code_point < 0x10000) {
+      *target++ = 0xE0 | (code_point >> 12);
+      *target++ = 0x80 | ((code_point >> 6) & 0x3F);
+      *target++ = 0x80 | (code_point & 0x3F);
+    } else {
+      *target++ = 0xF0 | (code_point >> 18);
+      *target++ = 0x80 | ((code_point >> 12) & 0x3F);
+      *target++ = 0x80 | ((code_point >> 6) & 0x3F);
+      *target++ = 0x80 | (code_point & 0x3F);
+      w_source_ptr++;
+      w_source_len--;
+    }
+  }
+  assert((size_t) (target - *target_ptr) == target_len);
+
+  *target++ = '\0';
 
-  r = WideCharToMultiByte(CP_UTF8,
-                          0,
-                          w_source_ptr,
-                          w_source_len,
-                          target,
-                          target_len,
-                          NULL,
-                          NULL);
-  assert(r == target_len);
-  target[target_len] = '\0';
-  *target_ptr = target;
   return 0;
 }
 
 
-INLINE static int fs__readlink_handle(HANDLE handle, char** target_ptr,
-    uint64_t* target_len_ptr) {
+INLINE static int fs__readlink_handle(HANDLE handle,
+                                      char** target_ptr,
+                                      size_t* target_len_ptr) {
   char buffer[MAXIMUM_REPARSE_DATA_BUFFER_SIZE];
   REPARSE_DATA_BUFFER* reparse_data = (REPARSE_DATA_BUFFER*) buffer;
   WCHAR* w_target;
@@ -439,7 +568,8 @@ INLINE static int fs__readlink_handle(HANDLE handle, char** target_ptr,
     return -1;
   }
 
-  return fs__wide_to_utf8(w_target, w_target_len, target_ptr, target_len_ptr);
+  assert(target_ptr == NULL || *target_ptr == NULL);
+  return fs__wide_to_wtf8(w_target, w_target_len, target_ptr, target_len_ptr);
 }
 
 
@@ -1429,7 +1559,8 @@ void fs__scandir(uv_fs_t* req) {
       uv__dirent_t* dirent;
 
       size_t wchar_len;
-      size_t utf8_len;
+      size_t wtf8_len;
+      char* wtf8;
 
       /* Obtain a pointer to the current directory entry. */
       position += next_entry_offset;
@@ -1456,11 +1587,8 @@ void fs__scandir(uv_fs_t* req) {
           info->FileName[1] == L'.')
         continue;
 
-      /* Compute the space required to store the filename as UTF-8. */
-      utf8_len = WideCharToMultiByte(
-          CP_UTF8, 0, &info->FileName[0], wchar_len, NULL, 0, NULL, NULL);
-      if (utf8_len == 0)
-        goto win32_error;
+      /* Compute the space required to store the filename as WTF-8. */
+      wtf8_len = fs__get_length_wide(&info->FileName[0], wchar_len);
 
       /* Resize the dirent array if needed. */
       if (dirents_used >= dirents_size) {
@@ -1480,26 +1608,17 @@ void fs__scandir(uv_fs_t* req) {
        * includes room for the first character of the filename, but `utf8_len`
        * doesn't count the NULL terminator at this point.
        */
-      dirent = uv__malloc(sizeof *dirent + utf8_len);
+      dirent = uv__malloc(sizeof *dirent + wtf8_len);
       if (dirent == NULL)
         goto out_of_memory_error;
 
       dirents[dirents_used++] = dirent;
 
       /* Convert file name to UTF-8. */
-      if (WideCharToMultiByte(CP_UTF8,
-                              0,
-                              &info->FileName[0],
-                              wchar_len,
-                              &dirent->d_name[0],
-                              utf8_len,
-                              NULL,
-                              NULL) == 0)
+      wtf8 = &dirent->d_name[0];
+      if (fs__wide_to_wtf8(&info->FileName[0], wchar_len, &wtf8, &wtf8_len) == -1)
         goto win32_error;
 
-      /* Add a null terminator to the filename. */
-      dirent->d_name[utf8_len] = '\0';
-
       /* Fill out the type field. */
       if (info->FileAttributes & FILE_ATTRIBUTE_DEVICE)
         dirent->d_type = UV__DT_CHAR;
@@ -1708,6 +1827,7 @@ void fs__closedir(uv_fs_t* req) {
 
 INLINE static int fs__stat_handle(HANDLE handle, uv_stat_t* statbuf,
     int do_lstat) {
+  size_t target_length = 0;
   FILE_FS_DEVICE_INFORMATION device_info;
   FILE_ALL_INFORMATION file_info;
   FILE_FS_VOLUME_INFORMATION volume_info;
@@ -1803,9 +1923,10 @@ INLINE static int fs__stat_handle(HANDLE handle, uv_stat_t* statbuf,
      * to be treated as a regular file. The higher level lstat function will
      * detect this failure and retry without do_lstat if appropriate.
      */
-    if (fs__readlink_handle(handle, NULL, &statbuf->st_size) != 0)
+    if (fs__readlink_handle(handle, NULL, &target_length) != 0)
       return -1;
     statbuf->st_mode |= S_IFLNK;
+    statbuf->st_size = target_length;
   }
 
   if (statbuf->st_mode == 0) {
@@ -2661,6 +2782,7 @@ static void fs__readlink(uv_fs_t* req) {
     return;
   }
 
+  assert(req->ptr == NULL);
   if (fs__readlink_handle(handle, (char**) &req->ptr, NULL) != 0) {
     DWORD error = GetLastError();
     SET_REQ_WIN32_ERROR(req, error);
@@ -2720,7 +2842,8 @@ static ssize_t fs__realpath_handle(HANDLE handle, char** realpath_ptr) {
     return -1;
   }
 
-  r = fs__wide_to_utf8(w_realpath_ptr, w_realpath_len, realpath_ptr, NULL);
+  assert(*realpath_ptr == NULL);
+  r = fs__wide_to_wtf8(w_realpath_ptr, w_realpath_len, realpath_ptr, NULL);
   uv__free(w_realpath_buf);
   return r;
 }
@@ -2740,6 +2863,7 @@ static void fs__realpath(uv_fs_t* req) {
     return;
   }
 
+  assert(req->ptr == NULL);
   if (fs__realpath_handle(handle, (char**) &req->ptr) == -1) {
     CloseHandle(handle);
     SET_REQ_WIN32_ERROR(req, GetLastError());
diff --git a/test/test-fs.c b/test/test-fs.c
index f9fa20ef..e687dde3 100644
--- a/test/test-fs.c
+++ b/test/test-fs.c
@@ -4550,6 +4550,7 @@ TEST_IMPL(fs_get_system_error) {
   return 0;
 }
 
+
 TEST_IMPL(fs_stat_batch_multiple) {
   uv_fs_t req[300];
   int r;
@@ -4573,3 +4574,55 @@ TEST_IMPL(fs_stat_batch_multiple) {
   MAKE_VALGRIND_HAPPY(loop);
   return 0;
 }
+
+
+#ifdef _WIN32
+TEST_IMPL(fs_wtf) {
+  int r;
+  HANDLE file_handle;
+  uv_dirent_t dent;
+  static char test_file_buf[PATHMAX];
+
+  /* set-up */
+  _wunlink(L"test_dir/hi\xD801\x0037");
+  rmdir("test_dir");
+
+  loop = uv_default_loop();
+
+  r = uv_fs_mkdir(NULL, &mkdir_req, "test_dir", 0777, NULL);
+  ASSERT_EQ(r, 0);
+  uv_fs_req_cleanup(&mkdir_req);
+
+  file_handle = CreateFileW(L"test_dir/hi\xD801\x0037",
+                            GENERIC_WRITE | FILE_WRITE_ATTRIBUTES,
+                            0,
+                            NULL,
+                            CREATE_ALWAYS,
+                            FILE_FLAG_OPEN_REPARSE_POINT |
+                              FILE_FLAG_BACKUP_SEMANTICS,
+                            NULL);
+  ASSERT(file_handle != INVALID_HANDLE_VALUE);
+
+  CloseHandle(file_handle);
+
+  r = uv_fs_scandir(NULL, &scandir_req, "test_dir", 0, NULL);
+  ASSERT_EQ(r, 1);
+  ASSERT_EQ(scandir_req.result, 1);
+  ASSERT_NOT_NULL(scandir_req.ptr);
+  while (UV_EOF != uv_fs_scandir_next(&scandir_req, &dent)) {
+    snprintf(test_file_buf, sizeof(test_file_buf), "test_dir\\%s", dent.name);
+    printf("stat %s\n", test_file_buf);
+    r = uv_fs_stat(NULL, &stat_req, test_file_buf, NULL);
+    ASSERT_EQ(r, 0);
+  }
+  uv_fs_req_cleanup(&scandir_req);
+  ASSERT_NULL(scandir_req.ptr);
+
+  /* clean-up */
+  _wunlink(L"test_dir/hi\xD801\x0037");
+  rmdir("test_dir");
+
+  MAKE_VALGRIND_HAPPY(loop);
+  return 0;
+}
+#endif
diff --git a/test/test-list.h b/test/test-list.h
index 68c9c117..6da78123 100644
--- a/test/test-list.h
+++ b/test/test-list.h
@@ -437,6 +437,7 @@ TEST_DECLARE   (fs_file_flag_no_buffering)
 TEST_DECLARE   (fs_open_readonly_acl)
 TEST_DECLARE   (fs_fchmod_archive_readonly)
 TEST_DECLARE   (fs_invalid_mkdir_name)
+TEST_DECLARE   (fs_wtf)
 #endif
 TEST_DECLARE   (fs_get_system_error)
 TEST_DECLARE   (strscpy)
@@ -1120,6 +1121,7 @@ TASK_LIST_START
   TEST_ENTRY  (fs_open_readonly_acl)
   TEST_ENTRY  (fs_fchmod_archive_readonly)
   TEST_ENTRY  (fs_invalid_mkdir_name)
+  TEST_ENTRY  (fs_wtf)
 #endif
   TEST_ENTRY  (fs_get_system_error)
   TEST_ENTRY  (get_osfhandle_valid_handle)