linux: work around io_uring IORING_OP_CLOSE bug (#4059)

Work around a poorly understood bug in older kernels where closing a
file descriptor pointing to /foo/bar results in ETXTBSY errors when
trying to execve("/foo/bar") later on.

The bug seems to have been fixed somewhere between 5.15.85 and 5.15.90.
I couldn't pinpoint the responsible commit but good candidates are the
several data race fixes.

Interestingly, it seems to manifest only when running under Docker so
the possibility of a Docker bug can't be completely ruled out either.

This commit moves uv__kernel_version() from fs.c to linux.c because the
latter now uses it more than the former.

Fixes: https://github.com/nodejs/node/issues/48444
This commit is contained in:
Ben Noordhuis 2023-06-20 13:01:12 +02:00 committed by GitHub
parent 2bf97f123f
commit 1752791c9e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 37 additions and 26 deletions

View File

@ -57,7 +57,6 @@
#if defined(__linux__)
# include <sys/sendfile.h>
# include <sys/utsname.h>
#endif
#if defined(__sun)
@ -899,31 +898,6 @@ out:
#ifdef __linux__
/* Return the running kernel's version packed as
 * major * 65536 + minor * 256 + patch (e.g. 5.15.90 -> 0x050F5A).
 *
 * Returns 0 when uname() fails or when the release string does not begin
 * with "major.minor.patch". Only a successfully computed value is stored in
 * the cache, so a failed lookup is attempted again on the next call.
 */
unsigned uv__kernel_version(void) {
  static _Atomic unsigned cached_version;
  struct utsname u;
  unsigned version;
  unsigned major;
  unsigned minor;
  unsigned patch;

  /* Fast path: a non-zero value means a previous call already computed it. */
  version = atomic_load_explicit(&cached_version, memory_order_relaxed);
  if (version != 0)
    return version;

  if (-1 == uname(&u))
    return 0;

  if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
    return 0;

  /* The patch level only has 8 bits in the packed encoding but long-lived
   * stable series go past 255 (e.g. 4.19.300). Without clamping, the excess
   * would carry into the minor field and make 4.19.300 look like 4.20.44,
   * defeating "older than X.Y.Z" checks. The kernel's own KERNEL_VERSION()
   * macro clamps the same way.
   */
  if (patch > 255)
    patch = 255;

  version = major * 65536 + minor * 256 + patch;
  atomic_store_explicit(&cached_version, version, memory_order_relaxed);

  return version;
}
/* Pre-4.20 kernels have a bug where CephFS uses the RADOS copy-from command
* in copy_file_range() when it shouldn't. There is no workaround except to
* fall back to a regular copy.

View File

@ -48,6 +48,7 @@
#include <sys/sysinfo.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <time.h>
#include <unistd.h>
@ -309,6 +310,31 @@ static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
}
/* Return the running kernel's version packed as
 * major * 65536 + minor * 256 + patch (e.g. 5.15.90 -> 0x050F5A).
 *
 * Returns 0 when uname() fails or when the release string does not begin
 * with "major.minor.patch". Only a successfully computed value is stored in
 * the cache, so a failed lookup is attempted again on the next call.
 */
unsigned uv__kernel_version(void) {
  static _Atomic unsigned cached_version;
  struct utsname u;
  unsigned version;
  unsigned major;
  unsigned minor;
  unsigned patch;

  /* Fast path: a non-zero value means a previous call already computed it. */
  version = atomic_load_explicit(&cached_version, memory_order_relaxed);
  if (version != 0)
    return version;

  if (-1 == uname(&u))
    return 0;

  if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
    return 0;

  /* The patch level only has 8 bits in the packed encoding but long-lived
   * stable series go past 255 (e.g. 4.19.300). Without clamping, the excess
   * would carry into the minor field and make 4.19.300 look like 4.20.44,
   * defeating "older than X.Y.Z" checks. The kernel's own KERNEL_VERSION()
   * macro clamps the same way.
   */
  if (patch > 255)
    patch = 255;

  version = major * 65536 + minor * 256 + patch;
  atomic_store_explicit(&cached_version, version, memory_order_relaxed);

  return version;
}
ssize_t
uv__fs_copy_file_range(int fd_in,
off_t* off_in,
@ -731,6 +757,17 @@ int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) {
struct uv__io_uring_sqe* sqe;
struct uv__iou* iou;
/* Work around a poorly understood bug in older kernels where closing a file
* descriptor pointing to /foo/bar results in ETXTBSY errors when trying to
* execve("/foo/bar") later on. The bug seems to have been fixed somewhere
* between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit
* but good candidates are the several data race fixes. Interestingly, it
* seems to manifest only when running under Docker so the possibility of
* a Docker bug can't be completely ruled out either. Yay, computers.
*/
if (uv__kernel_version() < /* 5.15.90 */ 0x050F5A)
return 0;
iou = &uv__get_internal_fields(loop)->iou;
sqe = uv__iou_get_sqe(iou, loop, req);