linux: work around io_uring IORING_OP_CLOSE bug (#4059)

Work around a poorly understood bug in older kernels where closing a file descriptor pointing to /foo/bar results in ETXTBSY errors when trying to execve("/foo/bar") later on. The bug seems to have been fixed somewhere between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit but good candidates are the several data race fixes. Interestingly, it seems to manifest only when running under Docker so the possibility of a Docker bug can't be completely ruled out either. This commit moves uv__kernel_version() from fs.c to linux.c because the latter now uses it more than the former. Fixes: https://github.com/nodejs/node/issues/48444
2023-06-20 13:01:12 +02:00 · 2023-06-20 13:01:12 +02:00 · 1752791c9e
commit 1752791c9e
parent 2bf97f123f
2 changed files with 37 additions and 26 deletions
--- a/src/unix/fs.c
+++ b/src/unix/fs.c
@@ -57,7 +57,6 @@

 #if defined(__linux__)
 # include <sys/sendfile.h>
-# include <sys/utsname.h>
 #endif

 #if defined(__sun)
@@ -899,31 +898,6 @@ out:


 #ifdef __linux__
-unsigned uv__kernel_version(void) {
-  static _Atomic unsigned cached_version;
-  struct utsname u;
-  unsigned version;
-  unsigned major;
-  unsigned minor;
-  unsigned patch;
-
-  version = atomic_load_explicit(&cached_version, memory_order_relaxed);
-  if (version != 0)
-    return version;
-
-  if (-1 == uname(&u))
-    return 0;
-
-  if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
-    return 0;
-
-  version = major * 65536 + minor * 256 + patch;
-  atomic_store_explicit(&cached_version, version, memory_order_relaxed);
-
-  return version;
-}
-
-
 /* Pre-4.20 kernels have a bug where CephFS uses the RADOS copy-from command
 * in copy_file_range() when it shouldn't. There is no workaround except to
 * fall back to a regular copy.
--- a/src/unix/linux.c
+++ b/src/unix/linux.c
@@ -48,6 +48,7 @@
 #include <sys/sysinfo.h>
 #include <sys/sysmacros.h>
 #include <sys/types.h>
+#include <sys/utsname.h>
 #include <time.h>
 #include <unistd.h>

@@ -309,6 +310,31 @@ static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
 }


+unsigned uv__kernel_version(void) {
+  static _Atomic unsigned cached_version;
+  struct utsname u;
+  unsigned version;
+  unsigned major;
+  unsigned minor;
+  unsigned patch;
+
+  version = atomic_load_explicit(&cached_version, memory_order_relaxed);
+  if (version != 0)
+    return version;
+
+  if (-1 == uname(&u))
+    return 0;
+
+  if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
+    return 0;
+
+  version = major * 65536 + minor * 256 + patch;
+  atomic_store_explicit(&cached_version, version, memory_order_relaxed);
+
+  return version;
+}
+
+
 ssize_t
 uv__fs_copy_file_range(int fd_in,
                       off_t* off_in,
@@ -731,6 +757,17 @@ int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

+  /* Work around a poorly understood bug in older kernels where closing a file
+   * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to
+   * execve("/foo/bar") later on. The bug seems to have been fixed somewhere
+   * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit
+   * but good candidates are the several data race fixes. Interestingly, it
+   * seems to manifest only when running under Docker so the possibility of
+   * a Docker bug can't be completely ruled out either. Yay, computers.
+   */
+  if (uv__kernel_version() < /* 5.15.90 */ 0x050F5A)
+    return 0;
+
  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);