unix: rethink relaxed accept() approach

Benchmarks demonstrated that the idle handle approach didn't balance the load
fairly enough: the majority of new connections still ended up in just one or
two processes.

The new approach voluntarily gives up a scheduler timeslice by calling
nanosleep() with a one nanosecond timeout.

Why not sched_yield()? Because on Linux (and this is probably true for other
Unices as well), sched_yield() only yields if there are other processes running
on the same CPU.

nanosleep() on the other hand always forces the process to sleep, which gives
other processes a chance to accept our pending connections.
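
As a rough sketch of the idea (plain POSIX, not libuv code; accept_loop, the
connection hand-off, and the shared listen_fd are illustrative only), each
worker accepts a connection and then sleeps for one nanosecond so that sibling
processes blocked in accept() on the same socket get a turn:

#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>
#include <time.h>
#include <unistd.h>

/* Hypothetical per-worker accept loop; listen_fd is a listening socket
 * shared with sibling processes (e.g. inherited across fork()). */
static void accept_loop(int listen_fd) {
  static const struct timespec timeout = { 0, 1 };  /* one nanosecond */

  for (;;) {
    int fd = accept(listen_fd, NULL, NULL);

    if (fd == -1) {
      if (errno == EINTR || errno == ECONNABORTED)
        continue;
      perror("accept");
      break;
    }

    /* ... hand the connection off to the application here ... */
    close(fd);

    /* Unlike sched_yield(), which may be a no-op when nothing else is
     * runnable on this CPU, nanosleep() always puts the process to sleep,
     * giving a sibling blocked in accept() a chance to win the next
     * connection. */
    nanosleep(&timeout, NULL);
  }
}

In the change below, uv__server_io() only performs this sleep when the handle
carries the UV_TCP_SINGLE_ACCEPT flag, which uv_tcp_listen() sets by default.
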
Ben Noordhuis 2012-09-18 00:04:50 +02:00
parent 37dc7472d7
commit be2a2176ce
3 changed files with 10 additions and 59 deletions

@@ -185,8 +185,7 @@ typedef struct {
   int fd; \
   UV_STREAM_PRIVATE_PLATFORM_FIELDS \
 
-#define UV_TCP_PRIVATE_FIELDS \
-  uv_idle_t* idle_handle; /* for UV_TCP_SINGLE_ACCEPT handles */ \
+#define UV_TCP_PRIVATE_FIELDS /* empty */
 
 #define UV_UDP_PRIVATE_FIELDS \
   int fd; \

@@ -386,16 +386,6 @@ void uv__stream_destroy(uv_stream_t* stream) {
 }
 
-static void uv__next_accept(uv_idle_t* idle, int status) {
-  uv_stream_t* stream = idle->data;
-
-  uv_idle_stop(idle);
-
-  if (stream->accepted_fd == -1)
-    uv__io_start(stream->loop, &stream->read_watcher);
-}
-
 /* Implements a best effort approach to mitigating accept() EMFILE errors.
  * We have a spare file descriptor stashed away that we close to get below
  * the EMFILE limit. Next, we accept all pending connections and close them
@@ -497,40 +487,17 @@ void uv__server_io(uv_loop_t* loop, uv__io_t* w, int events) {
     stream->accepted_fd = fd;
     stream->connection_cb(stream, 0);
 
-    if (stream->accepted_fd != -1 ||
-        (stream->type == UV_TCP && stream->flags == UV_TCP_SINGLE_ACCEPT)) {
+    if (stream->accepted_fd != -1) {
       /* The user hasn't yet accepted called uv_accept() */
       uv__io_stop(loop, &stream->read_watcher);
-      break;
+      return;
     }
-  }
 
-  if (stream->fd != -1 &&
-      stream->accepted_fd == -1 &&
-      (stream->type == UV_TCP && stream->flags == UV_TCP_SINGLE_ACCEPT))
-  {
-    /* Defer the next accept() syscall to the next event loop tick.
-     * This lets us guarantee fair load balancing in in multi-process setups.
-     * The problem is as follows:
-     *
-     * 1. Multiple processes listen on the same socket.
-     * 2. The OS scheduler commonly gives preference to one process to
-     *    avoid task switches.
-     * 3. That process therefore accepts most of the new connections,
-     *    leading to a (sometimes very) unevenly distributed load.
-     *
-     * Here is how we mitigate this issue:
-     *
-     * 1. Accept a connection.
-     * 2. Start an idle watcher.
-     * 3. Don't accept new connections until the idle callback fires.
-     *
-     * This works because the callback only fires when there have been
-     * no recent events, i.e. none of the watched file descriptors have
-     * recently been readable or writable.
-     */
-    uv_tcp_t* tcp = (uv_tcp_t*) stream;
-    uv_idle_start(tcp->idle_handle, uv__next_accept);
+    if (stream->type == UV_TCP && (stream->flags & UV_TCP_SINGLE_ACCEPT)) {
+      /* Give other processes a chance to accept connections. */
+      struct timespec timeout = { 0, 1 };
+      nanosleep(&timeout, NULL);
+    }
   }
 }

@@ -30,7 +30,6 @@
 
 int uv_tcp_init(uv_loop_t* loop, uv_tcp_t* tcp) {
   uv__stream_init(loop, (uv_stream_t*)tcp, UV_TCP);
-  tcp->idle_handle = NULL;
   return 0;
 }
@@ -245,20 +244,9 @@ int uv_tcp_listen(uv_tcp_t* tcp, int backlog, uv_connection_cb cb) {
     single_accept = (val == NULL) || (atoi(val) != 0); /* on by default */
   }
 
-  if (!single_accept)
-    goto no_single_accept;
-
-  tcp->idle_handle = malloc(sizeof(*tcp->idle_handle));
-  if (tcp->idle_handle == NULL)
-    return uv__set_sys_error(tcp->loop, ENOMEM);
-
-  if (uv_idle_init(tcp->loop, tcp->idle_handle))
-    abort();
-
-  tcp->idle_handle->flags |= UV__HANDLE_INTERNAL;
-  tcp->flags |= UV_TCP_SINGLE_ACCEPT;
+  if (single_accept)
+    tcp->flags |= UV_TCP_SINGLE_ACCEPT;
 
-no_single_accept:
   if (maybe_new_socket(tcp, AF_INET, UV_STREAM_READABLE))
     return -1;
@@ -397,8 +385,5 @@ int uv_tcp_simultaneous_accepts(uv_tcp_t* handle, int enable) {
 
 void uv__tcp_close(uv_tcp_t* handle) {
-  if (handle->idle_handle)
-    uv_close((uv_handle_t*)handle->idle_handle, (uv_close_cb)free);
-
   uv__stream_close((uv_stream_t*)handle);
 }
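
For context on the benchmark claim above, a throwaway harness along the
following lines (entirely hypothetical, not part of libuv or its test suite)
makes the load distribution visible: fork a few workers that share one
listening socket, let each count its accepted connections, and compare the
per-process totals with and without the nanosleep() call.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <time.h>
#include <unistd.h>

#define NUM_WORKERS 4

/* Each worker accepts connections on the shared socket, closes them right
 * away and periodically prints its running total. The nanosleep() after
 * every accept() is the trick introduced by this commit; comment it out to
 * see the distribution skew towards one or two workers. */
static void worker(int listen_fd) {
  struct timespec timeout = { 0, 1 };  /* one nanosecond */
  unsigned long accepted = 0;

  for (;;) {
    int fd = accept(listen_fd, NULL, NULL);

    if (fd == -1)
      continue;

    close(fd);

    if (++accepted % 1000 == 0)
      printf("pid %ld accepted %lu connections\n", (long) getpid(), accepted);

    nanosleep(&timeout, NULL);
  }
}

int main(void) {
  struct sockaddr_in addr;
  int listen_fd;
  int i;

  listen_fd = socket(AF_INET, SOCK_STREAM, 0);
  if (listen_fd == -1)
    abort();

  memset(&addr, 0, sizeof(addr));
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
  addr.sin_port = htons(8000);  /* arbitrary port for the test */

  if (bind(listen_fd, (struct sockaddr*) &addr, sizeof(addr)) != 0 ||
      listen(listen_fd, 511) != 0)
    abort();

  /* The workers inherit listen_fd across fork() and compete in accept(). */
  for (i = 0; i < NUM_WORKERS; i++)
    if (fork() == 0)
      worker(listen_fd);

  for (;;)
    pause();  /* parent idles; kill the process group to stop the test */
}

Driving this with any TCP load generator against the chosen port and comparing
the per-pid counts is enough to see whether the accepts spread out evenly.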