From 52e1a9e9973afdc39c70410d6034e3fe7e5bd458 Mon Sep 17 00:00:00 2001
From: Ryan Dahl <ry@tinyclouds.org>
Date: Sat, 6 Aug 2011 00:23:59 -0700
Subject: [PATCH] Upgrade eio (it has better msvs support)

---
 include/eio.h           |  116 ++--
 src/eio/Changes         |   30 +-
 src/eio/Makefile.am     |    2 +-
 src/eio/autogen.sh      |    8 +-
 src/eio/config_darwin.h |   61 +-
 src/eio/configure.ac    |    8 +-
 src/eio/ecb.h           |  370 ++++++++++
 src/eio/eio.c           | 1410 +++++++++++++++++++++++++--------------
 src/eio/eio.pod         |  758 +++++++++++++++++++--
 src/eio/libeio.m4       |   39 ++
 src/eio/xthread.h       |   41 +-
 11 files changed, 2197 insertions(+), 646 deletions(-)
 create mode 100644 src/eio/ecb.h

diff --git a/include/eio.h b/include/eio.h
index 3dd32676..bade4e77 100644
--- a/include/eio.h
+++ b/include/eio.h
@@ -1,7 +1,7 @@
 /*
  * libeio API header
  *
- * Copyright (c) 2007,2008,2009,2010 Marc Alexander Lehmann <libeio@schmorp.de>
+ * Copyright (c) 2007,2008,2009,2010,2011 Marc Alexander Lehmann <libeio@schmorp.de>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modifica-
@@ -45,17 +45,9 @@ extern "C" {
 #endif
 
 #include <stddef.h>
+#include <signal.h>
 #include <sys/types.h>
 
-#ifdef __OpenBSD__
-# include <inttypes.h>
-#endif
-
-#ifdef _WIN32
-# define uid_t int
-# define gid_t int
-#endif
-
 typedef struct eio_req    eio_req;
 typedef struct eio_dirent eio_dirent;
 
@@ -67,12 +59,33 @@ typedef int (*eio_cb)(eio_req *req);
 
 #ifndef EIO_STRUCT_STAT
 # ifdef _WIN32
-#   define EIO_STRUCT_STAT struct _stati64
+#  define EIO_STRUCT_STAT struct _stati64
+#  define EIO_STRUCT_STATI64
 # else
-#   define EIO_STRUCT_STAT struct stat
+#  define EIO_STRUCT_STAT struct stat
 # endif
 #endif
 
+#ifdef _WIN32
+  typedef int      eio_uid_t;
+  typedef int      eio_gid_t;
+  #ifdef __MINGW32__ /* no intptr_t */
+    typedef ssize_t  eio_ssize_t;
+  #else
+    typedef intptr_t eio_ssize_t; /* or SSIZE_T */
+  #endif
+  #if __GNUC__
+    typedef long long eio_ino_t;
+  #else
+    typedef __int64   eio_ino_t; /* unsigned not supported by msvc */
+  #endif
+#else
+  typedef uid_t    eio_uid_t;
+  typedef gid_t    eio_gid_t;
+  typedef ssize_t  eio_ssize_t;
+  typedef ino_t    eio_ino_t;
+#endif
+
 #ifndef EIO_STRUCT_STATVFS
 # define EIO_STRUCT_STATVFS struct statvfs
 #endif
@@ -119,7 +132,7 @@ struct eio_dirent
   unsigned short namelen; /* size of filename without trailing 0 */
   unsigned char type; /* one of EIO_DT_* */
   signed char score; /* internal use */
-  ino_t inode; /* the inode number, if available, otherwise unspecified */
+  eio_ino_t inode; /* the inode number, if available, otherwise unspecified */
 };
 
 /* eio_msync flags */
@@ -131,14 +144,12 @@ enum
 };
 
 /* eio_mtouch flags */
-
 enum
 {
   EIO_MT_MODIFY     = 1
 };
 
 /* eio_sync_file_range flags */
-
 enum
 {
   EIO_SYNC_FILE_RANGE_WAIT_BEFORE = 1,
@@ -146,10 +157,16 @@ enum
   EIO_SYNC_FILE_RANGE_WAIT_AFTER  = 4
 };
 
-typedef double eio_tstamp; /* feel free to use double in your code directly */
+/* eio_fallocate flags */
+enum
+{
+  EIO_FALLOC_FL_KEEP_SIZE = 1 /* MUST match the value in linux/falloc.h */
+};
+
+/* timestamps and differences - feel free to use double in your code directly */
+typedef double eio_tstamp;
 
 /* the eio request structure */
-
 enum
 {
   EIO_CUSTOM,
@@ -162,12 +179,12 @@ enum
   EIO_UTIME, EIO_FUTIME,
   EIO_CHMOD, EIO_FCHMOD,
   EIO_CHOWN, EIO_FCHOWN,
-  EIO_SYNC, EIO_FSYNC, EIO_FDATASYNC,
-  EIO_MSYNC, EIO_MTOUCH, EIO_SYNC_FILE_RANGE,
+  EIO_SYNC, EIO_FSYNC, EIO_FDATASYNC, EIO_SYNCFS,
+  EIO_MSYNC, EIO_MTOUCH, EIO_SYNC_FILE_RANGE, EIO_FALLOCATE,
   EIO_MLOCK, EIO_MLOCKALL,
   EIO_UNLINK, EIO_RMDIR, EIO_MKDIR, EIO_RENAME,
   EIO_MKNOD, EIO_READDIR,
-  EIO_LINK, EIO_SYMLINK, EIO_READLINK,
+  EIO_LINK, EIO_SYMLINK, EIO_READLINK, EIO_REALPATH,
   EIO_GROUP, EIO_NOP,
   EIO_BUSY
 };
@@ -176,7 +193,7 @@ enum
 enum
 {
   EIO_MCL_CURRENT = 1,
-  EIO_MCL_FUTURE  = 2
+  EIO_MCL_FUTURE  = 2,
 };
 
 /* request priorities */
@@ -184,7 +201,7 @@ enum
 enum {
   EIO_PRI_MIN     = -4,
   EIO_PRI_MAX     =  4,
-  EIO_PRI_DEFAULT =  0
+  EIO_PRI_DEFAULT =  0,
 };
 
 /* eio request structure */
@@ -194,9 +211,9 @@ struct eio_req
 {
   eio_req volatile *next; /* private ETP */
 
-  ssize_t result;  /* result of syscall, e.g. result = read (... */
-  off_t offs;      /* read, write, truncate, readahead, sync_file_range: file offset */
-  size_t size;     /* read, write, readahead, sendfile, msync, mlock, sync_file_range: length */
+  eio_ssize_t result;  /* result of syscall, e.g. result = read (... */
+  off_t offs;      /* read, write, truncate, readahead, sync_file_range, fallocate: file offset, mknod: dev_t */
+  size_t size;     /* read, write, readahead, sendfile, msync, mlock, sync_file_range, fallocate: length */
   void *ptr1;      /* all applicable requests: pathname, old name; readdir: optional eio_dirents */
   void *ptr2;      /* all applicable requests: new name or memory buffer; readdir: name strings */
   eio_tstamp nv1;  /* utime, futime: atime; busy: sleep time */
@@ -204,16 +221,22 @@ struct eio_req
 
   int type;        /* EIO_xxx constant ETP */
   int int1;        /* all applicable requests: file descriptor; sendfile: output fd; open, msync, mlockall, readdir: flags */
-  long int2;       /* chown, fchown: uid; sendfile: input fd; open, chmod, mkdir, mknod: file mode, sync_file_range: flags */
-  long int3;       /* chown, fchown: gid; mknod: dev_t */
+  long int2;       /* chown, fchown: uid; sendfile: input fd; open, chmod, mkdir, mknod: file mode, sync_file_range, fallocate: flags */
+  long int3;       /* chown, fchown: gid */
   int errorno;     /* errno value on syscall return */
 
+#if __i386 || __amd64
+  unsigned char cancelled;
+#else
+  sig_atomic_t cancelled;
+#endif
+
   unsigned char flags; /* private */
   signed char pri;     /* the priority */
 
   void *data;
   eio_cb finish;
-  void (*destroy)(eio_req *req); /* called when requets no longer needed */
+  void (*destroy)(eio_req *req); /* called when request no longer needed */
   void (*feed)(eio_req *req);    /* only used for group requests */
 
   EIO_REQ_MEMBERS
@@ -223,10 +246,9 @@ struct eio_req
 
 /* _private_ request flags */
 enum {
-  EIO_FLAG_CANCELLED = 0x01, /* request was cancelled */
-  EIO_FLAG_PTR1_FREE = 0x02, /* need to free(ptr1) */
-  EIO_FLAG_PTR2_FREE = 0x04, /* need to free(ptr2) */
-  EIO_FLAG_GROUPADD  = 0x08  /* some request was added to the group */
+  EIO_FLAG_PTR1_FREE = 0x01, /* need to free(ptr1) */
+  EIO_FLAG_PTR2_FREE = 0x02, /* need to free(ptr2) */
+  EIO_FLAG_GROUPADD  = 0x04  /* some request was added to the group */
 };
 
 /* undocumented/unsupported/private helper */
@@ -254,14 +276,15 @@ void eio_set_max_poll_reqs (unsigned int nreqs);
 void eio_set_min_parallel (unsigned int nthreads);
 void eio_set_max_parallel (unsigned int nthreads);
 void eio_set_max_idle     (unsigned int nthreads);
+void eio_set_idle_timeout (unsigned int seconds);
 
 unsigned int eio_nreqs    (void); /* number of requests in-flight */
 unsigned int eio_nready   (void); /* number of not-yet handled requests */
-unsigned int eio_npending (void); /* numbe rof finished but unhandled requests */
+unsigned int eio_npending (void); /* number of finished but unhandled requests */
 unsigned int eio_nthreads (void); /* number of worker threads in use currently */
 
 /*****************************************************************************/
-/* convinience wrappers */
+/* convenience wrappers */
 
 #ifndef EIO_NO_WRAPPERS
 eio_req *eio_nop       (int pri, eio_cb cb, void *data); /* does nothing except go through the whole process */
@@ -269,11 +292,13 @@ eio_req *eio_busy      (eio_tstamp delay, int pri, eio_cb cb, void *data); /* ti
 eio_req *eio_sync      (int pri, eio_cb cb, void *data);
 eio_req *eio_fsync     (int fd, int pri, eio_cb cb, void *data);
 eio_req *eio_fdatasync (int fd, int pri, eio_cb cb, void *data);
+eio_req *eio_syncfs    (int fd, int pri, eio_cb cb, void *data);
 eio_req *eio_msync     (void *addr, size_t length, int flags, int pri, eio_cb cb, void *data);
 eio_req *eio_mtouch    (void *addr, size_t length, int flags, int pri, eio_cb cb, void *data);
 eio_req *eio_mlock     (void *addr, size_t length, int pri, eio_cb cb, void *data);
 eio_req *eio_mlockall  (int flags, int pri, eio_cb cb, void *data);
 eio_req *eio_sync_file_range (int fd, off_t offset, size_t nbytes, unsigned int flags, int pri, eio_cb cb, void *data);
+eio_req *eio_fallocate (int fd, int mode, off_t offset, size_t len, int pri, eio_cb cb, void *data);
 eio_req *eio_close     (int fd, int pri, eio_cb cb, void *data);
 eio_req *eio_readahead (int fd, off_t offset, size_t length, int pri, eio_cb cb, void *data);
 eio_req *eio_read      (int fd, void *buf, size_t length, off_t offset, int pri, eio_cb cb, void *data);
@@ -283,19 +308,20 @@ eio_req *eio_fstatvfs  (int fd, int pri, eio_cb cb, void *data); /* stat buffer=
 eio_req *eio_futime    (int fd, eio_tstamp atime, eio_tstamp mtime, int pri, eio_cb cb, void *data);
 eio_req *eio_ftruncate (int fd, off_t offset, int pri, eio_cb cb, void *data);
 eio_req *eio_fchmod    (int fd, mode_t mode, int pri, eio_cb cb, void *data);
-eio_req *eio_fchown    (int fd, uid_t uid, gid_t gid, int pri, eio_cb cb, void *data);
+eio_req *eio_fchown    (int fd, eio_uid_t uid, eio_gid_t gid, int pri, eio_cb cb, void *data);
 eio_req *eio_dup2      (int fd, int fd2, int pri, eio_cb cb, void *data);
 eio_req *eio_sendfile  (int out_fd, int in_fd, off_t in_offset, size_t length, int pri, eio_cb cb, void *data);
 eio_req *eio_open      (const char *path, int flags, mode_t mode, int pri, eio_cb cb, void *data);
 eio_req *eio_utime     (const char *path, eio_tstamp atime, eio_tstamp mtime, int pri, eio_cb cb, void *data);
 eio_req *eio_truncate  (const char *path, off_t offset, int pri, eio_cb cb, void *data);
-eio_req *eio_chown     (const char *path, uid_t uid, gid_t gid, int pri, eio_cb cb, void *data);
+eio_req *eio_chown     (const char *path, eio_uid_t uid, eio_gid_t gid, int pri, eio_cb cb, void *data);
 eio_req *eio_chmod     (const char *path, mode_t mode, int pri, eio_cb cb, void *data);
 eio_req *eio_mkdir     (const char *path, mode_t mode, int pri, eio_cb cb, void *data);
 eio_req *eio_readdir   (const char *path, int flags, int pri, eio_cb cb, void *data); /* result=ptr2 allocated dynamically */
 eio_req *eio_rmdir     (const char *path, int pri, eio_cb cb, void *data);
 eio_req *eio_unlink    (const char *path, int pri, eio_cb cb, void *data);
 eio_req *eio_readlink  (const char *path, int pri, eio_cb cb, void *data); /* result=ptr2 allocated dynamically */
+eio_req *eio_realpath  (const char *path, int pri, eio_cb cb, void *data); /* result=ptr2 allocated dynamically */
 eio_req *eio_stat      (const char *path, int pri, eio_cb cb, void *data); /* stat buffer=ptr2 allocated dynamically */
 eio_req *eio_lstat     (const char *path, int pri, eio_cb cb, void *data); /* stat buffer=ptr2 allocated dynamically */
 eio_req *eio_statvfs   (const char *path, int pri, eio_cb cb, void *data); /* stat buffer=ptr2 allocated dynamically */
@@ -303,7 +329,7 @@ eio_req *eio_mknod     (const char *path, mode_t mode, dev_t dev, int pri, eio_c
 eio_req *eio_link      (const char *path, const char *new_path, int pri, eio_cb cb, void *data);
 eio_req *eio_symlink   (const char *path, const char *new_path, int pri, eio_cb cb, void *data);
 eio_req *eio_rename    (const char *path, const char *new_path, int pri, eio_cb cb, void *data);
-eio_req *eio_custom    (eio_cb execute, int pri, eio_cb cb, void *data);
+eio_req *eio_custom    (void (*execute)(eio_req *), int pri, eio_cb cb, void *data);
 #endif
 
 /*****************************************************************************/
@@ -319,7 +345,7 @@ void eio_grp_cancel    (eio_req *grp); /* cancels all sub requests but not the g
 /* request api */
 
 /* true if the request was cancelled, useful in the invoke callback */
-#define EIO_CANCELLED(req)   ((req)->flags & EIO_FLAG_CANCELLED)
+#define EIO_CANCELLED(req)   ((req)->cancelled)
 
 #define EIO_RESULT(req)      ((req)->result)
 /* returns a pointer to the result buffer allocated by eio */
@@ -332,21 +358,11 @@ void eio_grp_cancel    (eio_req *grp); /* cancels all sub requests but not the g
 void eio_submit (eio_req *req);
 /* cancel a request as soon fast as possible, if possible */
 void eio_cancel (eio_req *req);
-/* destroy a request that has never been submitted */
-void eio_destroy (eio_req *req);
 
 /*****************************************************************************/
-/* convinience functions */
+/* convenience functions */
 
-ssize_t eio_sendfile_sync (int ofd, int ifd, off_t offset, size_t count);
-
-/*****************************************************************************/
-/* export these so node_file can use these function instead of pread/write */
-
-#if !HAVE_PREADWRITE
-ssize_t eio__pread (int fd, void *buf, size_t count, off_t offset);
-ssize_t eio__pwrite (int fd, void *buf, size_t count, off_t offset);
-#endif
+eio_ssize_t eio_sendfile_sync (int ofd, int ifd, off_t offset, size_t count);
 
 #ifdef __cplusplus
 }
diff --git a/src/eio/Changes b/src/eio/Changes
index baa94eca..9d3e3231 100644
--- a/src/eio/Changes
+++ b/src/eio/Changes
@@ -1,9 +1,19 @@
 Revision history for libeio
 
 TODO: maybe add mincore support? available on at least darwin, solaris, linux, freebsd
-TODO: openbsd requites stdint.h for intptr_t - why posix?
+TODO: openbsd requires stdint.h for intptr_t - why posix?
 
+TODO: make mtouch/readdir maybe others cancellable in-request
+TODO: fadvise request
 1.0
+	- fix a deadlock where a wakeup signal could be missed when
+          a timeout occurred at the same time.
+	- use nonstandard but maybe-working-on-bsd fork technique.
+        - use fewer time() syscalls when waiting for new requests.
+        - fix a path-memory-leak in readdir when using the wrappers
+          (reported by Thomas L. Shinnick).
+	- support a max_idle value of 0.
+	- support setting of idle timeout value (eio_set_idle_timeout).
         - readdir: correctly handle malloc failures.
         - readdir: new flags argument, can return inode
           and possibly filetype, can sort in various ways.
@@ -32,4 +42,22 @@ TODO: openbsd requites stdint.h for intptr_t - why posix?
           utimes but not futimes.
         - use _POSIX_MEMLOCK_RANGE for mlock.
         - do not (errornously) overwrite CFLAGS in configure.ac.
+        - mknod used int3 for dev_t (32 bit), not offs (64 bit).
+        - fix memory corruption in eio_readdirx for the flags
+          combination EIO_READDIR_STAT_ORDER | EIO_READDIR_DIRS_FIRST.
+        - port to openbsd (another blatantly broken non-UNIX/POSIX platform).
+        - fix eio_custom prototype.
+        - work around a Linux (and likely FreeBSD and other kernels) bug
+          where sendfile would not transfer all the requested bytes on
+          large transfers, using a heuristic.
+        - use libecb, and apply lots of minor space optimisations.
+        - disable sendfile on darwin, broken as everything else.
+        - add realpath request and implementation.
+	- cancelled requests will still invoke their request callbacks.
+        - add fallocate.
+        - do not acquire any locks when forking.
+        - incorporated some mingw32 changes by traviscline.
+        - added syncfs support, using direct syscall.
+        - set thread name on linux (ps -L/Hcx, top, gdb).
+        - remove useless use of volatile variables.
 
diff --git a/src/eio/Makefile.am b/src/eio/Makefile.am
index 857d26b6..e9866c0d 100644
--- a/src/eio/Makefile.am
+++ b/src/eio/Makefile.am
@@ -10,6 +10,6 @@ include_HEADERS = eio.h
 
 lib_LTLIBRARIES = libeio.la
 
-libeio_la_SOURCES = eio.c xthread.h config.h
+libeio_la_SOURCES = eio.c ecb.h xthread.h config.h
 libeio_la_LDFLAGS = -version-info $(VERSION_INFO)
 
diff --git a/src/eio/autogen.sh b/src/eio/autogen.sh
index bd3387c4..8056ee7f 100755
--- a/src/eio/autogen.sh
+++ b/src/eio/autogen.sh
@@ -1,5 +1,3 @@
-libtoolize
-aclocal
-automake --add-missing
-autoconf
-autoheader
+#!/bin/sh
+
+autoreconf --install --symlink --force
diff --git a/src/eio/config_darwin.h b/src/eio/config_darwin.h
index 84a3440d..f4c4da97 100644
--- a/src/eio/config_darwin.h
+++ b/src/eio/config_darwin.h
@@ -4,9 +4,11 @@
 /* Define to 1 if you have the <dlfcn.h> header file. */
 #define HAVE_DLFCN_H 1
 
-/* fdatasync(2) is not available on 10.5 but is on 10.6 
- * How should we deal with this? */
-/* #define HAVE_FDATASYNC 0 */
+/* fallocate(2) is available */
+/* #undef HAVE_FALLOCATE */
+
+/* fdatasync(2) is available */
+#define HAVE_FDATASYNC 1
 
 /* futimes(2) is available */
 #define HAVE_FUTIMES 1
@@ -17,6 +19,15 @@
 /* Define to 1 if you have the <memory.h> header file. */
 #define HAVE_MEMORY_H 1
 
+/* posix_fadvise(2) is available */
+/* #undef HAVE_POSIX_FADVISE */
+
+/* posix_madvise(2) is available */
+#define HAVE_POSIX_MADVISE 1
+
+/* prctl(PR_SET_NAME) is available */
+/* #undef HAVE_PRCTL_SET_NAME */
+
 /* pread(2) and pwrite(2) are available */
 #define HAVE_PREADWRITE 1
 
@@ -41,15 +52,27 @@
 /* sync_file_range(2) is available */
 /* #undef HAVE_SYNC_FILE_RANGE */
 
+/* Define to 1 if you have the <sys/prctl.h> header file. */
+/* #undef HAVE_SYS_PRCTL_H */
+
 /* Define to 1 if you have the <sys/stat.h> header file. */
 #define HAVE_SYS_STAT_H 1
 
+/* syscall(__NR_syncfs) is available */
+/* #undef HAVE_SYS_SYNCFS */
+
+/* Define to 1 if you have the <sys/syscall.h> header file. */
+#define HAVE_SYS_SYSCALL_H 1
+
 /* Define to 1 if you have the <sys/types.h> header file. */
 #define HAVE_SYS_TYPES_H 1
 
 /* Define to 1 if you have the <unistd.h> header file. */
 #define HAVE_UNISTD_H 1
 
+/* utimes(2) is available */
+#define HAVE_UTIMES 1
+
 /* Define to the sub-directory in which libtool stores uninstalled libraries.
    */
 #define LT_OBJDIR ".libs/"
@@ -78,5 +101,37 @@
 /* Define to 1 if you have the ANSI C header files. */
 #define STDC_HEADERS 1
 
+/* Enable extensions on AIX 3, Interix.  */
+#ifndef _ALL_SOURCE
+# define _ALL_SOURCE 1
+#endif
+/* Enable GNU extensions on systems that have them.  */
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE 1
+#endif
+/* Enable threading extensions on Solaris.  */
+#ifndef _POSIX_PTHREAD_SEMANTICS
+# define _POSIX_PTHREAD_SEMANTICS 1
+#endif
+/* Enable extensions on HP NonStop.  */
+#ifndef _TANDEM_SOURCE
+# define _TANDEM_SOURCE 1
+#endif
+/* Enable general extensions on Solaris.  */
+#ifndef __EXTENSIONS__
+# define __EXTENSIONS__ 1
+#endif
+
+
 /* Version number of package */
 #define VERSION "1.0"
+
+/* Define to 1 if on MINIX. */
+/* #undef _MINIX */
+
+/* Define to 2 if the system does not provide POSIX.1 features except with
+   this defined. */
+/* #undef _POSIX_1_SOURCE */
+
+/* Define to 1 if you need to in order for `stat' and other things to work. */
+/* #undef _POSIX_SOURCE */
diff --git a/src/eio/configure.ac b/src/eio/configure.ac
index 9f4cea9d..9faffad5 100644
--- a/src/eio/configure.ac
+++ b/src/eio/configure.ac
@@ -5,17 +5,17 @@ AC_CONFIG_HEADERS([config.h])
 
 AM_INIT_AUTOMAKE(libeio,1.0)
 AM_MAINTAINER_MODE
+
+AC_GNU_SOURCE
+
 AC_PROG_LIBTOOL
 
 AC_PROG_CC
 
 if test "x$GCC" = xyes ; then
-  CFLAGS="$CFLAGS -O3"
+  CFLAGS="-O3 $CFLAGS"
 fi
 
-dnl somebody will forgive me
-CFLAGS="-D_GNU_SOURCE $CFLAGS"
-
 m4_include([libeio.m4])
 
 AC_CONFIG_FILES([Makefile])
diff --git a/src/eio/ecb.h b/src/eio/ecb.h
new file mode 100644
index 00000000..8383374a
--- /dev/null
+++ b/src/eio/ecb.h
@@ -0,0 +1,370 @@
+/*
+ * libecb - http://software.schmorp.de/pkg/libecb
+ *
+ * Copyright (©) 2009-2011 Marc Alexander Lehmann <libecb@schmorp.de>
+ * Copyright (©) 2011 Emanuele Giaquinta
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ *   1.  Redistributions of source code must retain the above copyright notice,
+ *       this list of conditions and the following disclaimer.
+ *
+ *   2.  Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ECB_H
+#define ECB_H
+
+#ifdef _WIN32
+  typedef   signed char   int8_t;
+  typedef unsigned char  uint8_t;
+  typedef   signed short  int16_t;
+  typedef unsigned short uint16_t;
+  typedef   signed int    int32_t;
+  typedef unsigned int   uint32_t;
+  #if __GNUC__
+    typedef   signed long long int64_t;
+    typedef unsigned long long uint64_t;
+  #else /* _MSC_VER || __BORLANDC__ */
+    typedef   signed __int64   int64_t;
+    typedef unsigned __int64   uint64_t;
+  #endif
+#else
+  #include <inttypes.h>
+#endif
+
+/* many compilers define __GNUC__ to some versions but then only implement
+ * what their idiot authors think are the "more important" extensions,
+ * causing enormous grief in return for some better fake benchmark numbers.
+ * or so.
+ * we try to detect these and simply assume they are not gcc - if they have
+ * an issue with that they should have done it right in the first place.
+ */
+#ifndef ECB_GCC_VERSION
+  #if !defined(__GNUC_MINOR__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) || defined(__llvm__) || defined(__clang__)
+    #define ECB_GCC_VERSION(major,minor) 0
+  #else
+    #define ECB_GCC_VERSION(major,minor) (__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
+  #endif
+#endif
+
+/*****************************************************************************/
+
+#ifndef ECB_MEMORY_FENCE
+  #if ECB_GCC_VERSION(2,5)
+    #if __i386 || __i386__ /* the macros gcc predefines on 32 bit x86 (__x86 is not one of them) */
+      #define ECB_MEMORY_FENCE         __asm__ __volatile__ ("lock; orb $0, -1(%%esp)" : : : "memory")
+      #define ECB_MEMORY_FENCE_ACQUIRE ECB_MEMORY_FENCE /* non-lock xchg might be enough */
+      #define ECB_MEMORY_FENCE_RELEASE __asm__ __volatile__ ("" : : : "memory") /* no cpu fence needed, but keep the compiler from reordering */
+    #elif __amd64 || __amd64__ || __x86_64 || __x86_64__
+      #define ECB_MEMORY_FENCE         __asm__ __volatile__ ("mfence" : : : "memory")
+      #define ECB_MEMORY_FENCE_ACQUIRE __asm__ __volatile__ ("lfence" : : : "memory")
+      #define ECB_MEMORY_FENCE_RELEASE __asm__ __volatile__ ("sfence" : : : "memory") /* play safe - the cpu fence is not needed in any current cpu, the compiler barrier is */
+    #endif
+  #endif
+#endif
+
+#ifndef ECB_MEMORY_FENCE
+  #if ECB_GCC_VERSION(4,4)
+    #define ECB_MEMORY_FENCE         __sync_synchronize ()
+    #define ECB_MEMORY_FENCE_ACQUIRE ({ char dummy = 0; __sync_lock_test_and_set (&dummy, 1); })
+    #define ECB_MEMORY_FENCE_RELEASE ({ char dummy = 1; __sync_lock_release      (&dummy   ); })
+  #elif _MSC_VER >= 1400 /* VC++ 2005 */
+    #pragma intrinsic(_ReadBarrier,_WriteBarrier,_ReadWriteBarrier)
+    #define ECB_MEMORY_FENCE         _ReadWriteBarrier ()
+    #define ECB_MEMORY_FENCE_ACQUIRE _ReadWriteBarrier () /* according to msdn, _ReadBarrier is not a load fence */
+    #define ECB_MEMORY_FENCE_RELEASE _WriteBarrier ()
+  #elif defined(_WIN32)
+    #include <WinNT.h>
+    #define ECB_MEMORY_FENCE         MemoryBarrier () /* actually just xchg on x86... scary */
+    #define ECB_MEMORY_FENCE_ACQUIRE ECB_MEMORY_FENCE
+    #define ECB_MEMORY_FENCE_RELEASE ECB_MEMORY_FENCE
+  #endif
+#endif
+
+#ifndef ECB_MEMORY_FENCE
+  /*
+   * if you get undefined symbol references to pthread_mutex_lock,
+   * or failure to find pthread.h, then you should implement
+   * the ECB_MEMORY_FENCE operations for your cpu/compiler
+   * OR provide pthread.h and link against the posix thread library
+   * of your system.
+   */
+  #include <pthread.h>
+
+  static pthread_mutex_t ecb_mf_lock = PTHREAD_MUTEX_INITIALIZER;
+  #define ECB_MEMORY_FENCE do { pthread_mutex_lock (&ecb_mf_lock); pthread_mutex_unlock (&ecb_mf_lock); } while (0)
+  #define ECB_MEMORY_FENCE_ACQUIRE ECB_MEMORY_FENCE
+  #define ECB_MEMORY_FENCE_RELEASE ECB_MEMORY_FENCE
+#endif
+
+/*****************************************************************************/
+
+#define ECB_C99 (__STDC_VERSION__ >= 199901L)
+
+#if __cplusplus
+  #define ecb_inline static inline
+#elif ECB_GCC_VERSION(2,5)
+  #define ecb_inline static __inline__
+#elif ECB_C99
+  #define ecb_inline static inline
+#else
+  #define ecb_inline static
+#endif
+
+#if ECB_GCC_VERSION(3,3)
+  #define ecb_restrict __restrict__
+#elif ECB_C99
+  #define ecb_restrict restrict
+#else
+  #define ecb_restrict
+#endif
+
+typedef int ecb_bool;
+
+#define ECB_CONCAT_(a, b) a ## b
+#define ECB_CONCAT(a, b) ECB_CONCAT_(a, b)
+#define ECB_STRINGIFY_(a) # a
+#define ECB_STRINGIFY(a) ECB_STRINGIFY_(a)
+
+#define ecb_function_ ecb_inline
+
+#if ECB_GCC_VERSION(3,1)
+  #define ecb_attribute(attrlist)        __attribute__(attrlist)
+  #define ecb_is_constant(expr)          __builtin_constant_p (expr)
+  #define ecb_expect(expr,value)         __builtin_expect ((expr),(value))
+  #define ecb_prefetch(addr,rw,locality) __builtin_prefetch (addr, rw, locality)
+#else
+  #define ecb_attribute(attrlist)
+  #define ecb_is_constant(expr)          0
+  #define ecb_expect(expr,value)         (expr)
+  #define ecb_prefetch(addr,rw,locality)
+#endif
+
+/* no emulation for ecb_decltype */
+#if ECB_GCC_VERSION(4,5)
+  #define ecb_decltype(x) __decltype(x)
+#elif ECB_GCC_VERSION(3,0)
+  #define ecb_decltype(x) __typeof(x)
+#endif
+
+#define ecb_noinline   ecb_attribute ((__noinline__))
+#define ecb_noreturn   ecb_attribute ((__noreturn__))
+#define ecb_unused     ecb_attribute ((__unused__))
+#define ecb_const      ecb_attribute ((__const__))
+#define ecb_pure       ecb_attribute ((__pure__))
+
+#if ECB_GCC_VERSION(4,3)
+  #define ecb_artificial ecb_attribute ((__artificial__))
+  #define ecb_hot        ecb_attribute ((__hot__))
+  #define ecb_cold       ecb_attribute ((__cold__))
+#else
+  #define ecb_artificial
+  #define ecb_hot
+  #define ecb_cold
+#endif
+
+/* put around conditional expressions if you are very sure that the  */
+/* expression is mostly true or mostly false. note that these return */
+/* booleans, not the expression.                                     */
+#define ecb_expect_false(expr) ecb_expect (!!(expr), 0)
+#define ecb_expect_true(expr)  ecb_expect (!!(expr), 1)
+/* for compatibility to the rest of the world */
+#define ecb_likely(expr)   ecb_expect_true  (expr)
+#define ecb_unlikely(expr) ecb_expect_false (expr)
+
+/* count trailing zero bits and count # of one bits */
+#if ECB_GCC_VERSION(3,4)
+  /* we assume int == 32 bit, long == 32 or 64 bit and long long == 64 bit */
+  #define ecb_ld32(x)      (__builtin_clz      (x) ^ 31)
+  #define ecb_ld64(x)      (__builtin_clzll    (x) ^ 63)
+  #define ecb_ctz32(x)      __builtin_ctz      (x)
+  #define ecb_ctz64(x)      __builtin_ctzll    (x)
+  #define ecb_popcount32(x) __builtin_popcount (x)
+  /* no popcountll */
+#else
+  ecb_function_ int ecb_ctz32 (uint32_t x) ecb_const;
+  ecb_function_ int
+  ecb_ctz32 (uint32_t x)
+  {
+    int r = 0;
+
+    x &= ~x + 1; /* this isolates the lowest bit */
+
+#if ECB_branchless_on_i386
+    r += !!(x & 0xaaaaaaaa) << 0;
+    r += !!(x & 0xcccccccc) << 1;
+    r += !!(x & 0xf0f0f0f0) << 2;
+    r += !!(x & 0xff00ff00) << 3;
+    r += !!(x & 0xffff0000) << 4;
+#else
+    if (x & 0xaaaaaaaa) r +=  1;
+    if (x & 0xcccccccc) r +=  2;
+    if (x & 0xf0f0f0f0) r +=  4;
+    if (x & 0xff00ff00) r +=  8;
+    if (x & 0xffff0000) r += 16;
+#endif
+
+    return r;
+  }
+
+  ecb_function_ int ecb_ctz64 (uint64_t x) ecb_const;
+  ecb_function_ int
+  ecb_ctz64 (uint64_t x)
+  {
+    int shift = x & 0xffffffffU ? 0 : 32;
+    return ecb_ctz32 (x >> shift) + shift;
+  }
+
+  ecb_function_ int ecb_popcount32 (uint32_t x) ecb_const;
+  ecb_function_ int
+  ecb_popcount32 (uint32_t x)
+  {
+    x -=  (x >> 1) & 0x55555555;
+    x  = ((x >> 2) & 0x33333333) + (x & 0x33333333);
+    x  = ((x >> 4) + x) & 0x0f0f0f0f;
+    x *= 0x01010101;
+
+    return x >> 24;
+  }
+
+  /* you have the choice between something with a table lookup, */
+  /* something using lots of bit arithmetic and a simple loop */
+  /* we went for the loop */
+  ecb_function_ int ecb_ld32 (uint32_t x) ecb_const;
+  ecb_function_ int ecb_ld32 (uint32_t x)
+  {
+    int r = 0;
+
+    if (x >> 16) { x >>= 16; r += 16; }
+    if (x >>  8) { x >>=  8; r +=  8; }
+    if (x >>  4) { x >>=  4; r +=  4; }
+    if (x >>  2) { x >>=  2; r +=  2; }
+    if (x >>  1) {           r +=  1; }
+
+    return r;
+  }
+
+  ecb_function_ int ecb_ld64 (uint64_t x) ecb_const;
+  ecb_function_ int ecb_ld64 (uint64_t x)
+  {
+    int r = 0;
+
+    if (x >> 32) { x >>= 32; r += 32; }
+
+    return r + ecb_ld32 (x);
+  }
+#endif
+
+/* popcount64 is only available on 64 bit cpus as gcc builtin */
+/* so for this version we are lazy */
+ecb_function_ int ecb_popcount64 (uint64_t x) ecb_const;
+ecb_function_ int
+ecb_popcount64 (uint64_t x)
+{
+  return ecb_popcount32 (x) + ecb_popcount32 (x >> 32);
+}
+
+ecb_inline uint8_t  ecb_rotl8  (uint8_t  x, unsigned int count) ecb_const;
+ecb_inline uint8_t  ecb_rotr8  (uint8_t  x, unsigned int count) ecb_const;
+ecb_inline uint16_t ecb_rotl16 (uint16_t x, unsigned int count) ecb_const;
+ecb_inline uint16_t ecb_rotr16 (uint16_t x, unsigned int count) ecb_const;
+ecb_inline uint32_t ecb_rotl32 (uint32_t x, unsigned int count) ecb_const;
+ecb_inline uint32_t ecb_rotr32 (uint32_t x, unsigned int count) ecb_const;
+ecb_inline uint64_t ecb_rotl64 (uint64_t x, unsigned int count) ecb_const;
+ecb_inline uint64_t ecb_rotr64 (uint64_t x, unsigned int count) ecb_const;
+/* rotate left/right; the count is masked to the type width so that count == 0 never shifts by the full width (undefined in C) */
+ecb_inline uint8_t  ecb_rotl8  (uint8_t  x, unsigned int count) { return (x >> (-count &  7)) | (x << (count &  7)); }
+ecb_inline uint8_t  ecb_rotr8  (uint8_t  x, unsigned int count) { return (x << (-count &  7)) | (x >> (count &  7)); }
+ecb_inline uint16_t ecb_rotl16 (uint16_t x, unsigned int count) { return (x >> (-count & 15)) | (x << (count & 15)); }
+ecb_inline uint16_t ecb_rotr16 (uint16_t x, unsigned int count) { return (x << (-count & 15)) | (x >> (count & 15)); }
+ecb_inline uint32_t ecb_rotl32 (uint32_t x, unsigned int count) { return (x >> (-count & 31)) | (x << (count & 31)); }
+ecb_inline uint32_t ecb_rotr32 (uint32_t x, unsigned int count) { return (x << (-count & 31)) | (x >> (count & 31)); }
+ecb_inline uint64_t ecb_rotl64 (uint64_t x, unsigned int count) { return (x >> (-count & 63)) | (x << (count & 63)); }
+ecb_inline uint64_t ecb_rotr64 (uint64_t x, unsigned int count) { return (x << (-count & 63)) | (x >> (count & 63)); }
+
+#if ECB_GCC_VERSION(4,3)
+  #define ecb_bswap16(x) (__builtin_bswap32 (x) >> 16)
+  #define ecb_bswap32(x)  __builtin_bswap32 (x)
+  #define ecb_bswap64(x)  __builtin_bswap64 (x)
+#else
+  ecb_function_ uint16_t ecb_bswap16 (uint16_t x) ecb_const;
+  ecb_function_ uint16_t
+  ecb_bswap16 (uint16_t x)
+  {
+    return ecb_rotl16 (x, 8);
+  }
+
+  ecb_function_ uint32_t ecb_bswap32 (uint32_t x) ecb_const;
+  ecb_function_ uint32_t
+  ecb_bswap32 (uint32_t x)
+  {
+    return (((uint32_t)ecb_bswap16 (x)) << 16) | ecb_bswap16 (x >> 16);
+  }
+
+  ecb_function_ uint64_t ecb_bswap64 (uint64_t x) ecb_const;
+  ecb_function_ uint64_t
+  ecb_bswap64 (uint64_t x)
+  {
+    return (((uint64_t)ecb_bswap32 (x)) << 32) | ecb_bswap32 (x >> 32);
+  }
+#endif
+
+#if ECB_GCC_VERSION(4,5)
+  #define ecb_unreachable() __builtin_unreachable ()
+#else
+  /* this seems to work fine, but gcc always emits a warning for it :/ */
+  ecb_function_ void ecb_unreachable (void) ecb_noreturn;
+  ecb_function_ void ecb_unreachable (void) { }
+#endif
+
+/* try to tell the compiler that some condition is definitely true */
+#define ecb_assume(cond) do { if (!(cond)) ecb_unreachable (); } while (0)
+
+ecb_function_ unsigned char ecb_byteorder_helper (void) ecb_const;
+ecb_function_ unsigned char
+ecb_byteorder_helper (void)
+{
+  const uint32_t u = 0x11223344;
+  return *(unsigned char *)&u;
+}
+
+ecb_function_ ecb_bool ecb_big_endian    (void) ecb_const;
+ecb_function_ ecb_bool ecb_big_endian    (void) { return ecb_byteorder_helper () == 0x11; }
+ecb_function_ ecb_bool ecb_little_endian (void) ecb_const;
+ecb_function_ ecb_bool ecb_little_endian (void) { return ecb_byteorder_helper () == 0x44; }
+
+#if ECB_GCC_VERSION(3,0) || ECB_C99
+  #define ecb_mod(m,n) ((m) % (n) + ((m) % (n) < 0 ? (n) : 0))
+#else
+  #define ecb_mod(m,n) ((m) < 0 ? ((n) - 1 - ((-1 - (m)) % (n))) : ((m) % (n)))
+#endif
+
+#if ecb_cplusplus_does_not_suck
+  /* does not work for local types (http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2657.htm) */
+  template<typename T, int N>
+  static inline int ecb_array_length (const T (&arr)[N])
+  {
+    return N;
+  }
+#else
+  #define ecb_array_length(name) (sizeof (name) / sizeof (name [0]))
+#endif
+
+#endif
+
diff --git a/src/eio/eio.c b/src/eio/eio.c
index 9464e62c..fe027132 100644
--- a/src/eio/eio.c
+++ b/src/eio/eio.c
@@ -1,7 +1,7 @@
 /*
  * libeio implementation
  *
- * Copyright (c) 2007,2008,2009,2010 Marc Alexander Lehmann <libeio@schmorp.de>
+ * Copyright (c) 2007,2008,2009,2010,2011 Marc Alexander Lehmann <libeio@schmorp.de>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modifica-
@@ -37,16 +37,22 @@
  * either the BSD or the GPL.
  */
 
+#ifdef EIO_CONFIG_H
+# include EIO_CONFIG_H
+#endif
+
+/*  Undone by libuv for easy build scripts.
+ * #ifndef _WIN32
+ * # include "config.h"
+ * #endif
+ */
+
 #include "eio.h"
+#include "ecb.h"
 
 #ifdef EIO_STACKSIZE
 # define X_STACKSIZE EIO_STACKSIZE
 #endif
-
-// For statically-linked pthreads-w32, use:
-// #ifdef _WIN32
-// # define PTW32_STATIC_LIB 1
-// #endif
 #include "xthread.h"
 
 #include <errno.h>
@@ -60,10 +66,25 @@
 #include <fcntl.h>
 #include <assert.h>
 
-#ifndef _WIN32
-#include <sys/statvfs.h>
+/* intptr_t comes from unistd.h, says POSIX/UNIX/tradition */
+/* intptr_t only comes from stdint.h, says idiot openbsd coder */
+#if HAVE_STDINT_H
+# include <stdint.h>
 #endif
 
+#ifndef ECANCELED
+# define ECANCELED EDOM
+#endif
+#ifndef ELOOP
+# define ELOOP EDOM
+#endif
+
+#if !defined(ENOTSOCK) && defined(WSAENOTSOCK)
+# define ENOTSOCK WSAENOTSOCK
+#endif
+
+static void eio_destroy (eio_req *req);
+
 #ifndef EIO_FINISH
 # define EIO_FINISH(req)  ((req)->finish) && !EIO_CANCELLED (req) ? (req)->finish (req) : 0
 #endif
@@ -76,65 +97,158 @@
 # define EIO_FEED(req)    do { if ((req)->feed   ) (req)->feed    (req); } while (0)
 #endif
 
+#ifndef EIO_FD_TO_WIN32_HANDLE
+# define EIO_FD_TO_WIN32_HANDLE(fd) _get_osfhandle (fd)
+#endif
+#ifndef EIO_WIN32_HANDLE_TO_FD
+# define EIO_WIN32_HANDLE_TO_FD(handle) _open_osfhandle (handle, 0)
+#endif
+
+#define EIO_ERRNO(errval,retval) ((errno = (errval)), (retval)) /* set errno to errval, evaluate to retval */
+
+#define EIO_ENOSYS() EIO_ERRNO (ENOSYS, -1) /* "not implemented": errno = ENOSYS, value -1 */
+
 #ifdef _WIN32
 
-# include <errno.h>
-# include <sys/time.h>
-# include <unistd.h>
-# include <utime.h>
-# include <signal.h>
-# include <dirent.h>
-# include <windows.h>
+  #undef PAGESIZE
+  #define PAGESIZE 4096 /* GetSystemInfo? */
 
-# define ENOTSOCK WSAENOTSOCK
-# define EOPNOTSUPP WSAEOPNOTSUPP
-# define ECANCELED 140
+  /* TODO: look at how perl does stat (non-sloppy), unlink (ro-files), utime, link */
 
-# ifndef EIO_STRUCT_DIRENT
-#  define EIO_STRUCT_DIRENT struct dirent
-# endif
+  #ifdef EIO_STRUCT_STATI64
+    /* look at perl's non-sloppy stat */
+    #define stat(path,buf)       _stati64 (path,buf)
+    #define fstat(fd,buf)        _fstati64 (fd,buf)
+  #endif
+  #define lstat(path,buf)      stat (path,buf)
+  #define fsync(fd)            (FlushFileBuffers ((HANDLE)EIO_FD_TO_WIN32_HANDLE (fd)) ? 0 : EIO_ERRNO (EBADF, -1))
+  #define mkdir(path,mode)     _mkdir (path)
+  #define link(old,neu)        (CreateHardLink (neu, old, 0) ? 0 : EIO_ERRNO (ENOENT, -1))
+
+  #define chmod(path,mode)     _chmod (path, mode)
+  #define dup(fd)              _dup (fd)
+  #define dup2(fd1,fd2)        _dup2 (fd1, fd2)
+
+  #define fchmod(fd,mode)      EIO_ENOSYS ()
+  #define chown(path,uid,gid)  EIO_ENOSYS ()
+  #define fchown(fd,uid,gid)   EIO_ENOSYS ()
+  #define truncate(path,offs)  EIO_ENOSYS () /* far-miss: SetEndOfFile */
+  #define ftruncate(fd,offs)   EIO_ENOSYS () /* near-miss: SetEndOfFile */
+  #define mknod(path,mode,dev) EIO_ENOSYS ()
+  #define sync()               EIO_ENOSYS ()
+  #define readlink(path,buf,s) EIO_ENOSYS ()
+  #define statvfs(path,buf)    EIO_ENOSYS ()
+  #define fstatvfs(fd,buf)     EIO_ENOSYS ()
+
+  /* rename() uses MoveFile, which fails to overwrite */
+  #define rename(old,neu)      eio__rename (old, neu)
+
+  static int
+  eio__rename (const char *old, const char *neu)
+  { /* rename () replacement: returns 0 on success, -1 with errno set on failure */
+    if (MoveFileEx (old, neu, MOVEFILE_REPLACE_EXISTING)) /* requests replacement of an existing target */
+      return 0;
+
+    /* should steal _dosmaperr */
+    switch (GetLastError ()) /* reduce the win32 error code to one of two errno values */
+      {
+        case ERROR_FILE_NOT_FOUND:
+        case ERROR_PATH_NOT_FOUND:
+        case ERROR_INVALID_DRIVE:
+        case ERROR_NO_MORE_FILES:
+        case ERROR_BAD_NETPATH:
+        case ERROR_BAD_NET_NAME:
+        case ERROR_BAD_PATHNAME:
+        case ERROR_FILENAME_EXCED_RANGE:
+          errno = ENOENT; /* all "name could not be resolved" style codes */
+          break;
+
+        default: /* every other failure is reported as EACCES */
+          errno = EACCES;
+          break;
+      }
+
+    return -1;
+  }
+
+  /* symlink () emulation; we could even stat and see if it exists */
+  static int
+  symlink (const char *old, const char *neu)
+  { /* returns 0 when either CreateSymbolicLink attempt succeeds, otherwise -1 with errno = ENOENT */
+    #if WINVER >= 0x0600
+      if (CreateSymbolicLink (neu, old, 1)) /* first attempt with flags = 1 (presumably SYMBOLIC_LINK_FLAG_DIRECTORY - confirm) */
+        return 0;
+
+      if (CreateSymbolicLink (neu, old, 0)) /* second attempt with flags = 0 */
+        return 0;
+    #endif
+
+    return EIO_ERRNO (ENOENT, -1); /* also the only path when WINVER < 0x0600 */
+  }
+
+  /* POSIX API only: from this point on both names expand to the constant 0 (their "failed" result) */
+  #define CreateHardLink(neu,old,flags) 0
+  #define CreateSymbolicLink(neu,old,flags) 0
+
+  struct statvfs /* placeholder type for win32; statvfs ()/fstatvfs () are EIO_ENOSYS () macros in this branch */
+  {
+    int dummy; /* single unused member */
+  };
+
+  #define DT_DIR EIO_DT_DIR
+  #define DT_REG EIO_DT_REG
+  #define D_NAME(entp) entp.cFileName
+  #define D_TYPE(entp) (entp.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY ? DT_DIR : DT_REG)
 
 #else
 
-# ifdef EIO_CONFIG_H
-#  include EIO_CONFIG_H
-# else
-#  include "config.h"
-# endif
+  #include <sys/time.h>
+  #include <sys/select.h>
+  #include <sys/statvfs.h>
+  #include <unistd.h>
+  #include <signal.h>
+  #include <dirent.h>
 
-# include <sys/time.h>
-# include <sys/select.h>
-# include <unistd.h>
+  #if _POSIX_MEMLOCK || _POSIX_MEMLOCK_RANGE || _POSIX_MAPPED_FILES
+    #include <sys/mman.h>
+  #endif
+
+  #define D_NAME(entp) entp->d_name
+
+  /* POSIX_SOURCE is useless on bsd's, and XOPEN_SOURCE is unreliable there, too */
+  #if __FreeBSD__ || defined __NetBSD__ || defined __OpenBSD__
+    #define _DIRENT_HAVE_D_TYPE /* sigh */
+    #define D_INO(de) (de)->d_fileno
+    #define D_NAMLEN(de) (de)->d_namlen
+  #elif __linux || defined d_ino || _XOPEN_SOURCE >= 600
+    #define D_INO(de) (de)->d_ino
+  #endif
+
+  #ifdef _D_EXACT_NAMLEN
+    #undef D_NAMLEN
+    #define D_NAMLEN(de) _D_EXACT_NAMLEN (de)
+  #endif
+
+  #ifdef _DIRENT_HAVE_D_TYPE
+    #define D_TYPE(de) (de)->d_type
+  #endif
+
+  #ifndef EIO_STRUCT_DIRENT
+    #define EIO_STRUCT_DIRENT struct dirent
+  #endif
+
+#endif
+
+#if HAVE_UTIMES
 # include <utime.h>
-# include <signal.h>
-# include <dirent.h>
-
-#if _POSIX_MEMLOCK || _POSIX_MEMLOCK_RANGE || _POSIX_MAPPED_FILES
-# include <sys/mman.h>
 #endif
 
-/* POSIX_SOURCE is useless on bsd's, and XOPEN_SOURCE is unreliable there, too */
-# if __FreeBSD__ || defined __NetBSD__ || defined __OpenBSD__
-#  define _DIRENT_HAVE_D_TYPE /* sigh */
-#  define D_INO(de) (de)->d_fileno
-#  define D_NAMLEN(de) (de)->d_namlen
-# elif __linux || defined d_ino || _XOPEN_SOURCE >= 600
-#  define D_INO(de) (de)->d_ino
-# endif
-
-#ifdef _D_EXACT_NAMLEN
-# undef D_NAMLEN
-# define D_NAMLEN(de) _D_EXACT_NAMLEN (de)
+#if HAVE_SYS_SYSCALL_H
+# include <sys/syscall.h>
 #endif
 
-# ifdef _DIRENT_HAVE_D_TYPE
-#  define D_TYPE(de) (de)->d_type
-# endif
-
-# ifndef EIO_STRUCT_DIRENT
-#  define EIO_STRUCT_DIRENT struct dirent
-# endif
-
+#if HAVE_SYS_PRCTL_H
+# include <sys/prctl.h>
 #endif
 
 #if HAVE_SENDFILE
@@ -159,12 +273,9 @@
 # define D_INO(de) 0
 #endif
 #ifndef D_NAMLEN
-# define D_NAMLEN(de) strlen ((de)->d_name)
+# define D_NAMLEN(entp) strlen (D_NAME (entp))
 #endif
 
-/* number of seconds after which an idle threads exit */
-#define IDLE_TIMEOUT 10
-
 /* used for struct dirent, AIX doesn't provide it */
 #ifndef NAME_MAX
 # define NAME_MAX 4096
@@ -179,29 +290,16 @@
 #define EIO_BUFSIZE 65536
 
 #define dBUF	 				\
-  char *eio_buf;				\
-  ETP_WORKER_LOCK (self);			\
-  self->dbuf = eio_buf = malloc (EIO_BUFSIZE);	\
-  ETP_WORKER_UNLOCK (self);			\
+  char *eio_buf = malloc (EIO_BUFSIZE);		\
   errno = ENOMEM;				\
   if (!eio_buf)					\
-    return -1;
+    return -1
+
+#define FUBd					\
+  free (eio_buf)
 
 #define EIO_TICKS ((1000000 + 1023) >> 10)
 
-/*****************************************************************************/
-
-#if __GNUC__ >= 3
-# define expect(expr,value) __builtin_expect ((expr),(value))
-#else
-# define expect(expr,value) (expr)
-#endif
-
-#define expect_false(expr) expect ((expr) != 0, 0)
-#define expect_true(expr)  expect ((expr) != 0, 1)
-
-/*****************************************************************************/
-
 #define ETP_PRI_MIN EIO_PRI_MIN
 #define ETP_PRI_MAX EIO_PRI_MAX
 
@@ -214,29 +312,13 @@ static int eio_finish (eio_req *req);
 static void eio_execute (struct etp_worker *self, eio_req *req);
 #define ETP_EXECUTE(wrk,req) eio_execute (wrk,req)
 
-#define ETP_WORKER_CLEAR(req)	\
-  if (wrk->dbuf)		\
-    {				\
-      free (wrk->dbuf);		\
-      wrk->dbuf = 0;		\
-    }				\
-				\
-  if (wrk->dirp)		\
-    {				\
-      closedir (wrk->dirp);	\
-      wrk->dirp = 0;		\
-    }
-
-#define ETP_WORKER_COMMON \
-  void *dbuf;	\
-  DIR *dirp;
-
 /*****************************************************************************/
 
 #define ETP_NUM_PRI (ETP_PRI_MAX - ETP_PRI_MIN + 1)
 
 /* calculate time difference in ~1/EIO_TICKS of a second */
-static int tvdiff (struct timeval *tv1, struct timeval *tv2)
+ecb_inline int
+tvdiff (struct timeval *tv1, struct timeval *tv2)
 {
   return  (tv2->tv_sec  - tv1->tv_sec ) * EIO_TICKS
        + ((tv2->tv_usec - tv1->tv_usec) >> 10);
@@ -250,19 +332,16 @@ static void (*done_poll_cb) (void);
 static unsigned int max_poll_time;     /* reslock */
 static unsigned int max_poll_reqs;     /* reslock */
 
-static volatile unsigned int nreqs;    /* reqlock */
-static volatile unsigned int nready;   /* reqlock */
-static volatile unsigned int npending; /* reqlock */
-static volatile unsigned int max_idle = 4;
+static unsigned int nreqs;    /* reqlock */
+static unsigned int nready;   /* reqlock */
+static unsigned int npending; /* reqlock */
+static unsigned int max_idle = 4;      /* maximum number of threads that can idle indefinitely */
+static unsigned int idle_timeout = 10; /* number of seconds after which an idle thread exits */
 
-static xmutex_t wrklock = X_MUTEX_INIT;
-static xmutex_t reslock = X_MUTEX_INIT;
-static xmutex_t reqlock = X_MUTEX_INIT;
-static xcond_t  reqwait = X_COND_INIT;
-
-#if defined (__APPLE__)
-static xmutex_t apple_bug_writelock = X_MUTEX_INIT;
-#endif
+static xmutex_t wrklock;
+static xmutex_t reslock;
+static xmutex_t reqlock;
+static xcond_t  reqwait;
 
 #if !HAVE_PREADWRITE
 /*
@@ -270,7 +349,7 @@ static xmutex_t apple_bug_writelock = X_MUTEX_INIT;
  * normal read/write by using a mutex. slows down execution a lot,
  * but that's your problem, not mine.
  */
-static xmutex_t preadwritelock = X_MUTEX_INIT;
+static xmutex_t preadwritelock;
 #endif
 
 typedef struct etp_worker
@@ -283,22 +362,25 @@ typedef struct etp_worker
   /* locked by reslock, reqlock or wrklock */
   ETP_REQ *req; /* currently processed request */
 
+#ifdef ETP_WORKER_COMMON
   ETP_WORKER_COMMON
+#endif
 } etp_worker;
 
-static etp_worker wrk_first = { &wrk_first, &wrk_first, 0 }; /* NOT etp */
+static etp_worker wrk_first; /* NOT etp */
 
 #define ETP_WORKER_LOCK(wrk)   X_LOCK   (wrklock)
 #define ETP_WORKER_UNLOCK(wrk) X_UNLOCK (wrklock)
 
 /* worker threads management */
 
-static void etp_worker_clear (etp_worker *wrk)
+static void ecb_cold
+etp_worker_clear (etp_worker *wrk)
 {
-  ETP_WORKER_CLEAR (wrk);
 }
 
-static void etp_worker_free (etp_worker *wrk)
+static void ecb_cold
+etp_worker_free (etp_worker *wrk)
 {
   wrk->next->prev = wrk->prev;
   wrk->prev->next = wrk->next;
@@ -306,7 +388,8 @@ static void etp_worker_free (etp_worker *wrk)
   free (wrk);
 }
 
-static unsigned int etp_nreqs (void)
+static unsigned int
+etp_nreqs (void)
 {
   int retval;
   if (WORDACCESS_UNSAFE) X_LOCK   (reqlock);
@@ -315,7 +398,8 @@ static unsigned int etp_nreqs (void)
   return retval;
 }
 
-static unsigned int etp_nready (void)
+static unsigned int
+etp_nready (void)
 {
   unsigned int retval;
 
@@ -326,7 +410,8 @@ static unsigned int etp_nready (void)
   return retval;
 }
 
-static unsigned int etp_npending (void)
+static unsigned int
+etp_npending (void)
 {
   unsigned int retval;
 
@@ -337,7 +422,8 @@ static unsigned int etp_npending (void)
   return retval;
 }
 
-static unsigned int etp_nthreads (void)
+static unsigned int
+etp_nthreads (void)
 {
   unsigned int retval;
 
@@ -361,7 +447,19 @@ typedef struct {
 static etp_reqq req_queue;
 static etp_reqq res_queue;
 
-static int reqq_push (etp_reqq *q, ETP_REQ *req)
+static void ecb_noinline ecb_cold
+reqq_init (etp_reqq *q)
+{ /* reset q to the empty state */
+  int pri;
+
+  for (pri = 0; pri < ETP_NUM_PRI; ++pri) /* one qs/qe slot pair per priority level */
+    q->qs[pri] = q->qe[pri] = 0;
+
+  q->size = 0; /* no requests queued */
+}
+
+static int ecb_noinline
+reqq_push (etp_reqq *q, ETP_REQ *req)
 {
   int pri = req->pri;
   req->next = 0;
@@ -377,7 +475,8 @@ static int reqq_push (etp_reqq *q, ETP_REQ *req)
   return q->size++;
 }
 
-static ETP_REQ *reqq_shift (etp_reqq *q)
+static ETP_REQ * ecb_noinline
+reqq_shift (etp_reqq *q)
 {
   int pri;
 
@@ -402,46 +501,19 @@ static ETP_REQ *reqq_shift (etp_reqq *q)
   abort ();
 }
 
-static void etp_atfork_prepare (void)
+static int ecb_cold
+etp_init (void (*want_poll)(void), void (*done_poll)(void))
 {
-  X_LOCK (wrklock);
-  X_LOCK (reqlock);
-  X_LOCK (reslock);
-#if !HAVE_PREADWRITE
-  X_LOCK (preadwritelock);
-#endif
-}
+  X_MUTEX_CREATE (wrklock);
+  X_MUTEX_CREATE (reslock);
+  X_MUTEX_CREATE (reqlock);
+  X_COND_CREATE  (reqwait);
 
-static void etp_atfork_parent (void)
-{
-#if !HAVE_PREADWRITE
-  X_UNLOCK (preadwritelock);
-#endif
-  X_UNLOCK (reslock);
-  X_UNLOCK (reqlock);
-  X_UNLOCK (wrklock);
-}
+  reqq_init (&req_queue);
+  reqq_init (&res_queue);
 
-static void etp_atfork_child (void)
-{
-  ETP_REQ *prv;
-
-  while ((prv = reqq_shift (&req_queue)))
-    ETP_DESTROY (prv);
-
-  while ((prv = reqq_shift (&res_queue)))
-    ETP_DESTROY (prv);
-
-  while (wrk_first.next != &wrk_first)
-    {
-      etp_worker *wrk = wrk_first.next;
-
-      if (wrk->req)
-        ETP_DESTROY (wrk->req);
-
-      etp_worker_clear (wrk);
-      etp_worker_free (wrk);
-    }
+  wrk_first.next =
+  wrk_first.prev = &wrk_first;
 
   started  = 0;
   idle     = 0;
@@ -449,22 +521,6 @@ static void etp_atfork_child (void)
   nready   = 0;
   npending = 0;
 
-  etp_atfork_parent ();
-}
-
-static void
-etp_once_init (void)
-{    
-  X_THREAD_ATFORK (etp_atfork_prepare, etp_atfork_parent, etp_atfork_child);
-}
-
-static int
-etp_init (void (*want_poll)(void), void (*done_poll)(void))
-{
-  static pthread_once_t doinit = PTHREAD_ONCE_INIT;
-
-  pthread_once (&doinit, etp_once_init);
-
   want_poll_cb = want_poll;
   done_poll_cb = done_poll;
 
@@ -473,7 +529,8 @@ etp_init (void (*want_poll)(void), void (*done_poll)(void))
 
 X_THREAD_PROC (etp_proc);
 
-static void etp_start_thread (void)
+static void ecb_cold
+etp_start_thread (void)
 {
   etp_worker *wrk = calloc (1, sizeof (etp_worker));
 
@@ -496,19 +553,21 @@ static void etp_start_thread (void)
   X_UNLOCK (wrklock);
 }
 
-static void etp_maybe_start_thread (void)
+static void
+etp_maybe_start_thread (void)
 {
-  if (expect_true (etp_nthreads () >= wanted))
+  if (ecb_expect_true (etp_nthreads () >= wanted))
     return;
   
   /* todo: maybe use idle here, but might be less exact */
-  if (expect_true (0 <= (int)etp_nthreads () + (int)etp_npending () - (int)etp_nreqs ()))
+  if (ecb_expect_true (0 <= (int)etp_nthreads () + (int)etp_npending () - (int)etp_nreqs ()))
     return;
 
   etp_start_thread ();
 }
 
-static void etp_end_thread (void)
+static void ecb_cold
+etp_end_thread (void)
 {
   eio_req *req = calloc (1, sizeof (eio_req));
 
@@ -525,7 +584,8 @@ static void etp_end_thread (void)
   X_UNLOCK (wrklock);
 }
 
-static int etp_poll (void)
+static int
+etp_poll (void)
 {
   unsigned int maxreqs;
   unsigned int maxtime;
@@ -565,7 +625,7 @@ static int etp_poll (void)
       --nreqs;
       X_UNLOCK (reqlock);
 
-      if (expect_false (req->type == EIO_GROUP && req->size))
+      if (ecb_expect_false (req->type == EIO_GROUP && req->size))
         {
           req->int1 = 1; /* mark request as delayed */
           continue;
@@ -573,11 +633,11 @@ static int etp_poll (void)
       else
         {
           int res = ETP_FINISH (req);
-          if (expect_false (res))
+          if (ecb_expect_false (res))
             return res;
         }
 
-      if (expect_false (maxreqs && !--maxreqs))
+      if (ecb_expect_false (maxreqs && !--maxreqs))
         break;
 
       if (maxtime)
@@ -593,23 +653,23 @@ static int etp_poll (void)
   return -1;
 }
 
-static void etp_cancel (ETP_REQ *req)
+static void
+etp_cancel (ETP_REQ *req)
 {
-  X_LOCK   (wrklock);
-  req->flags |= EIO_FLAG_CANCELLED;
-  X_UNLOCK (wrklock);
+  req->cancelled = 1;
 
   eio_grp_cancel (req);
 }
 
-static void etp_submit (ETP_REQ *req)
+static void
+etp_submit (ETP_REQ *req)
 {
   req->pri -= ETP_PRI_MIN;
 
-  if (expect_false (req->pri < ETP_PRI_MIN - ETP_PRI_MIN)) req->pri = ETP_PRI_MIN - ETP_PRI_MIN;
-  if (expect_false (req->pri > ETP_PRI_MAX - ETP_PRI_MIN)) req->pri = ETP_PRI_MAX - ETP_PRI_MIN;
+  if (ecb_expect_false (req->pri < ETP_PRI_MIN - ETP_PRI_MIN)) req->pri = ETP_PRI_MIN - ETP_PRI_MIN;
+  if (ecb_expect_false (req->pri > ETP_PRI_MAX - ETP_PRI_MIN)) req->pri = ETP_PRI_MAX - ETP_PRI_MIN;
 
-  if (expect_false (req->type == EIO_GROUP))
+  if (ecb_expect_false (req->type == EIO_GROUP))
     {
       /* I hope this is worth it :/ */
       X_LOCK (reqlock);
@@ -638,34 +698,47 @@ static void etp_submit (ETP_REQ *req)
     }
 }
 
-static void etp_set_max_poll_time (double nseconds)
+static void ecb_cold
+etp_set_max_poll_time (double nseconds)
 {
   if (WORDACCESS_UNSAFE) X_LOCK   (reslock);
   max_poll_time = nseconds * EIO_TICKS;
   if (WORDACCESS_UNSAFE) X_UNLOCK (reslock);
 }
 
-static void etp_set_max_poll_reqs (unsigned int maxreqs)
+static void ecb_cold
+etp_set_max_poll_reqs (unsigned int maxreqs)
 {
   if (WORDACCESS_UNSAFE) X_LOCK   (reslock);
   max_poll_reqs = maxreqs;
   if (WORDACCESS_UNSAFE) X_UNLOCK (reslock);
 }
 
-static void etp_set_max_idle (unsigned int nthreads)
+static void ecb_cold
+etp_set_max_idle (unsigned int nthreads)
 {
   if (WORDACCESS_UNSAFE) X_LOCK   (reqlock);
-  max_idle = nthreads <= 0 ? 1 : nthreads;
+  max_idle = nthreads;
   if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
 }
 
-static void etp_set_min_parallel (unsigned int nthreads)
+static void ecb_cold
+etp_set_idle_timeout (unsigned int seconds)
+{ /* store seconds as the new idle_timeout value */
+  if (WORDACCESS_UNSAFE) X_LOCK   (reqlock); /* reqlock is taken only when WORDACCESS_UNSAFE is set */
+  idle_timeout = seconds;
+  if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
+}
+
+static void ecb_cold
+etp_set_min_parallel (unsigned int nthreads)
 {
   if (wanted < nthreads)
     wanted = nthreads;
 }
 
-static void etp_set_max_parallel (unsigned int nthreads)
+static void ecb_cold
+etp_set_max_parallel (unsigned int nthreads)
 {
   if (wanted > nthreads)
     wanted = nthreads;
@@ -676,7 +749,8 @@ static void etp_set_max_parallel (unsigned int nthreads)
 
 /*****************************************************************************/
 
-static void grp_try_feed (eio_req *grp)
+static void
+grp_try_feed (eio_req *grp)
 {
   while (grp->size < grp->int2 && !EIO_CANCELLED (grp))
     {
@@ -693,7 +767,8 @@ static void grp_try_feed (eio_req *grp)
     }
 }
 
-static int grp_dec (eio_req *grp)
+static int
+grp_dec (eio_req *grp)
 {
   --grp->size;
 
@@ -707,7 +782,8 @@ static int grp_dec (eio_req *grp)
     return 0;
 }
 
-void eio_destroy (eio_req *req)
+static void
+eio_destroy (eio_req *req)
 {
   if ((req)->flags & EIO_FLAG_PTR1_FREE) free (req->ptr1);
   if ((req)->flags & EIO_FLAG_PTR2_FREE) free (req->ptr2);
@@ -715,7 +791,8 @@ void eio_destroy (eio_req *req)
   EIO_DESTROY (req);
 }
 
-static int eio_finish (eio_req *req)
+static int
+eio_finish (eio_req *req)
 {
   int res = EIO_FINISH (req);
 
@@ -733,7 +810,7 @@ static int eio_finish (eio_req *req)
 
       res2 = grp_dec (grp);
 
-      if (!res && res2)
+      if (!res)
         res = res2;
     }
 
@@ -742,63 +819,81 @@ static int eio_finish (eio_req *req)
   return res;
 }
 
-void eio_grp_cancel (eio_req *grp)
+void
+eio_grp_cancel (eio_req *grp)
 {
   for (grp = grp->grp_first; grp; grp = grp->grp_next)
     eio_cancel (grp);
 }
 
-void eio_cancel (eio_req *req)
+void
+eio_cancel (eio_req *req)
 {
   etp_cancel (req);
 }
 
-void eio_submit (eio_req *req)
+void
+eio_submit (eio_req *req)
 {
   etp_submit (req);
 }
 
-unsigned int eio_nreqs (void)
+unsigned int
+eio_nreqs (void)
 {
   return etp_nreqs ();
 }
 
-unsigned int eio_nready (void)
+unsigned int
+eio_nready (void)
 {
   return etp_nready ();
 }
 
-unsigned int eio_npending (void)
+unsigned int
+eio_npending (void)
 {
   return etp_npending ();
 }
 
-unsigned int eio_nthreads (void)
+unsigned int ecb_cold
+eio_nthreads (void)
 {
   return etp_nthreads ();
 }
 
-void eio_set_max_poll_time (double nseconds)
+void ecb_cold
+eio_set_max_poll_time (double nseconds)
 {
   etp_set_max_poll_time (nseconds);
 }
 
-void eio_set_max_poll_reqs (unsigned int maxreqs)
+void ecb_cold
+eio_set_max_poll_reqs (unsigned int maxreqs)
 {
   etp_set_max_poll_reqs (maxreqs);
 }
 
-void eio_set_max_idle (unsigned int nthreads)
+void ecb_cold
+eio_set_max_idle (unsigned int nthreads)
 {
   etp_set_max_idle (nthreads);
 }
 
-void eio_set_min_parallel (unsigned int nthreads)
+void ecb_cold
+eio_set_idle_timeout (unsigned int seconds)
+{ /* public entry point; forwards seconds unchanged to etp_set_idle_timeout */
+  etp_set_idle_timeout (seconds);
+}
+
+void ecb_cold
+eio_set_min_parallel (unsigned int nthreads)
 {
   etp_set_min_parallel (nthreads);
 }
 
-void eio_set_max_parallel (unsigned int nthreads)
+void ecb_cold
+eio_set_max_parallel (unsigned int nthreads)
 {
   etp_set_max_parallel (nthreads);
 }
@@ -817,10 +912,10 @@ int eio_poll (void)
 # define pread  eio__pread
 # define pwrite eio__pwrite
 
-ssize_t
+static eio_ssize_t
 eio__pread (int fd, void *buf, size_t count, off_t offset)
 {
-  ssize_t res;
+  eio_ssize_t res;
   off_t ooffset;
 
   X_LOCK (preadwritelock);
@@ -833,10 +928,10 @@ eio__pread (int fd, void *buf, size_t count, off_t offset)
   return res;
 }
 
-ssize_t
+static eio_ssize_t
 eio__pwrite (int fd, void *buf, size_t count, off_t offset)
 {
-  ssize_t res;
+  eio_ssize_t res;
   off_t ooffset;
 
   X_LOCK (preadwritelock);
@@ -878,7 +973,8 @@ eio__utimes (const char *filename, const struct timeval times[2])
 # undef futimes
 # define futimes(fd,times) eio__futimes (fd, times)
 
-static int eio__futimes (int fd, const struct timeval tv[2])
+static int
+eio__futimes (int fd, const struct timeval tv[2])
 {
   errno = ENOSYS;
   return -1;
@@ -886,25 +982,31 @@ static int eio__futimes (int fd, const struct timeval tv[2])
 
 #endif
 
-#ifdef _WIN32
-# define fsync(fd) (FlushFileBuffers((HANDLE)_get_osfhandle(fd)) ? 0 : -1)
-#endif
-
 #if !HAVE_FDATASYNC
 # undef fdatasync
 # define fdatasync(fd) fsync (fd)
 #endif
 
-// Use unicode and big file aware stat on windows
-#ifdef _WIN32
-# undef stat
-# undef fstat
-# define stat  _stati64
-# define fstat _fstati64
+static int
+eio__syncfs (int fd)
+{ /* issue the syncfs syscall on fd when HAVE_SYS_SYNCFS is set; otherwise behave as if it failed with ENOSYS */
+  int res;
+
+#if HAVE_SYS_SYNCFS
+  res = (int)syscall (__NR_syncfs, (int)(fd)); /* raw syscall, no libc wrapper is used */
+#else
+  res = -1;
+  errno = ENOSYS;
 #endif
 
+  if (res < 0 && errno == ENOSYS && fd >= 0) /* syncfs not available: fall back to a plain sync () */
+    sync ();
+
+  return res; /* unchanged by the fallback: still the failing result when sync () was called */
+}
+
 /* sync_file_range always needs emulation */
-int
+static int
 eio__sync_file_range (int fd, off_t offset, size_t nbytes, unsigned int flags)
 {
 #if HAVE_SYNC_FILE_RANGE
@@ -931,11 +1033,22 @@ eio__sync_file_range (int fd, off_t offset, size_t nbytes, unsigned int flags)
   return fdatasync (fd);
 }
 
+static int
+eio__fallocate (int fd, int mode, off_t offset, size_t len)
+{ /* thin wrapper: calls fallocate () when HAVE_FALLOCATE is set, otherwise fails with ENOSYS */
+#if HAVE_FALLOCATE
+  return fallocate (fd, mode, offset, len); /* arguments are passed through unchanged */
+#else
+  errno = ENOSYS;
+  return -1;
+#endif
+}
+
 #if !HAVE_READAHEAD
 # undef readahead
 # define readahead(fd,offset,count) eio__readahead (fd, offset, count, self)
 
-static ssize_t
+static eio_ssize_t
 eio__readahead (int fd, off_t offset, size_t count, etp_worker *self)
 {
   size_t todo = count;
@@ -950,6 +1063,8 @@ eio__readahead (int fd, off_t offset, size_t count, etp_worker *self)
       todo   -= len;
     }
 
+  FUBd;
+
   errno = 0;
   return count;
 }
@@ -957,93 +1072,118 @@ eio__readahead (int fd, off_t offset, size_t count, etp_worker *self)
 #endif
 
 /* sendfile always needs emulation */
-static ssize_t
-eio__sendfile (int ofd, int ifd, off_t offset, size_t count, etp_worker *self)
+static eio_ssize_t
+eio__sendfile (int ofd, int ifd, off_t offset, size_t count)
 {
-  ssize_t res;
+  eio_ssize_t written = 0;
+  eio_ssize_t res;
 
   if (!count)
     return 0;
 
+  for (;;)
+    {
+#ifdef __APPLE__
+# undef HAVE_SENDFILE /* broken, as everything on os x */
+#endif
 #if HAVE_SENDFILE
 # if __linux
-  res = sendfile (ofd, ifd, &offset, count);
+      off_t soffset = offset;
+      res = sendfile (ofd, ifd, &soffset, count);
 
 # elif __FreeBSD__
-  /*
-   * Of course, the freebsd sendfile is a dire hack with no thoughts
-   * wasted on making it similar to other I/O functions.
-   */
-  {
-    off_t sbytes;
-    res = sendfile (ifd, ofd, offset, count, 0, &sbytes, 0);
+      /*
+       * Of course, the freebsd sendfile is a dire hack with no thoughts
+       * wasted on making it similar to other I/O functions.
+       */
+      off_t sbytes;
+      res = sendfile (ifd, ofd, offset, count, 0, &sbytes, 0);
 
-    #if 0 /* according to the manpage, this is correct, but broken behaviour */
-    /* freebsd' sendfile will return 0 on success */
-    /* freebsd 8 documents it as only setting *sbytes on EINTR and EAGAIN, but */
-    /* not on e.g. EIO or EPIPE - sounds broken */
-    if ((res < 0 && (errno == EAGAIN || errno == EINTR) && sbytes) || res == 0)
-      res = sbytes;
-    #endif
+      #if 0 /* according to the manpage, this is correct, but broken behaviour */
+      /* freebsd' sendfile will return 0 on success */
+      /* freebsd 8 documents it as only setting *sbytes on EINTR and EAGAIN, but */
+      /* not on e.g. EIO or EPIPE - sounds broken */
+      if ((res < 0 && (errno == EAGAIN || errno == EINTR) && sbytes) || res == 0)
+        res = sbytes;
+      #endif
 
-    /* according to source inspection, this is correct, and useful behaviour */
-    if (sbytes)
-      res = sbytes;
-  }
+      /* according to source inspection, this is correct, and useful behaviour */
+      if (sbytes)
+        res = sbytes;
 
 # elif defined (__APPLE__)
+      off_t sbytes = count;
+      res = sendfile (ifd, ofd, offset, &sbytes, 0, 0);
 
-  {
-    off_t sbytes = count;
-    res = sendfile (ifd, ofd, offset, &sbytes, 0, 0);
-
-    /* according to the manpage, sbytes is always valid */
-    if (sbytes)
-      res = sbytes;
-  }
+      /* according to the manpage, sbytes is always valid */
+      if (sbytes)
+        res = sbytes;
 
 # elif __hpux
-  res = sendfile (ofd, ifd, offset, count, 0, 0);
+      res = sendfile (ofd, ifd, offset, count, 0, 0);
 
 # elif __solaris
-  {
-    struct sendfilevec vec;
-    size_t sbytes;
+      struct sendfilevec vec;
+      size_t sbytes;
 
-    vec.sfv_fd   = ifd;
-    vec.sfv_flag = 0;
-    vec.sfv_off  = offset;
-    vec.sfv_len  = count;
+      vec.sfv_fd   = ifd;
+      vec.sfv_flag = 0;
+      vec.sfv_off  = offset;
+      vec.sfv_len  = count;
 
-    res = sendfilev (ofd, &vec, 1, &sbytes);
+      res = sendfilev (ofd, &vec, 1, &sbytes);
 
-    if (res < 0 && sbytes)
-      res = sbytes;
-  }
+      if (res < 0 && sbytes)
+        res = sbytes;
 
 # endif
 
-//#elif defined (_WIN32)
-//
-//  /* does not work, just for documentation of what would need to be done */
-//  {
-//    HANDLE h = TO_SOCKET (ifd);
-//    SetFilePointer (h, offset, 0, FILE_BEGIN);
-//    res = TransmitFile (TO_SOCKET (ofd), h, count, 0, 0, 0, 0);
-//  }
+#elif defined (_WIN32) && 0
+      /* does not work, just for documentation of what would need to be done */
+      /* actually, cannot be done like this, as TransmitFile changes the file offset, */
+      /* libeio guarantees that the file offset does not change, and windows */
+      /* has no way to get an independent handle to the same file description */
+      HANDLE h = TO_SOCKET (ifd);
+      SetFilePointer (h, offset, 0, FILE_BEGIN);
+      res = TransmitFile (TO_SOCKET (ofd), h, count, 0, 0, 0, 0);
 
 #else
-  res = -1;
-  errno = ENOSYS;
+      res = -1;
+      errno = ENOSYS;
 #endif
 
-  if (res <  0
+      /* we assume sendfile can copy at least 128mb in one go */
+      if (res <= 128 * 1024 * 1024)
+        {
+          if (res > 0)
+            written += res;
+
+          if (written)
+            return written;
+
+          break;
+        }
+      else
+        {
+          /* if we requested more, then probably the kernel was lazy */
+          written += res;
+          offset  += res;
+          count   -= res;
+
+          if (!count)
+            return written;
+        }
+    }
+
+  if (res < 0
       && (errno == ENOSYS || errno == EINVAL || errno == ENOTSOCK
           /* BSDs */
 #ifdef ENOTSUP /* sigh, if the steenking pile called openbsd would only try to at least compile posix code... */
           || errno == ENOTSUP
 #endif
+#ifdef EOPNOTSUPP /* windows */
           || errno == EOPNOTSUPP /* BSDs */
+#endif
 #if __solaris
           || errno == EAFNOSUPPORT || errno == EPROTOTYPE
 #endif
@@ -1057,7 +1197,7 @@ eio__sendfile (int ofd, int ifd, off_t offset, size_t count, etp_worker *self)
 
       while (count)
         {
-          ssize_t cnt;
+          eio_ssize_t cnt;
           
           cnt = pread (ifd, eio_buf, count > EIO_BUFSIZE ? EIO_BUFSIZE : count, offset);
 
@@ -1079,16 +1219,299 @@ eio__sendfile (int ofd, int ifd, off_t offset, size_t count, etp_worker *self)
           res    += cnt;
           count  -= cnt;
         }
+
+      FUBd;
     }
 
   return res;
 }
 
+#ifdef PAGESIZE /* a PAGESIZE macro is available: use it directly */
+# define eio_pagesize() PAGESIZE
+#else
+static intptr_t /* page size as reported by sysconf (_SC_PAGESIZE) */
+eio_pagesize (void)
+{
+  static intptr_t page; /* cached across calls; 0 means "not queried yet" */
+
+  if (!page)
+    page = sysconf (_SC_PAGESIZE); /* NOTE(review): a -1 error result would be cached as-is */
+
+  return page;
+}
+#endif
+
+static void /* widens [*addr, *addr + *length) in place to page boundaries */
+eio_page_align (void **addr, size_t *length)
+{
+  intptr_t mask = eio_pagesize () - 1; /* low-bit mask; relies on the page size being a power of two */
+
+  /* round down addr */
+  intptr_t adj = mask & (intptr_t)*addr; /* offset of *addr within its page */
+
+  *addr   = (void *)((intptr_t)*addr - adj);
+  *length += adj; /* grow by the same amount so the original end stays covered */
+
+  /* round up length */
+  *length = (*length + mask) & ~mask;
+}
+
+#if !_POSIX_MEMLOCK
+# define eio__mlockall(a) EIO_ENOSYS ()
+#else
+
+static int /* returns whatever mlockall () returns */
+eio__mlockall (int flags)
+{
+  #if __GLIBC__ == 2 && __GLIBC_MINOR__ <= 7 /* glibc 2.x up to and including 2.7 only */
+    extern int mallopt (int, int);
+    mallopt (-6, 238); /* http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=473812 */
+  #endif
+
+  if (EIO_MCL_CURRENT   != MCL_CURRENT
+      || EIO_MCL_FUTURE != MCL_FUTURE)
+    { /* EIO_MCL_* values differ from the system's MCL_*: translate bit by bit */
+      flags = 0
+         | (flags & EIO_MCL_CURRENT ? MCL_CURRENT : 0)
+         | (flags & EIO_MCL_FUTURE  ? MCL_FUTURE : 0);
+    }
+
+  return mlockall (flags);
+}
+#endif
+
+#if !_POSIX_MEMLOCK_RANGE
+# define eio__mlock(a,b) EIO_ENOSYS ()
+#else
+
+static int /* returns whatever mlock () returns */
+eio__mlock (void *addr, size_t length)
+{
+  eio_page_align (&addr, &length); /* mlock is handed a range widened to whole pages */
+
+  return mlock (addr, length);
+}
+
+#endif
+
+#if !(_POSIX_MAPPED_FILES && _POSIX_SYNCHRONIZED_IO)
+# define eio__msync(a,b,c) EIO_ENOSYS ()
+#else
+
+static int /* msync () over the page-aligned range; returns whatever msync () returns */
+eio__msync (void *mem, size_t len, int flags)
+{
+  eio_page_align (&mem, &len); /* widen [mem, mem + len) to whole pages */
+
+  if (EIO_MS_ASYNC         != MS_ASYNC
+      || EIO_MS_INVALIDATE != MS_INVALIDATE
+      || EIO_MS_SYNC       != MS_SYNC)
+    { /* some EIO_MS_* value differs from the system's MS_*: translate bit by bit */
+      flags = 0
+         | (flags & EIO_MS_ASYNC      ? MS_ASYNC : 0)
+         | (flags & EIO_MS_INVALIDATE ? MS_INVALIDATE : 0)
+         | (flags & EIO_MS_SYNC       ? MS_SYNC : 0);
+    }
+
+  return msync (mem, len, flags);
+}
+
+#endif
+
+static int /* EIO_MTOUCH worker; always returns 0 */
+eio__mtouch (eio_req *req)
+{
+  void *mem  = req->ptr2;
+  size_t len = req->size;
+  int flags  = req->int1;
+  /* widen the range to whole pages, then access one word in every page of it */
+  eio_page_align (&mem, &len);
+
+  {
+    intptr_t addr = (intptr_t)mem;
+    intptr_t end = addr + len; /* one past the last byte to touch */
+    intptr_t page = eio_pagesize ();
+
+    if (addr < end) /* the loops compare against end (an address), not len (a byte count), and stop early on cancel */
+      if (flags & EIO_MT_MODIFY) /* modify */
+        do { *((volatile sig_atomic_t *)addr) |= 0; } while ((addr += page) < end && !EIO_CANCELLED (req));
+      else
+        do { *((volatile sig_atomic_t *)addr)     ; } while ((addr += page) < end && !EIO_CANCELLED (req));
+  }
+
+  return 0;
+}
+
+/*****************************************************************************/
+/* requests implemented outside eio_execute, because they are so large */
+
+static void
+eio__realpath (eio_req *req, etp_worker *self)
+{
+  char *rel = req->ptr1;
+  char *res;
+  char *tmp1, *tmp2;
+#if SYMLOOP_MAX > 32
+  int symlinks = SYMLOOP_MAX;
+#else
+  int symlinks = 32;
+#endif
+
+  req->result = -1;
+
+  errno = EINVAL;
+  if (!rel)
+    return;
+
+  errno = ENOENT;
+  if (!*rel)
+    return;
+
+  if (!req->ptr2)
+    {
+      X_LOCK (wrklock);
+      req->flags |= EIO_FLAG_PTR2_FREE;
+      X_UNLOCK (wrklock);
+      req->ptr2 = malloc (PATH_MAX * 3);
+
+      errno = ENOMEM;
+      if (!req->ptr2)
+        return;
+    }
+
+  res  = req->ptr2;
+  tmp1 = res  + PATH_MAX;
+  tmp2 = tmp1 + PATH_MAX;
+
+#if 0 /* disabled, the musl way to do things is just too racy */
+#if __linux && defined(O_NONBLOCK) && defined(O_NOATIME)
+  /* on linux we may be able to ask the kernel */
+  {
+    int fd = open (rel, O_RDONLY | O_NONBLOCK | O_NOCTTY | O_NOATIME);
+
+    if (fd >= 0)
+      {
+        sprintf (tmp1, "/proc/self/fd/%d", fd);
+        req->result = readlink (tmp1, res, PATH_MAX);
+        close (fd);
+
+        /* here we should probably stat the open file and the disk file, to make sure they still match */
+
+        if (req->result > 0)
+          goto done;
+      }
+    else if (errno == ELOOP || errno == ENAMETOOLONG || errno == ENOENT || errno == ENOTDIR || errno == EIO)
+      return;
+  }
+#endif
+#endif
+
+  if (*rel != '/')
+    {
+      if (!getcwd (res, PATH_MAX))
+        return;
+
+      if (res [1]) /* only use if not / */
+        res += strlen (res);
+    }
+
+  while (*rel)
+    {
+      eio_ssize_t len, linklen;
+      char *beg = rel;
+
+      while (*rel && *rel != '/')
+        ++rel;
+
+      len = rel - beg;
+
+      if (!len) /* skip slashes */
+        {
+          ++rel;
+          continue;
+        }
+
+      if (beg [0] == '.')
+        {
+          if (len == 1)
+            continue; /* . - nop */
+
+          if (beg [1] == '.' && len == 2)
+            {
+              /* .. - back up one component, if possible */
+
+              while (res != req->ptr2)
+                if (*--res == '/')
+                  break;
+
+              continue;
+            }
+        }
+
+        errno = ENAMETOOLONG;
+        if (res + 1 + len + 1 >= tmp1)
+          return;
+
+        /* copy one component */
+        *res = '/';
+        memcpy (res + 1, beg, len);
+
+        /* zero-terminate, for readlink */
+        res [len + 1] = 0;
+
+        /* now check if it's a symlink */
+        linklen = readlink (req->ptr2, tmp1, PATH_MAX);
+
+        if (linklen < 0)
+          {
+            if (errno != EINVAL)
+              return;
+
+            /* it's a normal directory. hopefully */
+            res += len + 1;
+          }
+        else
+          {
+            /* yay, it was a symlink - build new path in tmp2 */
+            int rellen = strlen (rel);
+
+            errno = ENAMETOOLONG;
+            if (linklen + 1 + rellen >= PATH_MAX)
+              return;
+
+            errno = ELOOP;
+            if (!--symlinks)
+              return;
+
+            if (*tmp1 == '/')
+              res = req->ptr2; /* symlink resolves to an absolute path */
+
+            /* we need to be careful, as rel might point into tmp2 already */
+            memmove (tmp2 + linklen + 1, rel, rellen + 1);
+            tmp2 [linklen] = '/';
+            memcpy (tmp2, tmp1, linklen);
+
+            rel = tmp2;
+          }
+    }
+
+  /* special case for the lone root path */
+  if (res == req->ptr2)
+    *res++ = '/';
+
+  req->result = res - (char *)req->ptr2;
+
+done:
+  req->ptr2 = realloc (req->ptr2, req->result); /* trade time for space savings */
+}
+
 static signed char
 eio_dent_cmp (const eio_dirent *a, const eio_dirent *b)
 {
-    return a->score - b->score ? a->score - b->score /* works because our signed char is always 0..100 */
-              : a->inode < b->inode ? -1 : a->inode > b->inode ? 1 : 0;
+  return a->score - b->score ? a->score - b->score /* works because our signed char is always 0..100 */
+       : a->inode < b->inode ? -1
+       : a->inode > b->inode ?  1
+       :                        0;
 }
 
 #define EIO_DENT_CMP(i,op,j) eio_dent_cmp (&i, &j) op 0
@@ -1097,41 +1520,41 @@ eio_dent_cmp (const eio_dirent *a, const eio_dirent *b)
 #define EIO_SORT_FAST   60 /* when to only use insertion sort */
 
 static void
-eio_dent_radix_sort (eio_dirent *dents, int size, signed char score_bits, ino_t inode_bits)
+eio_dent_radix_sort (eio_dirent *dents, int size, signed char score_bits, eio_ino_t inode_bits)
 {
-  unsigned char bits [9 + sizeof (ino_t) * 8];
+  unsigned char bits [9 + sizeof (eio_ino_t) * 8];
   unsigned char *bit = bits;
 
   assert (CHAR_BIT == 8);
   assert (sizeof (eio_dirent) * 8 < 256);
-  assert (offsetof (eio_dirent, inode)); /* we use 0 as sentinel */
-  assert (offsetof (eio_dirent, score)); /* we use 0 as sentinel */
+  assert (offsetof (eio_dirent, inode)); /* we use bit #0 as sentinel */
+  assert (offsetof (eio_dirent, score)); /* we use bit #0 as sentinel */
 
   if (size <= EIO_SORT_FAST)
     return;
 
   /* first prepare an array of bits to test in our radix sort */
-  /* try to take endianness into account, as well as differences in ino_t sizes */
+  /* try to take endianness into account, as well as differences in eio_ino_t sizes */
   /* inode_bits must contain all inodes ORed together */
   /* which is used to skip bits that are 0 everywhere, which is very common */
   {
-    ino_t endianness;
+    eio_ino_t endianness;
     int i, j;
 
     /* we store the byte offset of byte n into byte n of "endianness" */
-    for (i = 0; i < sizeof (ino_t); ++i)
+    for (i = 0; i < sizeof (eio_ino_t); ++i)
       ((unsigned char *)&endianness)[i] = i;
 
     *bit++ = 0;
 
-    for (i = 0; i < sizeof (ino_t); ++i)
+    for (i = 0; i < sizeof (eio_ino_t); ++i)
       {
         /* shifting off the byte offsets out of "endianness" */
         int offs = (offsetof (eio_dirent, inode) + (endianness & 0xff)) * 8;
         endianness >>= 8;
 
         for (j = 0; j < 8; ++j)
-          if (inode_bits & (((ino_t)1) << (i * 8 + j)))
+          if (inode_bits & (((eio_ino_t)1) << (i * 8 + j)))
             *bit++ = offs + j;
       }
 
@@ -1142,9 +1565,9 @@ eio_dent_radix_sort (eio_dirent *dents, int size, signed char score_bits, ino_t
 
   /* now actually do the sorting (a variant of MSD radix sort) */
   {
-    eio_dirent    *base_stk [9 + sizeof (ino_t) * 8], *base;
-    eio_dirent    *end_stk  [9 + sizeof (ino_t) * 8], *end;
-    unsigned char *bit_stk  [9 + sizeof (ino_t) * 8];
+    eio_dirent    *base_stk [9 + sizeof (eio_ino_t) * 8], *base;
+    eio_dirent    *end_stk  [9 + sizeof (eio_ino_t) * 8], *end;
+    unsigned char *bit_stk  [9 + sizeof (eio_ino_t) * 8];
     int stk_idx = 0;
 
     base_stk [stk_idx] = dents;
@@ -1233,7 +1656,7 @@ eio_dent_insertion_sort (eio_dirent *dents, int size)
 }
 
 static void
-eio_dent_sort (eio_dirent *dents, int size, signed char score_bits, ino_t inode_bits)
+eio_dent_sort (eio_dirent *dents, int size, signed char score_bits, eio_ino_t inode_bits)
 {
   if (size <= 1)
     return; /* our insertion sort relies on size > 0 */
@@ -1251,25 +1674,74 @@ eio_dent_sort (eio_dirent *dents, int size, signed char score_bits, ino_t inode_
 static void
 eio__scandir (eio_req *req, etp_worker *self)
 {
-  DIR *dirp;
-  EIO_STRUCT_DIRENT *entp;
   char *name, *names;
-  int namesalloc = 4096;
+  int namesalloc = 4096 - sizeof (void *) * 4;
   int namesoffs = 0;
   int flags = req->int1;
   eio_dirent *dents = 0;
   int dentalloc = 128;
   int dentoffs = 0;
-  ino_t inode_bits = 0;
+  eio_ino_t inode_bits = 0;
+#ifdef _WIN32
+  HANDLE dirp;
+  WIN32_FIND_DATA entp;
+#else
+  DIR *dirp;
+  EIO_STRUCT_DIRENT *entp;
+#endif
 
   req->result = -1;
 
   if (!(flags & EIO_READDIR_DENTS))
     flags &= ~(EIO_READDIR_DIRS_FIRST | EIO_READDIR_STAT_ORDER);
 
-  X_LOCK (wrklock);
-  /* the corresponding closedir is in ETP_WORKER_CLEAR */
-  self->dirp = dirp = opendir (req->ptr1);
+#ifdef _WIN32
+  {
+    int len = strlen ((const char *)req->ptr1);
+    char *path = malloc (MAX_PATH);
+    const char *fmt;
+
+    if (!len)
+      fmt = "./*";
+    else if (((const char *)req->ptr1)[len - 1] == '/' || ((const char *)req->ptr1)[len - 1] == '\\')
+      fmt = "%s*";
+    else
+      fmt = "%s/*";
+
+    _snprintf (path, MAX_PATH, fmt, (const char *)req->ptr1);
+    dirp = FindFirstFile (path, &entp);
+    free (path);
+
+    if (dirp == INVALID_HANDLE_VALUE)
+     {
+       dirp = 0;
+
+        /* should steal _dosmaperr */
+        switch (GetLastError ())
+          {
+            case ERROR_FILE_NOT_FOUND:
+              req->result = 0;
+              break;
+
+            case ERROR_INVALID_NAME:
+            case ERROR_PATH_NOT_FOUND:
+            case ERROR_NO_MORE_FILES:
+              errno = ENOENT;
+              break;
+
+            case ERROR_NOT_ENOUGH_MEMORY:
+              errno = ENOMEM;
+              break;
+
+            default:
+              errno = EINVAL;
+              break;
+          }
+     }
+  }
+#else
+  dirp = opendir (req->ptr1);
+#endif
 
   if (req->flags & EIO_FLAG_PTR1_FREE)
     free (req->ptr1);
@@ -1277,25 +1749,37 @@ eio__scandir (eio_req *req, etp_worker *self)
   req->flags |= EIO_FLAG_PTR1_FREE | EIO_FLAG_PTR2_FREE;
   req->ptr1 = dents = flags ? malloc (dentalloc * sizeof (eio_dirent)) : 0;
   req->ptr2 = names = malloc (namesalloc);
-  X_UNLOCK (wrklock);
 
   if (dirp && names && (!flags || dents))
     for (;;)
       {
+        int done;
+
+#ifdef _WIN32
+        done = !dirp;
+#else
         errno = 0;
         entp = readdir (dirp);
+        done = !entp;
+#endif
 
-        if (!entp)
+        if (done)
           {
+#ifndef _WIN32
+            int old_errno = errno;
+            closedir (dirp);
+            errno = old_errno;
+
             if (errno)
               break;
+#endif
 
             /* sort etc. */
             req->int1   = flags;
             req->result = dentoffs;
 
             if (flags & EIO_READDIR_STAT_ORDER)
-              eio_dent_sort (dents, dentoffs, 0, inode_bits); /* sort by inode exclusively */
+              eio_dent_sort (dents, dentoffs, flags & EIO_READDIR_DIRS_FIRST ? 7 : 0, inode_bits);
             else if (flags & EIO_READDIR_DIRS_FIRST)
               if (flags & EIO_READDIR_FOUND_UNKNOWN)
                 eio_dent_sort (dents, dentoffs, 7, inode_bits); /* sort by score and inode */
@@ -1307,7 +1791,6 @@ eio__scandir (eio_req *req, etp_worker *self)
 
                   /* now partition dirs to the front, and non-dirs to the back */
                   /* by walking from both sides and swapping if necessary */
-                  /* also clear score, so it doesn't influence sorting */
                   while (oth > dir)
                     {
                       if (dir->type == EIO_DT_DIR)
@@ -1320,7 +1803,7 @@ eio__scandir (eio_req *req, etp_worker *self)
                         }
                     }
 
-                  /* now sort the dirs only */
+                  /* now sort the dirs only (dirs all have the same score) */
                   eio_dent_sort (dents, dir - dents, 0, inode_bits);
                 }
 
@@ -1328,19 +1811,17 @@ eio__scandir (eio_req *req, etp_worker *self)
           }
 
         /* now add the entry to our list(s) */
-        name = entp->d_name;
+        name = D_NAME (entp);
 
         /* skip . and .. entries */
         if (name [0] != '.' || (name [1] && (name [1] != '.' || name [2])))
           {
             int len = D_NAMLEN (entp) + 1;
 
-            while (expect_false (namesoffs + len > namesalloc))
+            while (ecb_expect_false (namesoffs + len > namesalloc))
               {
                 namesalloc *= 2;
-                X_LOCK (wrklock);
                 req->ptr2 = names = realloc (names, namesalloc);
-                X_UNLOCK (wrklock);
 
                 if (!names)
                   break;
@@ -1352,12 +1833,10 @@ eio__scandir (eio_req *req, etp_worker *self)
               {
                 struct eio_dirent *ent;
 
-                if (expect_false (dentoffs == dentalloc))
+                if (ecb_expect_false (dentoffs == dentalloc))
                   {
                     dentalloc *= 2;
-                    X_LOCK (wrklock);
                     req->ptr1 = dents = realloc (dents, dentalloc * sizeof (eio_dirent));
-                    X_UNLOCK (wrklock);
 
                     if (!dents)
                       break;
@@ -1447,133 +1926,17 @@ eio__scandir (eio_req *req, etp_worker *self)
             errno = ECANCELED;
             break;
           }
+
+#ifdef _WIN32
+        if (!FindNextFile (dirp, &entp))
+          {
+            FindClose (dirp);
+            dirp = 0;
+          }
+#endif
       }
 }
 
-#ifdef PAGESIZE
-# define eio_pagesize() PAGESIZE
-
-#elif defined(_WIN32)
-  /* Windows */
-  static intptr_t
-  eio_pagesize (void)
-  { 
-    SYSTEM_INFO si;
-    GetSystemInfo(&si);
-    return si.dwPageSize;
-  }
-
-#else
-  /* POSIX */
-  static intptr_t
-  eio_pagesize (void)
-  {
-    static intptr_t page;
-
-    if (!page)
-      page = sysconf (_SC_PAGESIZE);
-
-    return page;
-  }
-#endif
-
-static void
-eio_page_align (void **addr, size_t *length)
-{
-  intptr_t mask = eio_pagesize () - 1;
-
-  /* round down addr */
-  intptr_t adj = mask & (intptr_t)*addr;
-
-  *addr   = (void *)((intptr_t)*addr - adj);
-  *length += adj;
-
-  /* round up length */
-  *length = (*length + mask) & ~mask;
-}
-
-#if !_POSIX_MEMLOCK
-# define eio__mlockall(a) ((errno = ENOSYS), -1)
-#else
-
-static int
-eio__mlockall (int flags)
-{
-  #if __GLIBC__ == 2 && __GLIBC_MINOR__ <= 7
-    extern int mallopt (int, int);
-    mallopt (-6, 238); /* http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=473812 */
-  #endif
-
-  if (EIO_MCL_CURRENT   != MCL_CURRENT
-      || EIO_MCL_FUTURE != MCL_FUTURE)
-    {
-      flags = 0
-         | (flags & EIO_MCL_CURRENT ? MCL_CURRENT : 0)
-         | (flags & EIO_MCL_FUTURE  ? MCL_FUTURE : 0);
-    }
-
-  return mlockall (flags);
-}
-#endif
-
-#if !_POSIX_MEMLOCK_RANGE
-# define eio__mlock(a,b) ((errno = ENOSYS), -1)
-#else
-
-static int
-eio__mlock (void *addr, size_t length)
-{
-  eio_page_align (&addr, &length);
-
-  return mlock (addr, length);
-}
-
-#endif
-
-#if !(_POSIX_MAPPED_FILES && _POSIX_SYNCHRONIZED_IO)
-# define eio__msync(a,b,c) ((errno = ENOSYS), -1)
-#else
-
-int
-eio__msync (void *mem, size_t len, int flags)
-{
-  eio_page_align (&mem, &len);
-
-  if (EIO_MS_ASYNC         != MS_SYNC
-      || EIO_MS_INVALIDATE != MS_INVALIDATE
-      || EIO_MS_SYNC       != MS_SYNC)
-    {
-      flags = 0
-         | (flags & EIO_MS_ASYNC      ? MS_ASYNC : 0)
-         | (flags & EIO_MS_INVALIDATE ? MS_INVALIDATE : 0)
-         | (flags & EIO_MS_SYNC       ? MS_SYNC : 0);
-    }
-
-  return msync (mem, len, flags);
-}
-
-#endif
-
-int
-eio__mtouch (void *mem, size_t len, int flags)
-{
-  eio_page_align (&mem, &len);
-
-  {
-    intptr_t addr = (intptr_t)mem;
-    intptr_t end = addr + len;
-    intptr_t page = eio_pagesize ();
-
-    if (addr < end)
-      if (flags & EIO_MT_MODIFY) /* modify */
-        do { *((volatile sig_atomic_t *)addr) |= 0; } while ((addr += page) < len);
-      else
-        do { *((volatile sig_atomic_t *)addr)     ; } while ((addr += page) < len);
-  }
-
-  return 0;
-}
-
 /*****************************************************************************/
 
 #define ALLOC(len)				\
@@ -1597,11 +1960,17 @@ X_THREAD_PROC (etp_proc)
   struct timespec ts;
   etp_worker *self = (etp_worker *)thr_arg;
 
-  /* try to distribute timeouts somewhat randomly */
+#if HAVE_PRCTL_SET_NAME
+  prctl (PR_SET_NAME, (unsigned long)"eio_thread", 0, 0, 0);
+#endif
+
+  /* try to distribute timeouts somewhat evenly */
   ts.tv_nsec = ((unsigned long)self & 1023UL) * (1000000000UL / 1024UL);
 
   for (;;)
     {
+      ts.tv_sec = 0;
+
       X_LOCK (reqlock);
 
       for (;;)
@@ -1611,23 +1980,28 @@ X_THREAD_PROC (etp_proc)
           if (req)
             break;
 
+          if (ts.tv_sec == 1) /* no request, but timeout detected, let's quit */
+            {
+              X_UNLOCK (reqlock);
+              X_LOCK (wrklock);
+              --started;
+              X_UNLOCK (wrklock);
+              goto quit;
+            }
+
           ++idle;
 
-          ts.tv_sec = time (0) + IDLE_TIMEOUT;
-          if (X_COND_TIMEDWAIT (reqwait, reqlock, ts) == ETIMEDOUT)
+          if (idle <= max_idle)
+            /* we are allowed to idle, so do so without any timeout */
+            X_COND_WAIT (reqwait, reqlock);
+          else
             {
-              if (idle > max_idle)
-                {
-                  --idle;
-                  X_UNLOCK (reqlock);
-                  X_LOCK (wrklock);
-                  --started;
-                  X_UNLOCK (wrklock);
-                  goto quit;
-                }
+              /* initialise timeout once */
+              if (!ts.tv_sec)
+                ts.tv_sec = time (0) + idle_timeout;
 
-              /* we are allowed to idle, so do so without any timeout */
-              X_COND_WAIT (reqwait, reqlock);
+              if (X_COND_TIMEDWAIT (reqwait, reqlock, ts) == ETIMEDOUT)
+                ts.tv_sec = 1; /* assuming this is not a value computed above.,.. */
             }
 
           --idle;
@@ -1640,8 +2014,7 @@ X_THREAD_PROC (etp_proc)
       if (req->type < 0)
         goto quit;
 
-      if (!EIO_CANCELLED (req))
-        ETP_EXECUTE (self, req);
+      ETP_EXECUTE (self, req);
 
       X_LOCK (reslock);
 
@@ -1666,12 +2039,18 @@ quit:
 
 /*****************************************************************************/
 
-int eio_init (void (*want_poll)(void), void (*done_poll)(void))
+int ecb_cold
+eio_init (void (*want_poll)(void), void (*done_poll)(void))
 {
+#if !HAVE_PREADWRITE
+  X_MUTEX_CREATE (preadwritelock);
+#endif
+
   return etp_init (want_poll, done_poll);
 }
 
-static void eio_api_destroy (eio_req *req)
+ecb_inline void
+eio_api_destroy (eio_req *req)
 {
   free (req);
 }
@@ -1700,41 +2079,36 @@ static void eio_api_destroy (eio_req *req)
       return 0;							\
     }
 
-static void eio_execute (etp_worker *self, eio_req *req)
+static void
+eio_execute (etp_worker *self, eio_req *req)
 {
+  if (ecb_expect_false (EIO_CANCELLED (req)))
+    {
+      req->result  = -1;
+      req->errorno = ECANCELED;
+      return;
+    }
+
   switch (req->type)
     {
       case EIO_READ:      ALLOC (req->size);
                           req->result = req->offs >= 0
                                       ? pread     (req->int1, req->ptr2, req->size, req->offs)
                                       : read      (req->int1, req->ptr2, req->size); break;
-      case EIO_WRITE:
-#if defined (__APPLE__)
-                          pthread_mutex_lock (&apple_bug_writelock);
-#endif
-
-                          req->result = req->offs >= 0
+      case EIO_WRITE:     req->result = req->offs >= 0
                                       ? pwrite    (req->int1, req->ptr2, req->size, req->offs)
-                                      : write     (req->int1, req->ptr2, req->size);
-
-#if defined (__APPLE__)
-                          pthread_mutex_unlock (&apple_bug_writelock);
-#endif
-                          break;
+                                      : write     (req->int1, req->ptr2, req->size); break;
 
       case EIO_READAHEAD: req->result = readahead     (req->int1, req->offs, req->size); break;
-      case EIO_SENDFILE:  req->result = eio__sendfile (req->int1, req->int2, req->offs, req->size, self); break;
+      case EIO_SENDFILE:  req->result = eio__sendfile (req->int1, req->int2, req->offs, req->size); break;
 
       case EIO_STAT:      ALLOC (sizeof (EIO_STRUCT_STAT));
                           req->result = stat      (req->ptr1, (EIO_STRUCT_STAT *)req->ptr2); break;
-#ifndef _WIN32
       case EIO_LSTAT:     ALLOC (sizeof (EIO_STRUCT_STAT));
                           req->result = lstat     (req->ptr1, (EIO_STRUCT_STAT *)req->ptr2); break;
-#endif
       case EIO_FSTAT:     ALLOC (sizeof (EIO_STRUCT_STAT));
                           req->result = fstat     (req->int1, (EIO_STRUCT_STAT *)req->ptr2); break;
 
-#ifndef _WIN32
       case EIO_STATVFS:   ALLOC (sizeof (EIO_STRUCT_STATVFS));
                           req->result = statvfs   (req->ptr1, (EIO_STRUCT_STATVFS *)req->ptr2); break;
       case EIO_FSTATVFS:  ALLOC (sizeof (EIO_STRUCT_STATVFS));
@@ -1742,12 +2116,9 @@ static void eio_execute (etp_worker *self, eio_req *req)
 
       case EIO_CHOWN:     req->result = chown     (req->ptr1, req->int2, req->int3); break;
       case EIO_FCHOWN:    req->result = fchown    (req->int1, req->int2, req->int3); break;
-#endif
       case EIO_CHMOD:     req->result = chmod     (req->ptr1, (mode_t)req->int2); break;
-#ifndef _WIN32
       case EIO_FCHMOD:    req->result = fchmod    (req->int1, (mode_t)req->int2); break;
       case EIO_TRUNCATE:  req->result = truncate  (req->ptr1, req->offs); break;
-#endif
       case EIO_FTRUNCATE: req->result = ftruncate (req->int1, req->offs); break;
 
       case EIO_OPEN:      req->result = open      (req->ptr1, req->int1, (mode_t)req->int2); break;
@@ -1755,33 +2126,27 @@ static void eio_execute (etp_worker *self, eio_req *req)
       case EIO_DUP2:      req->result = dup2      (req->int1, req->int2); break;
       case EIO_UNLINK:    req->result = unlink    (req->ptr1); break;
       case EIO_RMDIR:     req->result = rmdir     (req->ptr1); break;
-#ifdef _WIN32
-      case EIO_MKDIR:     req->result = mkdir     (req->ptr1); break;
-#else
       case EIO_MKDIR:     req->result = mkdir     (req->ptr1, (mode_t)req->int2); break;
-#endif
       case EIO_RENAME:    req->result = rename    (req->ptr1, req->ptr2); break;
-#ifndef _WIN32
       case EIO_LINK:      req->result = link      (req->ptr1, req->ptr2); break;
       case EIO_SYMLINK:   req->result = symlink   (req->ptr1, req->ptr2); break;
-      case EIO_MKNOD:     req->result = mknod     (req->ptr1, (mode_t)req->int2, (dev_t)req->int3); break;
-#endif
+      case EIO_MKNOD:     req->result = mknod     (req->ptr1, (mode_t)req->int2, (dev_t)req->offs); break;
+
+      case EIO_REALPATH:  eio__realpath (req, self); break;
 
-#ifndef _WIN32
       case EIO_READLINK:  ALLOC (PATH_MAX);
                           req->result = readlink  (req->ptr1, req->ptr2, PATH_MAX); break;
-#endif
 
-#ifndef _WIN32
       case EIO_SYNC:      req->result = 0; sync (); break;
-#endif
       case EIO_FSYNC:     req->result = fsync     (req->int1); break;
       case EIO_FDATASYNC: req->result = fdatasync (req->int1); break;
+      case EIO_SYNCFS:    req->result = eio__syncfs (req->int1); break;
+      case EIO_SYNC_FILE_RANGE: req->result = eio__sync_file_range (req->int1, req->offs, req->size, req->int2); break;
       case EIO_MSYNC:     req->result = eio__msync (req->ptr2, req->size, req->int1); break;
-      case EIO_MTOUCH:    req->result = eio__mtouch (req->ptr2, req->size, req->int1); break;
+      case EIO_MTOUCH:    req->result = eio__mtouch (req); break;
       case EIO_MLOCK:     req->result = eio__mlock (req->ptr2, req->size); break;
       case EIO_MLOCKALL:  req->result = eio__mlockall (req->int1); break;
-      case EIO_SYNC_FILE_RANGE: req->result = eio__sync_file_range (req->int1, req->offs, req->size, req->int2); break;
+      case EIO_FALLOCATE: req->result = eio__fallocate (req->int1, req->int2, req->offs, req->size); break;
 
       case EIO_READDIR:   eio__scandir (req, self); break;
 
@@ -1832,7 +2197,7 @@ static void eio_execute (etp_worker *self, eio_req *req)
         break;
 
       case EIO_CUSTOM:
-        ((void (*)(eio_req *))req->feed) (req);
+        req->feed (req);
         break;
 
       default:
@@ -1871,6 +2236,21 @@ eio_req *eio_msync (void *addr, size_t length, int flags, int pri, eio_cb cb, vo
   REQ (EIO_MSYNC); req->ptr2 = addr; req->size = length; req->int1 = flags; SEND;
 }
 
+eio_req *eio_fdatasync (int fd, int pri, eio_cb cb, void *data)
+{
+  REQ (EIO_FDATASYNC); req->int1 = fd; SEND;
+}
+
+eio_req *eio_syncfs (int fd, int pri, eio_cb cb, void *data)
+{
+  REQ (EIO_SYNCFS); req->int1 = fd; SEND;
+}
+
+eio_req *eio_sync_file_range (int fd, off_t offset, size_t nbytes, unsigned int flags, int pri, eio_cb cb, void *data)
+{
+  REQ (EIO_SYNC_FILE_RANGE); req->int1 = fd; req->offs = offset; req->size = nbytes; req->int2 = flags; SEND;
+}
+
 eio_req *eio_mtouch (void *addr, size_t length, int flags, int pri, eio_cb cb, void *data)
 {
   REQ (EIO_MTOUCH); req->ptr2 = addr; req->size = length; req->int1 = flags; SEND;
@@ -1886,14 +2266,9 @@ eio_req *eio_mlockall (int flags, int pri, eio_cb cb, void *data)
   REQ (EIO_MLOCKALL); req->int1 = flags; SEND;
 }
 
-eio_req *eio_sync_file_range (int fd, off_t offset, size_t nbytes, unsigned int flags, int pri, eio_cb cb, void *data)
+eio_req *eio_fallocate (int fd, int mode, off_t offset, size_t len, int pri, eio_cb cb, void *data)
 {
-  REQ (EIO_SYNC_FILE_RANGE); req->int1 = fd; req->offs = offset; req->size = nbytes; req->int2 = flags; SEND;
-}
-
-eio_req *eio_fdatasync (int fd, int pri, eio_cb cb, void *data)
-{
-  REQ (EIO_FDATASYNC); req->int1 = fd; SEND;
+  REQ (EIO_FALLOCATE); req->int1 = fd; req->int2 = mode; req->offs = offset; req->size = len; SEND;
 }
 
 eio_req *eio_close (int fd, int pri, eio_cb cb, void *data)
@@ -1941,7 +2316,7 @@ eio_req *eio_fchmod (int fd, mode_t mode, int pri, eio_cb cb, void *data)
   REQ (EIO_FCHMOD); req->int1 = fd; req->int2 = (long)mode; SEND;
 }
 
-eio_req *eio_fchown (int fd, uid_t uid, gid_t gid, int pri, eio_cb cb, void *data)
+eio_req *eio_fchown (int fd, eio_uid_t uid, eio_gid_t gid, int pri, eio_cb cb, void *data)
 {
   REQ (EIO_FCHOWN); req->int1 = fd; req->int2 = (long)uid; req->int3 = (long)gid; SEND;
 }
@@ -1971,7 +2346,7 @@ eio_req *eio_truncate (const char *path, off_t offset, int pri, eio_cb cb, void
   REQ (EIO_TRUNCATE); PATH; req->offs = offset; SEND;
 }
 
-eio_req *eio_chown (const char *path, uid_t uid, gid_t gid, int pri, eio_cb cb, void *data)
+eio_req *eio_chown (const char *path, eio_uid_t uid, eio_gid_t gid, int pri, eio_cb cb, void *data)
 {
   REQ (EIO_CHOWN); PATH; req->int2 = (long)uid; req->int3 = (long)gid; SEND;
 }
@@ -1997,6 +2372,11 @@ eio_req *eio_readlink (const char *path, int pri, eio_cb cb, void *data)
   return eio__1path (EIO_READLINK, path, pri, cb, data);
 }
 
+eio_req *eio_realpath (const char *path, int pri, eio_cb cb, void *data)
+{
+  return eio__1path (EIO_REALPATH, path, pri, cb, data);
+}
+
 eio_req *eio_stat (const char *path, int pri, eio_cb cb, void *data)
 {
   return eio__1path (EIO_STAT, path, pri, cb, data);
@@ -2029,7 +2409,7 @@ eio_req *eio_readdir (const char *path, int flags, int pri, eio_cb cb, void *dat
 
 eio_req *eio_mknod (const char *path, mode_t mode, dev_t dev, int pri, eio_cb cb, void *data)
 {
-  REQ (EIO_MKNOD); PATH; req->int2 = (long)mode; req->int3 = (long)dev; SEND;
+  REQ (EIO_MKNOD); PATH; req->int2 = (long)mode; req->offs = (off_t)dev; SEND;
 }
 
 static eio_req *
@@ -2063,9 +2443,9 @@ eio_req *eio_rename (const char *path, const char *new_path, int pri, eio_cb cb,
   return eio__2path (EIO_RENAME, path, new_path, pri, cb, data);
 }
 
-eio_req *eio_custom (eio_cb execute, int pri, eio_cb cb, void *data)
+eio_req *eio_custom (void (*execute)(eio_req *), int pri, eio_cb cb, void *data)
 {
-  REQ (EIO_CUSTOM); req->feed = (void (*)(eio_req *))execute; SEND;
+  REQ (EIO_CUSTOM); req->feed = execute; SEND;
 }
 
 #endif
@@ -2084,7 +2464,8 @@ eio_req *eio_grp (eio_cb cb, void *data)
 /*****************************************************************************/
 /* grp functions */
 
-void eio_grp_feed (eio_req *grp, void (*feed)(eio_req *req), int limit)
+void
+eio_grp_feed (eio_req *grp, void (*feed)(eio_req *req), int limit)
 {
   grp->int2 = limit;
   grp->feed = feed;
@@ -2092,14 +2473,16 @@ void eio_grp_feed (eio_req *grp, void (*feed)(eio_req *req), int limit)
   grp_try_feed (grp);
 }
 
-void eio_grp_limit (eio_req *grp, int limit)
+void
+eio_grp_limit (eio_req *grp, int limit)
 {
   grp->int2 = limit;
 
   grp_try_feed (grp);
 }
 
-void eio_grp_add (eio_req *grp, eio_req *req)
+void
+eio_grp_add (eio_req *grp, eio_req *req)
 {
   assert (("cannot add requests to IO::AIO::GRP after the group finished", grp->int1 != 2));
 
@@ -2120,18 +2503,9 @@ void eio_grp_add (eio_req *grp, eio_req *req)
 /*****************************************************************************/
 /* misc garbage */
 
-ssize_t eio_sendfile_sync (int ofd, int ifd, off_t offset, size_t count)
+eio_ssize_t
+eio_sendfile_sync (int ofd, int ifd, off_t offset, size_t count)
 {
-  etp_worker wrk;
-  ssize_t ret;
-
-  wrk.dbuf = 0;
-
-  ret = eio__sendfile (ofd, ifd, offset, count, &wrk);
-
-  if (wrk.dbuf)
-    free (wrk.dbuf);
-
-  return ret;
+  return eio__sendfile (ofd, ifd, offset, count);
 }
 
diff --git a/src/eio/eio.pod b/src/eio/eio.pod
index c83768eb..da5f33c0 100644
--- a/src/eio/eio.pod
+++ b/src/eio/eio.pod
@@ -13,14 +13,14 @@ web page you might find easier to navigate when reading it for the first
 time: L<http://pod.tst.eu/http://cvs.schmorp.de/libeio/eio.pod>.
 
 Note that this library is a by-product of the C<IO::AIO> perl
-module, and many of the subtler points regarding requets lifetime
+module, and many of the subtler points regarding requests lifetime
 and so on are only documented in its documentation at the
 moment: L<http://pod.tst.eu/http://cvs.schmorp.de/IO-AIO/AIO.pm>.
 
 =head2 FEATURES
 
 This library provides fully asynchronous versions of most POSIX functions
-dealign with I/O. Unlike most asynchronous libraries, this not only
+dealing with I/O. Unlike most asynchronous libraries, this not only
 includes C<read> and C<write>, but also C<open>, C<stat>, C<unlink> and
 similar functions, as well as less rarely ones such as C<mknod>, C<futime>
 or C<readlink>.
@@ -39,7 +39,7 @@ C<readdir>.
 Libeio represents time as a single floating point number, representing the
 (fractional) number of seconds since the (POSIX) epoch (somewhere near
 the beginning of 1970, details are complicated, don't ask). This type is
-called C<eio_tstamp>, but it is guarenteed to be of type C<double> (or
+called C<eio_tstamp>, but it is guaranteed to be of type C<double> (or
 better), so you can freely use C<double> yourself.
 
 Unlike the name component C<stamp> might indicate, it is also used for
@@ -47,15 +47,24 @@ time differences throughout libeio.
 
 =head2 FORK SUPPORT
 
-Calling C<fork ()> is fully supported by this module. It is implemented in these steps:
+Usage of pthreads in a program changes the semantics of fork
+considerably. Specifically, only async-safe functions can be called after
+fork. Libeio uses pthreads, so this applies, and makes using fork hard for
+anything but relatively simple fork + exec uses.
 
-   1. wait till all requests in "execute" state have been handled
-      (basically requests that are already handed over to the kernel).
-   2. fork
-   3. in the parent, continue business as usual, done
-   4. in the child, destroy all ready and pending requests and free the
-      memory used by the worker threads. This gives you a fully empty
-      libeio queue.
+This library only works in the process that initialised it: Forking is
+fully supported, but using libeio in any other process than the one that
+called C<eio_init> is not.
+
+You might get around by not I<using> libeio before (or after) forking in
+the parent, and using it in the child afterwards. You could also try to
+call the C<eio_init> function again in the child, which will brutally
+reinitialise all data structures, which isn't POSIX conformant, but
+typically works.
+
+Otherwise, the only recommendation you should follow is: treat fork code
+the same way you treat signal handlers, and only ever call C<eio_init> in
+the process that uses it, and only once ever.
 
 =head1 INITIALISATION/INTEGRATION
 
@@ -75,6 +84,9 @@ failure it returns C<-1> and sets C<errno> appropriately.
 It accepts two function pointers specifying callbacks as argument, both of
 which can be C<0>, in which case the callback isn't called.
 
+There is currently no way to change these callbacks later, or to
+"uninitialise" the library again.
+
 =item want_poll callback
 
 The C<want_poll> callback is invoked whenever libeio wants attention (i.e.
@@ -99,7 +111,7 @@ handled or C<done_poll> has been called, which signals the same.
 Note that C<eio_poll> might return after C<done_poll> and C<want_poll>
 have been called again, so watch out for races in your code.
 
-As with C<want_poll>, this callback is called while lcoks are being held,
+As with C<want_poll>, this callback is called while locks are being held,
 so you I<must not call any libeio functions form within this callback>.
 
 =item int eio_poll ()
@@ -121,19 +133,686 @@ returns C<-1>.
 For libev, you would typically use an C<ev_async> watcher: the
 C<want_poll> callback would invoke C<ev_async_send> to wake up the event
 loop. Inside the callback set for the watcher, one would call C<eio_poll
-()> (followed by C<ev_async_send> again if C<eio_poll> indicates that not
-all requests have been handled yet). The race is taken care of because
-libev resets/rearms the async watcher before calling your callback,
-and therefore, before calling C<eio_poll>. This might result in (some)
-spurious wake-ups, but is generally harmless.
+()>.
+
+If C<eio_poll ()> is configured to not handle all results in one go
+(i.e. it returns C<-1>) then you should start an idle watcher that calls
+C<eio_poll> until it returns something C<!= -1>.
+
+A full-featured connector between libeio and libev would look as follows
+(if C<eio_poll> is handling all requests, it can of course be simplified a
+lot by removing the idle watcher logic):
+
+  static struct ev_loop *loop;
+  static ev_idle repeat_watcher;
+  static ev_async ready_watcher;
+
+  /* idle watcher callback, only used when eio_poll */
+  /* didn't handle all results in one call */
+  static void
+  repeat (EV_P_ ev_idle *w, int revents)
+  {
+    if (eio_poll () != -1)
+      ev_idle_stop (EV_A_ w);
+  }
+
+  /* eio has some results, process them */
+  static void
+  ready (EV_P_ ev_async *w, int revents)
+  {
+    if (eio_poll () == -1)
+      ev_idle_start (EV_A_ &repeat_watcher);
+  }
+
+  /* wake up the event loop */
+  static void
+  want_poll (void)
+  {
+    ev_async_send (loop, &ready_watcher);
+  }
+
+  void
+  my_init_eio ()
+  {
+    loop = EV_DEFAULT;
+
+    ev_idle_init (&repeat_watcher, repeat);
+    ev_async_init (&ready_watcher, ready);
+    ev_async_start (loop, &ready_watcher);
+
+    eio_init (want_poll, 0);
+  }
 
 For most other event loops, you would typically use a pipe - the event
-loop should be told to wait for read readyness on the read end. In
+loop should be told to wait for read readiness on the read end. In
 C<want_poll> you would write a single byte, in C<done_poll> you would try
 to read that byte, and in the callback for the read end, you would call
-C<eio_poll>. The race is avoided here because the event loop should invoke
-your callback again and again until the byte has been read (as the pipe
-read callback does not read it, only C<done_poll>).
+C<eio_poll>.
+
+You don't have to take special care in the case C<eio_poll> doesn't handle
+all requests, as the done callback will not be invoked, so the event loop
+will still signal readiness for the pipe until I<all> results have been
+processed.
+
+
+=head1 HIGH LEVEL REQUEST API
+
+Libeio has both a high-level API, which consists of calling a request
+function with a callback to be called on completion, and a low-level API
+where you fill out request structures and submit them.
+
+This section describes the high-level API.
+
+=head2 REQUEST SUBMISSION AND RESULT PROCESSING
+
+You submit a request by calling the relevant C<eio_TYPE> function with the
+required parameters, a callback of type C<int (*eio_cb)(eio_req *req)>
+(called C<eio_cb> below) and a freely usable C<void *data> argument.
+
+The return value will either be 0, in case something went really wrong
+(which can basically only happen on very fatal errors, such as C<malloc>
+returning 0, which is rather unlikely), or a pointer to the newly-created
+and submitted C<eio_req *>.
+
+The callback will be called with an C<eio_req *> which contains the
+results of the request. The members you can access inside that structure
+vary from request to request, except for:
+
+=over 4
+
+=item C<ssize_t result>
+
+This contains the result value from the call (usually the same as the
+syscall of the same name).
+
+=item C<int errorno>
+
+This contains the value of C<errno> after the call.
+
+=item C<void *data>
+
+The C<void *data> member simply stores the value of the C<data> argument.
+
+=back
+
+The return value of the callback is normally C<0>, which tells libeio to
+continue normally. If a callback returns a nonzero value, libeio will
+stop processing results (in C<eio_poll>) and will return the value to its
+caller.
+
+Memory areas passed to libeio must stay valid as long as a request
+executes, with the exception of paths, which are being copied
+internally. Any memory libeio itself allocates will be freed after the
+finish callback has been called. If you want to manage all memory passed
+to libeio yourself you can use the low-level API.
+
+For example, to open a file, you could do this:
+
+  static int
+  file_open_done (eio_req *req)
+  {
+    if (req->result < 0)
+      {
+        /* open() returned -1 */
+        errno = req->errorno;
+        perror ("open");
+      }
+    else
+      {
+        int fd = req->result;
+        /* now we have the new fd in fd */
+      }
+
+    return 0;
+  }
+
+  /* the first three arguments are passed to open(2) */
+  /* the remaining are priority, callback and data */
+  if (!eio_open ("/etc/passwd", O_RDONLY, 0, 0, file_open_done, 0))
+    abort (); /* something went wrong, we will all die!!! */
+
+Note that you additionally need to call C<eio_poll> when the C<want_poll>
+callback indicates that requests are ready to be processed.
+
+=head2 CANCELLING REQUESTS
+
+Sometimes the need for a request goes away before the request is
+finished. In that case, one can cancel the request by a call to
+C<eio_cancel>:
+
+=over 4
+
+=item eio_cancel (eio_req *req)
+
+Cancel the request (and all its subrequests). If the request is currently
+executing it might still continue to execute, and in other cases it might
+still take a while till the request is cancelled.
+
+Even if cancelled, the finish callback will still be invoked - the
+callbacks of all cancellable requests need to check whether the request
+has been cancelled by calling C<EIO_CANCELLED (req)>:
+
+  static int
+  my_eio_cb (eio_req *req)
+  {
+    if (EIO_CANCELLED (req))
+      return 0;
+  }
+
+In addition, cancelled requests will I<either> have C<< req->result >>
+set to C<-1> and C<errorno> to C<ECANCELED>, or I<otherwise> they were
+successfully executed, despite being cancelled (e.g. when they have
+already been executed at the time they were cancelled).
+
+C<EIO_CANCELLED> is still true for requests that have successfully
+executed, as long as C<eio_cancel> was called on them at some point.
+
+=back
+
+=head2 AVAILABLE REQUESTS
+
+The following request functions are available. I<All> of them return the
+C<eio_req *> on success and C<0> on failure, and I<all> of them have the
+same three trailing arguments: C<pri>, C<cb> and C<data>. The C<cb> is
+mandatory, but in most cases, you pass in C<0> as C<pri> and C<0> or some
+custom data value as C<data>.
+
+=head3 POSIX API WRAPPERS
+
+These requests simply wrap the POSIX call of the same name, with the same
+arguments. If a function is not implemented by the OS and cannot be emulated
+in some way, then all of these return C<-1> and set C<errorno> to C<ENOSYS>.
+
+=over 4
+
+=item eio_open      (const char *path, int flags, mode_t mode, int pri, eio_cb cb, void *data)
+
+=item eio_truncate  (const char *path, off_t offset, int pri, eio_cb cb, void *data)
+
+=item eio_chown     (const char *path, uid_t uid, gid_t gid, int pri, eio_cb cb, void *data)
+
+=item eio_chmod     (const char *path, mode_t mode, int pri, eio_cb cb, void *data)
+
+=item eio_mkdir     (const char *path, mode_t mode, int pri, eio_cb cb, void *data)
+
+=item eio_rmdir     (const char *path, int pri, eio_cb cb, void *data)
+
+=item eio_unlink    (const char *path, int pri, eio_cb cb, void *data)
+
+=item eio_utime     (const char *path, eio_tstamp atime, eio_tstamp mtime, int pri, eio_cb cb, void *data)
+
+=item eio_mknod     (const char *path, mode_t mode, dev_t dev, int pri, eio_cb cb, void *data)
+
+=item eio_link      (const char *path, const char *new_path, int pri, eio_cb cb, void *data)
+
+=item eio_symlink   (const char *path, const char *new_path, int pri, eio_cb cb, void *data)
+
+=item eio_rename    (const char *path, const char *new_path, int pri, eio_cb cb, void *data)
+
+=item eio_mlock     (void *addr, size_t length, int pri, eio_cb cb, void *data)
+
+=item eio_close     (int fd, int pri, eio_cb cb, void *data)
+
+=item eio_sync      (int pri, eio_cb cb, void *data)
+
+=item eio_fsync     (int fd, int pri, eio_cb cb, void *data)
+
+=item eio_fdatasync (int fd, int pri, eio_cb cb, void *data)
+
+=item eio_futime    (int fd, eio_tstamp atime, eio_tstamp mtime, int pri, eio_cb cb, void *data)
+
+=item eio_ftruncate (int fd, off_t offset, int pri, eio_cb cb, void *data)
+
+=item eio_fchmod    (int fd, mode_t mode, int pri, eio_cb cb, void *data)
+
+=item eio_fchown    (int fd, uid_t uid, gid_t gid, int pri, eio_cb cb, void *data)
+
+=item eio_dup2      (int fd, int fd2, int pri, eio_cb cb, void *data)
+
+These have the same semantics as the syscall of the same name, their
+return value is available as C<< req->result >> later.
+
+=item eio_read      (int fd, void *buf, size_t length, off_t offset, int pri, eio_cb cb, void *data)
+
+=item eio_write     (int fd, void *buf, size_t length, off_t offset, int pri, eio_cb cb, void *data)
+
+These two requests are called C<read> and C<write>, but actually wrap
+C<pread> and C<pwrite>. On systems that lack these calls (such as cygwin),
+libeio uses lseek/read_or_write/lseek and a mutex to serialise the
+requests, so all these requests run serially and do not disturb each
+other. However, they still disturb the file offset while they run, so it's
+not safe to call these functions concurrently with non-libeio functions on
+the same fd on these systems.
+
+Not surprisingly, pread and pwrite are not thread-safe on Darwin (OS/X),
+so it is advised not to submit multiple requests on the same fd on this
+horrible pile of garbage.
+
+=item eio_mlockall  (int flags, int pri, eio_cb cb, void *data)
+
+Like C<mlockall>, but the flag value constants are called
+C<EIO_MCL_CURRENT> and C<EIO_MCL_FUTURE>.
+
+=item eio_msync     (void *addr, size_t length, int flags, int pri, eio_cb cb, void *data)
+
+Just like msync, except that the flag values are called C<EIO_MS_ASYNC>,
+C<EIO_MS_INVALIDATE> and C<EIO_MS_SYNC>.
+
+=item eio_readlink  (const char *path, int pri, eio_cb cb, void *data)
+
+If successful, the path read by C<readlink(2)> can be accessed via C<<
+req->ptr2 >> and is I<NOT> null-terminated, with the length specified as
+C<< req->result >>.
+
+  if (req->result >= 0)
+    {
+      char *target = strndup ((char *)req->ptr2, req->result);
+
+      free (target);
+    }
+
+=item eio_realpath  (const char *path, int pri, eio_cb cb, void *data)
+
+Similar to the realpath libc function, but unlike that one, C<<
+req->result >> is C<-1> on failure. On success, the result is the length
+of the returned path in C<ptr2> (which is I<NOT> 0-terminated) - this is
+similar to readlink.
+
+=item eio_stat      (const char *path, int pri, eio_cb cb, void *data)
+
+=item eio_lstat     (const char *path, int pri, eio_cb cb, void *data)
+
+=item eio_fstat     (int fd, int pri, eio_cb cb, void *data)
+   
+Stats a file - if C<< req->result >> indicates success, then you can
+access the C<struct stat>-like structure via C<< req->ptr2 >>:
+
+  EIO_STRUCT_STAT *statdata = (EIO_STRUCT_STAT *)req->ptr2;
+
+=item eio_statvfs   (const char *path, int pri, eio_cb cb, void *data)
+
+=item eio_fstatvfs  (int fd, int pri, eio_cb cb, void *data)
+
+Stats a filesystem - if C<< req->result >> indicates success, then you can
+access the C<struct statvfs>-like structure via C<< req->ptr2 >>:
+
+  EIO_STRUCT_STATVFS *statdata = (EIO_STRUCT_STATVFS *)req->ptr2;
+
+=back
+
+=head3 READING DIRECTORIES
+
+Reading directories sounds simple, but can be rather demanding, especially
+if you want to do stuff such as traversing a directory hierarchy or
+processing all files in a directory. Libeio can assist these complex tasks
+with its C<eio_readdir> call.
+
+=over 4
+
+=item eio_readdir (const char *path, int flags, int pri, eio_cb cb, void *data)
+
+This is a very complex call. It basically reads through a whole directory
+(via the C<opendir>, C<readdir> and C<closedir> calls) and returns either
+the names or an array of C<struct eio_dirent>, depending on the C<flags>
+argument.
+
+The C<< req->result >> indicates either the number of files found, or
+C<-1> on error. On success, null-terminated names can be found as C<< req->ptr2 >>,
+and C<struct eio_dirents>, if requested by C<flags>, can be found via C<<
+req->ptr1 >>.
+
+Here is an example that prints all the names:
+
+  int i;
+  char *names = (char *)req->ptr2;
+
+  for (i = 0; i < req->result; ++i)
+    {
+      printf ("name #%d: %s\n", i, names);
+
+      /* move to next name */
+      names += strlen (names) + 1;
+    }
+
+Pseudo-entries such as F<.> and F<..> are never returned by C<eio_readdir>.
+
+C<flags> can be any combination of:
+
+=over 4
+
+=item EIO_READDIR_DENTS
+
+If this flag is specified, then, in addition to the names in C<ptr2>,
+also an array of C<struct eio_dirent> is returned, in C<ptr1>. A C<struct
+eio_dirent> looks like this:
+
+  struct eio_dirent
+  {
+    int nameofs; /* offset of null-terminated name string in (char *)req->ptr2 */
+    unsigned short namelen; /* size of filename without trailing 0 */
+    unsigned char type; /* one of EIO_DT_* */
+    signed char score; /* internal use */
+    ino_t inode; /* the inode number, if available, otherwise unspecified */
+  };
+
+The only members you normally would access are C<nameofs>, which is the
+byte-offset from C<ptr2> to the start of the name, C<namelen> and C<type>.
+
+C<type> can be one of:
+
+C<EIO_DT_UNKNOWN> - if the type is not known (very common) and you have to C<stat>
+the name yourself if you need to know,
+one of the "standard" POSIX file types (C<EIO_DT_REG>, C<EIO_DT_DIR>, C<EIO_DT_LNK>,
+C<EIO_DT_FIFO>, C<EIO_DT_SOCK>, C<EIO_DT_CHR>, C<EIO_DT_BLK>)
+or some OS-specific type (currently
+C<EIO_DT_MPC> - multiplexed char device (v7+coherent),
+C<EIO_DT_NAM> - xenix special named file,
+C<EIO_DT_MPB> - multiplexed block device (v7+coherent),
+C<EIO_DT_NWK> - HP-UX network special,
+C<EIO_DT_CMP> - VxFS compressed,
+C<EIO_DT_DOOR> - solaris door, or
+C<EIO_DT_WHT>).
+
+This example prints all names and their type:
+
+  int i;
+  struct eio_dirent *ents = (struct eio_dirent *)req->ptr1;
+  char *names = (char *)req->ptr2;
+
+  for (i = 0; i < req->result; ++i)
+    {
+      struct eio_dirent *ent = ents + i;
+      char *name = names + ent->nameofs;
+
+      printf ("name #%d: %s (type %d)\n", i, name, ent->type);
+    }
+
+=item EIO_READDIR_DIRS_FIRST
+
+When this flag is specified, then the names will be returned in an order
+where likely directories come first, in optimal C<stat> order. This is
+useful when you need to quickly find directories, or you want to find all
+directories while avoiding to stat() each entry.
+
+If the system returns type information in readdir, then this is used
+to find directories directly. Otherwise, likely directories are names
+beginning with ".", or otherwise names with no dots, of which names with
+short names are tried first.
+
+=item EIO_READDIR_STAT_ORDER
+
+When this flag is specified, then the names will be returned in an order
+suitable for stat()'ing each one. That is, when you plan to stat()
+all files in the given directory, then the returned order will likely
+be fastest.
+
+If both this flag and C<EIO_READDIR_DIRS_FIRST> are specified, then the
+likely directories come first, resulting in a less optimal stat order.
+
+=item EIO_READDIR_FOUND_UNKNOWN
+
+This flag should not be specified when calling C<eio_readdir>. Instead,
+it is being set by C<eio_readdir> (you can access the C<flags> via C<<
+req->int1 >>), when any of the C<type>'s found were C<EIO_DT_UNKNOWN>. The
+absence of this flag therefore indicates that all C<type>'s are known,
+which can be used to speed up some algorithms.
+
+A typical use case would be to identify all subdirectories within a
+directory - you would ask C<eio_readdir> for C<EIO_READDIR_DIRS_FIRST>. If
+then this flag is I<NOT> set, then all the entries at the beginning of the
+returned array of type C<EIO_DT_DIR> are the directories. Otherwise, you
+should start C<stat()>'ing the entries starting at the beginning of the
+array, stopping as soon as you found all directories (the count can be
+deduced by the link count of the directory).
+
+=back
+
+=back
+
+=head3 OS-SPECIFIC CALL WRAPPERS
+
+These wrap OS-specific calls (usually Linux ones), and might or might not
+be emulated on other operating systems. Calls that are not emulated will
+return C<-1> and set C<errno> to C<ENOSYS>.
+
+=over 4
+
+=item eio_sendfile (int out_fd, int in_fd, off_t in_offset, size_t length, int pri, eio_cb cb, void *data)
+
+Wraps the C<sendfile> syscall. The arguments follow the Linux version, but
+libeio supports and will use similar calls on FreeBSD, HP/UX, Solaris and
+Darwin.
+
+If the OS doesn't support some sendfile-like call, or the call fails,
+indicating missing support for the given file descriptor type (for example,
+Linux's sendfile might not support file to file copies), then libeio will
+emulate the call in userspace, so there are almost no limitations on its
+use.
+
+=item eio_readahead (int fd, off_t offset, size_t length, int pri, eio_cb cb, void *data)
+
+Calls C<readahead(2)>. If the syscall is missing, then the call is
+emulated by simply reading the data (currently in 64kiB chunks).
+
+=item eio_syncfs (int fd, int pri, eio_cb cb, void *data)
+
+Calls Linux' C<syncfs> syscall, if available. Returns C<-1> and sets
+C<errno> to C<ENOSYS> if the call is missing I<but still calls sync()>,
+if the C<fd> is C<< >= 0 >>, so you can probe for the availability of the
+syscall with a negative C<fd> argument and checking for C<-1/ENOSYS>.
+
+=item eio_sync_file_range (int fd, off_t offset, size_t nbytes, unsigned int flags, int pri, eio_cb cb, void *data)
+
+Calls C<sync_file_range>. If the syscall is missing, then this is the same
+as calling C<fdatasync>.
+
+Flags can be any combination of C<EIO_SYNC_FILE_RANGE_WAIT_BEFORE>,
+C<EIO_SYNC_FILE_RANGE_WRITE> and C<EIO_SYNC_FILE_RANGE_WAIT_AFTER>.
+
+=item eio_fallocate (int fd, int mode, off_t offset, off_t len, int pri, eio_cb cb, void *data)
+
+Calls C<fallocate> (note: I<NOT> C<posix_fallocate>!). If the syscall is
+missing, then it returns failure and sets C<errno> to C<ENOSYS>.
+
+The C<mode> argument can be C<0> (for behaviour similar to
+C<posix_fallocate>), or C<EIO_FALLOC_FL_KEEP_SIZE>, which keeps the size
+of the file unchanged (but still preallocates space beyond end of file).
+
+=back
+
+=head3 LIBEIO-SPECIFIC REQUESTS
+
+These requests are specific to libeio and do not correspond to any OS call.
+
+=over 4
+
+=item eio_mtouch (void *addr, size_t length, int flags, int pri, eio_cb cb, void *data)
+
+Reads (C<flags == 0>) or modifies (C<flags == EIO_MT_MODIFY>) the given
+memory area, page-wise, that is, it reads (or reads and writes back) the
+first octet of every page that spans the memory area.
+
+This can be used to page in some mmapped file, or dirty some pages. Note
+that dirtying is an unlocked read-write access, so races can ensue when
+some other thread modifies the data stored in that memory area.
+
+=item eio_custom (void (*execute)(eio_req *), int pri, eio_cb cb, void *data)
+
+Executes a custom request, i.e., a user-specified callback.
+
+The callback gets the C<eio_req *> as parameter and is expected to read
+and modify any request-specific members. Specifically, it should set C<<
+req->result >> to the result value, just like other requests.
+
+Here is an example that simply calls C<open>, like C<eio_open>, but it
+uses the C<data> member as filename and uses a hardcoded C<O_RDONLY>. If
+you want to pass more/other parameters, you either need to pass some
+struct or so via C<data> or provide your own wrapper using the low-level
+API.
+
+  static int
+  my_open_done (eio_req *req)
+  {
+    int fd = req->result;
+
+    return 0;
+  }
+
+  static void
+  my_open (eio_req *req)
+  {
+    req->result = open (req->data, O_RDONLY);
+  }
+
+  eio_custom (my_open, 0, my_open_done, "/etc/passwd");
+
+=item eio_busy (eio_tstamp delay, int pri, eio_cb cb, void *data)
+
+This is a request that takes C<delay> seconds to execute, but otherwise
+does nothing - it simply puts one of the worker threads to sleep for this
+long.
+
+This request can be used to artificially increase load, e.g. for debugging
+or benchmarking reasons.
+
+=item eio_nop (int pri, eio_cb cb, void *data)
+   
+This request does nothing, except go through the whole request cycle. This
+can be used to measure latency or in some cases to simplify code, but is
+not really of much use.
+
+=back
+
+=head3 GROUPING AND LIMITING REQUESTS
+
+There is one more rather special request, C<eio_grp>. It is a very special
+aio request: Instead of doing something, it is a container for other eio
+requests.
+
+There are two primary use cases for this: a) bundle many requests into a
+single, composite, request with a definite callback and the ability to
+cancel the whole request with its subrequests and b) limiting the number
+of "active" requests.
+
+Further below you will find more discussion of these topics - first
+follows the reference section detailing the request generator and other
+methods.
+
+=over 4
+
+=item eio_req *grp = eio_grp (eio_cb cb, void *data)
+
+Creates, submits and returns a group request. Note that it doesn't have a
+priority, unlike all other requests.
+
+=item eio_grp_add (eio_req *grp, eio_req *req)
+
+Adds a request to the request group.
+
+=item eio_grp_cancel (eio_req *grp)
+
+Cancels all requests I<in> the group, but I<not> the group request
+itself. You can cancel the group request I<and> all subrequests via a
+normal C<eio_cancel> call.
+
+=back
+
+=head4 GROUP REQUEST LIFETIME
+
+Left alone, a group request will instantly move to the pending state and
+will be finished at the next call of C<eio_poll>.
+
+The usefulness stems from the fact that, if a subrequest is added to a
+group I<before> a call to C<eio_poll>, via C<eio_grp_add>, then the group
+will not finish until all the subrequests have finished.
+
+So the usage cycle of a group request is like this: after it is created,
+you normally instantly add a subrequest. If none is added, the group
+request will finish on its own. As long as subrequests are added before
+the group request is finished it will be kept from finishing, that is the
+callbacks of any subrequests can, in turn, add more requests to the group,
+and as long as any requests are active, the group request itself will not
+finish.
+
+=head4 CREATING COMPOSITE REQUESTS
+
+Imagine you wanted to create an C<eio_load> request that opens a file,
+reads it and closes it. This means it has to execute at least three eio
+requests, but for various reasons it might be nice if that request looked
+like any other eio request.
+
+This can be done with groups:
+
+=over 4
+
+=item 1) create the request object
+
+Create a group that contains all further requests. This is the request you
+can return as "the load request".
+
+=item 2) open the file, maybe
+
+Next, open the file with C<eio_open> and add the request to the group
+request and you are finished setting up the request.
+
+If, for some reason, you cannot C<eio_open> (path is a null ptr?) you
+can set C<< grp->result >> to C<-1> to signal an error and let the group
+request finish on its own.
+
+=item 3) open callback adds more requests
+
+In the open callback, if the open was not successful, copy C<<
+req->errorno >> to C<< grp->errorno >> and set C<< grp->result >> to
+C<-1> to signal an error.
+
+Otherwise, malloc some memory or so and issue a read request, adding the
+read request to the group.
+
+=item 4) continue issuing requests till finished
+
+In the read callback, check for errors and possibly continue with
+C<eio_close> or any other eio request in the same way.
+
+As soon as no new requests are added the group request will finish. Make
+sure you I<always> set C<< grp->result >> to some sensible value.
+
+=back
+
+=head4 REQUEST LIMITING
+
+
+#TODO
+
+void eio_grp_limit     (eio_req *grp, int limit);
+
+
+=back
+
+
+=head1 LOW LEVEL REQUEST API
+
+#TODO
+
+
+=head1 ANATOMY AND LIFETIME OF AN EIO REQUEST
+
+A request is represented by a structure of type C<eio_req>. To initialise
+it, clear it to all zero bytes:
+
+  eio_req req;
+
+  memset (&req, 0, sizeof (req));
+
+A more common way to initialise a new C<eio_req> is to use C<calloc>:
+
+  eio_req *req = calloc (1, sizeof (*req));
+
+In either case, libeio neither allocates, initialises nor frees the
+C<eio_req> structure for you - it merely uses it.
+
+zero
+
+#TODO
 
 =head2 CONFIGURATION
 
@@ -154,14 +833,18 @@ C<0.01> seconds or so.
 
 Note that:
 
-a) libeio doesn't know how long your request callbacks take, so the time
-spent in C<eio_poll> is up to one callback invocation longer then this
-interval.
+=over 4
 
-b) this is implemented by calling C<gettimeofday> after each request,
-which can be costly.
+=item a) libeio doesn't know how long your request callbacks take, so the
+time spent in C<eio_poll> is up to one callback invocation longer than
+this interval.
 
-c) at least one request will be handled.
+=item b) this is implemented by calling C<gettimeofday> after each
+request, which can be costly.
+
+=item c) at least one request will be handled.
+
+=back
 
 =item eio_set_max_poll_reqs (unsigned int nreqs)
 
@@ -187,7 +870,7 @@ Set the maximum number of threads that libeio will spawn.
 Libeio uses threads internally to handle most requests, and will start and stop threads on demand.
 
 This call can be used to limit the number of idle threads (threads without
-work to do): libeio will keep some threads idle in preperation for more
+work to do): libeio will keep some threads idle in preparation for more
 requests, but never longer than C<nthreads> threads.
 
 In addition to this, libeio will also stop threads when they are idle for
@@ -216,23 +899,6 @@ C<eio_poll>).
 
 =back
 
-
-=head1 ANATOMY OF AN EIO REQUEST
-
-#TODO
-
-
-=head1 HIGH LEVEL REQUEST API
-
-#TODO
-
-=back
-
-
-=head1 LOW LEVEL REQUEST API
-
-#TODO
-
 =head1 EMBEDDING
 
 Libeio can be embedded directly into programs. This functionality is not
@@ -258,7 +924,7 @@ was written to use very little stackspace, but when using C<EIO_CUSTOM>
 requests, you might want to increase this.
 
 If this symbol is undefined (the default) then libeio will use its default
-stack size (C<sizeof (long) * 4096> currently).  If it is defined, but
+stack size (C<sizeof (void *) * 4096> currently). If it is defined, but
 C<0>, then the default operating system stack size will be used. In all
 other cases, the value must be an expression that evaluates to the desired
 stack size.
diff --git a/src/eio/libeio.m4 b/src/eio/libeio.m4
index 5302cfcc..59151f53 100644
--- a/src/eio/libeio.m4
+++ b/src/eio/libeio.m4
@@ -1,3 +1,7 @@
+dnl openbsd in its neverending brokenness requires stdint.h for intptr_t,
+dnl but that header isn't very portable...
+AC_CHECK_HEADERS([stdint.h sys/syscall.h sys/prctl.h])
+
 AC_SEARCH_LIBS(
    pthread_create,
    [pthread pthreads pthreadVC2],
@@ -119,6 +123,41 @@ int main (void)
 ],ac_cv_sync_file_range=yes,ac_cv_sync_file_range=no)])
 test $ac_cv_sync_file_range = yes && AC_DEFINE(HAVE_SYNC_FILE_RANGE, 1, sync_file_range(2) is available)
 
+AC_CACHE_CHECK(for fallocate, ac_cv_fallocate, [AC_LINK_IFELSE([
+#include <fcntl.h>
+int main (void)
+{
+   int fd = 0;
+   int mode = 0;
+   off_t offset = 1;
+   off_t len = 1;
+   int res;
+   res = fallocate (fd, mode, offset, len);
+   return 0;
+}
+],ac_cv_fallocate=yes,ac_cv_fallocate=no)])
+test $ac_cv_fallocate = yes && AC_DEFINE(HAVE_FALLOCATE, 1, fallocate(2) is available)
+
+AC_CACHE_CHECK(for sys_syncfs, ac_cv_sys_syncfs, [AC_LINK_IFELSE([
+#include <unistd.h>
+#include <sys/syscall.h>
+int main (void)
+{
+  int res = syscall (__NR_syncfs, (int)0);
+}
+],ac_cv_sys_syncfs=yes,ac_cv_sys_syncfs=no)])
+test $ac_cv_sys_syncfs = yes && AC_DEFINE(HAVE_SYS_SYNCFS, 1, syscall(__NR_syncfs) is available)
+
+AC_CACHE_CHECK(for prctl_set_name, ac_cv_prctl_set_name, [AC_LINK_IFELSE([
+#include <sys/prctl.h>
+int main (void)
+{
+  char name[] = "test123";
+  int res = prctl (PR_SET_NAME, (unsigned long)name, 0, 0, 0);
+}
+],ac_cv_prctl_set_name=yes,ac_cv_prctl_set_name=no)])
+test $ac_cv_prctl_set_name = yes && AC_DEFINE(HAVE_PRCTL_SET_NAME, 1, prctl(PR_SET_NAME) is available)
+
 dnl #############################################################################
 dnl # these checks exist for the benefit of IO::AIO
 
diff --git a/src/eio/xthread.h b/src/eio/xthread.h
index 01e29335..1879866f 100644
--- a/src/eio/xthread.h
+++ b/src/eio/xthread.h
@@ -2,7 +2,7 @@
 #define XTHREAD_H_
 
 /* whether word reads are potentially non-atomic.
- * this is conservatice, likely most arches this runs
+ * this is conservative, likely most arches this runs
  * on have atomic word read/writes.
  */
 #ifndef WORDACCESS_UNSAFE
@@ -17,14 +17,8 @@
 
 #ifdef _WIN32
 
-#ifndef __MINGW32__
-typedef int ssize_t
-#endif
-
 #define NTDDI_VERSION NTDDI_WIN2K // needed to get win2000 api calls
-#ifndef _WIN32_WINNT
 #define _WIN32_WINNT 0x400
-#endif
 #include <stdio.h>//D
 #include <fcntl.h>
 #include <io.h>
@@ -34,18 +28,20 @@ typedef int ssize_t
 #include <windows.h>
 #include <pthread.h>
 #define sigset_t int
+#define sigfillset(a)
 #define pthread_sigmask(a,b,c)
 #define sigaddset(a,b)
 #define sigemptyset(s)
-#define sigfillset(s)
 
 typedef pthread_mutex_t xmutex_t;
 #define X_MUTEX_INIT           PTHREAD_MUTEX_INITIALIZER
+#define X_MUTEX_CREATE(mutex)  pthread_mutex_init (&(mutex), 0)
 #define X_LOCK(mutex)          pthread_mutex_lock (&(mutex))
 #define X_UNLOCK(mutex)        pthread_mutex_unlock (&(mutex))
 
 typedef pthread_cond_t xcond_t;
 #define X_COND_INIT                     PTHREAD_COND_INITIALIZER
+#define X_COND_CREATE(cond)		pthread_cond_init (&(cond), 0)
 #define X_COND_SIGNAL(cond)             pthread_cond_signal (&(cond))
 #define X_COND_WAIT(cond,mutex)         pthread_cond_wait (&(cond), &(mutex))
 #define X_COND_TIMEDWAIT(cond,mutex,to) pthread_cond_timedwait (&(cond), &(mutex), &(to))
@@ -100,18 +96,30 @@ thread_create (xthread_t *tid, void *(*proc)(void *), void *arg)
 
 typedef pthread_mutex_t xmutex_t;
 #if __linux && defined (PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP)
-# define X_MUTEX_INIT PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+# define X_MUTEX_INIT		PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+/* runtime counterpart of X_MUTEX_INIT: initialises the mutex with the adaptive
+ * type, then destroys the temporary attribute object so it is not leaked. */
+# define X_MUTEX_CREATE(mutex)						\
+  do {									\
+    pthread_mutexattr_t attr;						\
+    pthread_mutexattr_init (&attr);					\
+    pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP);	\
+    pthread_mutex_init (&(mutex), &attr);				\
+    pthread_mutexattr_destroy (&attr);					\
+  } while (0)
 #else
-# define X_MUTEX_INIT PTHREAD_MUTEX_INITIALIZER
+# define X_MUTEX_INIT		PTHREAD_MUTEX_INITIALIZER
+# define X_MUTEX_CREATE(mutex)	pthread_mutex_init (&(mutex), 0)
 #endif
-#define X_LOCK(mutex)   pthread_mutex_lock   (&(mutex))
-#define X_UNLOCK(mutex) pthread_mutex_unlock (&(mutex))
+#define X_LOCK(mutex)		pthread_mutex_lock   (&(mutex))
+#define X_UNLOCK(mutex)		pthread_mutex_unlock (&(mutex))
 
 typedef pthread_cond_t xcond_t;
-#define X_COND_INIT PTHREAD_COND_INITIALIZER
-#define X_COND_SIGNAL(cond) pthread_cond_signal (&(cond))
-#define X_COND_WAIT(cond,mutex) pthread_cond_wait (&(cond), &(mutex))
-#define X_COND_TIMEDWAIT(cond,mutex,to) pthread_cond_timedwait (&(cond), &(mutex), &(to))
+#define X_COND_INIT			PTHREAD_COND_INITIALIZER
+#define X_COND_CREATE(cond)		pthread_cond_init (&(cond), 0)
+#define X_COND_SIGNAL(cond)		pthread_cond_signal (&(cond))
+#define X_COND_WAIT(cond,mutex)		pthread_cond_wait (&(cond), &(mutex))
+#define X_COND_TIMEDWAIT(cond,mutex,to)	pthread_cond_timedwait (&(cond), &(mutex), &(to))
 
 typedef pthread_t xthread_t;
 #define X_THREAD_PROC(name) static void *name (void *thr_arg)
@@ -122,8 +130,8 @@ typedef pthread_t xthread_t;
 # define PTHREAD_STACK_MIN 0
 #endif
 
-#ifndef X_STACKSIZE
-# define X_STACKSIZE sizeof (long) * 4096
+#ifndef XTHREAD_STACKSIZE
+# define XTHREAD_STACKSIZE sizeof (void *) * 4096
 #endif
 
 static int