Work around windows udp bug, allow zero reads

This commit is contained in:
Bert Belder 2011-10-20 10:11:04 -07:00
parent e8a418e920
commit 51e9dbc2bb
8 changed files with 410 additions and 60 deletions

View File

@ -104,6 +104,27 @@
DWORD dwFlags);
#endif
/* Pointer to a function that is call-compatible with WSARecv. */
typedef int (WSAAPI* LPFN_WSARECV)(
    SOCKET s,
    LPWSABUF bufs,
    DWORD nbufs,
    LPDWORD nbytes,
    LPDWORD recv_flags,
    LPWSAOVERLAPPED ov,
    LPWSAOVERLAPPED_COMPLETION_ROUTINE on_complete);
/* Pointer to a function that is call-compatible with WSARecvFrom. */
typedef int (WSAAPI* LPFN_WSARECVFROM)(
    SOCKET s,
    LPWSABUF bufs,
    DWORD nbufs,
    LPDWORD nbytes,
    LPDWORD recv_flags,
    struct sockaddr* from,
    LPINT from_len,
    LPWSAOVERLAPPED ov,
    LPWSAOVERLAPPED_COMPLETION_ROUTINE on_complete);
/**
* It should be possible to cast uv_buf_t[] to WSABUF[]
@ -236,7 +257,9 @@ RB_HEAD(uv_timer_tree_s, uv_timer_s);
struct sockaddr_storage recv_from; \
int recv_from_len; \
uv_udp_recv_cb recv_cb; \
uv_alloc_cb alloc_cb;
uv_alloc_cb alloc_cb; \
LPFN_WSARECV func_wsarecv; \
LPFN_WSARECVFROM func_wsarecvfrom;
#define uv_pipe_server_fields \
uv_pipe_accept_t accept_reqs[4]; \

View File

@ -1061,7 +1061,7 @@ int uv_fs_lstat(uv_loop_t* loop, uv_fs_t* req, const char* path, uv_fs_cb cb);
int uv_fs_link(uv_loop_t* loop, uv_fs_t* req, const char* path,
const char* new_path, uv_fs_cb cb);
/*
/*
* This flag can be used with uv_fs_symlink on Windows
* to specify whether path argument points to a directory.
*/

View File

@ -307,14 +307,40 @@ uv_err_code uv_translate_sys_error(int sys_errno);
/*
* Initialization for the windows and winsock api
* Winapi and ntapi utility functions
*/
void uv_winapi_init();
/*
* Winsock utility functions
*/
void uv_winsock_init();
int uv_ntstatus_to_winsock_error(NTSTATUS status);
BOOL uv_get_acceptex_function(SOCKET socket, LPFN_ACCEPTEX* target);
BOOL uv_get_connectex_function(SOCKET socket, LPFN_CONNECTEX* target);
/* Threads and synchronization */
/* Replacement for WSARecv; the bug it works around is described at the */
/* function definition. */
int WSAAPI uv_wsarecv_workaround(
    SOCKET socket,
    WSABUF* buffers,
    DWORD buffer_count,
    DWORD* bytes,
    DWORD* flags,
    WSAOVERLAPPED* overlapped,
    LPWSAOVERLAPPED_COMPLETION_ROUTINE completion_routine);
/* Replacement for WSARecvFrom; the bug it works around is described at the */
/* function definition. */
int WSAAPI uv_wsarecvfrom_workaround(
    SOCKET socket,
    WSABUF* buffers,
    DWORD buffer_count,
    DWORD* bytes,
    DWORD* flags,
    struct sockaddr* addr,
    int* addr_len,
    WSAOVERLAPPED* overlapped,
    LPWSAOVERLAPPED_COMPLETION_ROUTINE completion_routine);
/* Whether ipv6 is supported */
extern int uv_allow_ipv6;
/* Ip address used to bind to any port at any interface */
extern struct sockaddr_in uv_addr_ip4_any_;
extern struct sockaddr_in6 uv_addr_ip6_any_;
/*
* Threads and synchronization
*/
typedef struct uv_once_s {
unsigned char ran;
/* The actual event handle must be aligned to sizeof(HANDLE), so in */

View File

@ -24,9 +24,8 @@
#include "uv.h"
#include "../uv-common.h"
#include "internal.h"
#include <stdio.h>
#if 0
/*
* Threshold of active udp streams for which to preallocate udp read buffers.
*/
@ -34,7 +33,6 @@ const unsigned int uv_active_udp_streams_threshold = 0;
/* A zero-size buffer for use by uv_udp_read */
static char uv_zero_[] = "";
#endif
/* Counter to keep track of active udp streams */
static unsigned int active_udp_streams = 0;
@ -63,6 +61,8 @@ int uv_udp_getsockname(uv_udp_t* handle, struct sockaddr* name,
static int uv_udp_set_socket(uv_loop_t* loop, uv_udp_t* handle,
SOCKET socket) {
DWORD yes = 1;
WSAPROTOCOL_INFOW info;
int opt_len;
assert(handle->socket == INVALID_SOCKET);
@ -89,14 +89,33 @@ static int uv_udp_set_socket(uv_loop_t* loop, uv_udp_t* handle,
}
if (pSetFileCompletionNotificationModes) {
if (pSetFileCompletionNotificationModes((HANDLE)socket,
FILE_SKIP_SET_EVENT_ON_HANDLE |
FILE_SKIP_COMPLETION_PORT_ON_SUCCESS)) {
handle->flags |= UV_HANDLE_SYNC_BYPASS_IOCP;
} else if (GetLastError() != ERROR_INVALID_FUNCTION) {
/* All known Windows versions that support SetFileCompletionNotificationModes */
/* have a bug that makes it impossible to use this function in */
/* conjunction with datagram sockets. We can work around that but only */
/* if the user is using the default UDP driver (AFD) and has no other */
/* LSPs stacked on top. Here we check whether that is the case. */
opt_len = (int) sizeof info;
/* getsockopt() returns SOCKET_ERROR on failure. The result must be compared */
/* directly: negating it first yields 0 or 1, which never equals */
/* SOCKET_ERROR, so a failure would go unnoticed and `info` would be read */
/* uninitialized below. */
if (getsockopt(socket,
               SOL_SOCKET,
               SO_PROTOCOL_INFOW,
               (char*) &info,
               &opt_len) == SOCKET_ERROR) {
  uv__set_sys_error(loop, WSAGetLastError());
  return -1;
}
if (info.ProtocolChain.ChainLen == 1) {
if (pSetFileCompletionNotificationModes((HANDLE)socket,
FILE_SKIP_SET_EVENT_ON_HANDLE |
FILE_SKIP_COMPLETION_PORT_ON_SUCCESS)) {
handle->flags |= UV_HANDLE_SYNC_BYPASS_IOCP;
handle->func_wsarecv = uv_wsarecv_workaround;
handle->func_wsarecvfrom = uv_wsarecvfrom_workaround;
} else if (GetLastError() != ERROR_INVALID_FUNCTION) {
uv__set_sys_error(loop, GetLastError());
return -1;
}
}
}
handle->socket = socket;
@ -111,6 +130,8 @@ int uv_udp_init(uv_loop_t* loop, uv_udp_t* handle) {
handle->reqs_pending = 0;
handle->loop = loop;
handle->flags = 0;
handle->func_wsarecv = WSARecv;
handle->func_wsarecvfrom = WSARecvFrom;
uv_req_init(loop, (uv_req_t*) &(handle->recv_req));
handle->recv_req.type = UV_UDP_RECV;
@ -248,10 +269,9 @@ static void uv_udp_queue_recv(uv_loop_t* loop, uv_udp_t* handle) {
* Preallocate a read buffer if the number of active streams is below
* the threshold.
*/
#if 0
if (active_udp_streams < uv_active_udp_streams_threshold) {
handle->flags &= ~UV_HANDLE_ZERO_READ;
#endif
handle->recv_buffer = handle->alloc_cb((uv_handle_t*) handle, 65536);
assert(handle->recv_buffer.len > 0);
@ -260,15 +280,15 @@ static void uv_udp_queue_recv(uv_loop_t* loop, uv_udp_t* handle) {
handle->recv_from_len = sizeof handle->recv_from;
flags = 0;
result = WSARecvFrom(handle->socket,
(WSABUF*) &buf,
1,
&bytes,
&flags,
(struct sockaddr*) &handle->recv_from,
&handle->recv_from_len,
&req->overlapped,
NULL);
result = handle->func_wsarecvfrom(handle->socket,
(WSABUF*) &buf,
1,
&bytes,
&flags,
(struct sockaddr*) &handle->recv_from,
&handle->recv_from_len,
&req->overlapped,
NULL);
if (UV_SUCCEEDED_WITHOUT_IOCP(result == 0)) {
/* Process the req without IOCP. */
@ -286,21 +306,21 @@ static void uv_udp_queue_recv(uv_loop_t* loop, uv_udp_t* handle) {
uv_insert_pending_req(loop, req);
handle->reqs_pending++;
}
#if 0
} else {
handle->flags |= UV_HANDLE_ZERO_READ;
buf.base = (char*) uv_zero_;
buf.len = 0;
flags = MSG_PARTIAL;
flags = MSG_PEEK;
result = WSARecv(handle->socket,
(WSABUF*) &buf,
1,
&bytes,
&flags,
&req->overlapped,
NULL);
result = handle->func_wsarecv(handle->socket,
(WSABUF*) &buf,
1,
&bytes,
&flags,
&req->overlapped,
NULL);
if (UV_SUCCEEDED_WITHOUT_IOCP(result == 0)) {
/* Process the req without IOCP. */
@ -319,7 +339,6 @@ static void uv_udp_queue_recv(uv_loop_t* loop, uv_udp_t* handle) {
handle->reqs_pending++;
}
}
#endif
}
@ -448,34 +467,27 @@ void uv_process_udp_recv_req(uv_loop_t* loop, uv_udp_t* handle,
handle->flags &= ~UV_HANDLE_READ_PENDING;
if (!REQ_SUCCESS(req) &&
GET_REQ_STATUS(req) != STATUS_RECEIVE_EXPEDITED) {
GET_REQ_SOCK_ERROR(req) != WSAEMSGSIZE) {
/* An error occurred doing the read. */
if ((handle->flags & UV_HANDLE_READING)) {
uv__set_sys_error(loop, GET_REQ_SOCK_ERROR(req));
if (handle->flags & UV_HANDLE_READING) {
uv__set_sys_error(loop, GET_REQ_SOCK_ERROR(req));
uv_udp_recv_stop(handle);
#if 0
buf = (handle->flags & UV_HANDLE_ZERO_READ) ?
uv_buf_init(NULL, 0) : handle->recv_buffer;
#else
buf = handle->recv_buffer;
#endif
handle->recv_cb(handle, -1, buf, NULL, 0);
}
goto done;
}
#if 0
if (!(handle->flags & UV_HANDLE_ZERO_READ)) {
#endif
/* Successful read */
partial = (GET_REQ_STATUS(req) == STATUS_RECEIVE_EXPEDITED);
partial = !REQ_SUCCESS(req);
handle->recv_cb(handle,
req->overlapped.InternalHigh,
handle->recv_buffer,
(struct sockaddr*) &handle->recv_from,
partial ? UV_UDP_PARTIAL : 0);
#if 0
} else {
} else if (handle->flags & UV_HANDLE_READING) {
DWORD bytes, err, flags;
struct sockaddr_storage from;
int from_len;
@ -487,7 +499,8 @@ void uv_process_udp_recv_req(uv_loop_t* loop, uv_udp_t* handle,
memset(&from, 0, sizeof from);
from_len = sizeof from;
flags = MSG_PARTIAL;
flags = 0;
if (WSARecvFrom(handle->socket,
(WSABUF*)&buf,
@ -500,14 +513,18 @@ void uv_process_udp_recv_req(uv_loop_t* loop, uv_udp_t* handle,
NULL) != SOCKET_ERROR) {
/* Message received */
handle->recv_cb(handle,
bytes,
buf,
(struct sockaddr*) &from,
(flags & MSG_PARTIAL) ? UV_UDP_PARTIAL : 0);
handle->recv_cb(handle, bytes, buf, (struct sockaddr*) &from, 0);
} else {
err = WSAGetLastError();
if (err == WSAEWOULDBLOCK) {
if (err == WSAEMSGSIZE) {
/* Message truncated */
handle->recv_cb(handle,
bytes,
buf,
(struct sockaddr*) &from,
UV_UDP_PARTIAL);
} if (err == WSAEWOULDBLOCK) {
/* Kernel buffer empty */
uv__set_sys_error(loop, WSAEWOULDBLOCK);
handle->recv_cb(handle, 0, buf, NULL, 0);
} else {
@ -517,7 +534,6 @@ void uv_process_udp_recv_req(uv_loop_t* loop, uv_udp_t* handle,
}
}
}
#endif
done:
/* Post another read if still reading and not closing. */

View File

@ -27,6 +27,7 @@
sRtlNtStatusToDosError pRtlNtStatusToDosError;
sNtDeviceIoControlFile pNtDeviceIoControlFile;
sNtQueryInformationFile pNtQueryInformationFile;
sNtSetInformationFile pNtSetInformationFile;
sGetQueuedCompletionStatusEx pGetQueuedCompletionStatusEx;
@ -57,6 +58,13 @@ void uv_winapi_init() {
uv_fatal_error(GetLastError(), "GetProcAddress");
}
pNtDeviceIoControlFile = (sNtDeviceIoControlFile) GetProcAddress(
ntdll_module,
"NtDeviceIoControlFile");
if (pNtDeviceIoControlFile == NULL) {
uv_fatal_error(GetLastError(), "GetProcAddress");
}
pNtSetInformationFile = (sNtSetInformationFile) GetProcAddress(
ntdll_module,
"NtSetInformationFile");

View File

@ -4270,9 +4270,26 @@ typedef enum _FILE_INFORMATION_CLASS {
FILE_SPECIAL_ACCESS)
#endif
typedef VOID (NTAPI *PIO_APC_ROUTINE)
(PVOID ApcContext,
PIO_STATUS_BLOCK IoStatusBlock,
ULONG Reserved);
typedef ULONG (NTAPI *sRtlNtStatusToDosError)
(NTSTATUS Status);
typedef NTSTATUS (NTAPI *sNtDeviceIoControlFile)
(HANDLE FileHandle,
HANDLE Event,
PIO_APC_ROUTINE ApcRoutine,
PVOID ApcContext,
PIO_STATUS_BLOCK IoStatusBlock,
ULONG IoControlCode,
PVOID InputBuffer,
ULONG InputBufferLength,
PVOID OutputBuffer,
ULONG OutputBufferLength);
typedef NTSTATUS (NTAPI *sNtQueryInformationFile)
(HANDLE FileHandle,
PIO_STATUS_BLOCK IoStatusBlock,
@ -4325,6 +4342,7 @@ typedef BOOLEAN (WINAPI* sCreateSymbolicLinkW)
/* Ntapi function pointers */
extern sRtlNtStatusToDosError pRtlNtStatusToDosError;
extern sNtDeviceIoControlFile pNtDeviceIoControlFile;
extern sNtQueryInformationFile pNtQueryInformationFile;
extern sNtSetInformationFile pNtSetInformationFile;

View File

@ -216,3 +216,210 @@ int uv_ntstatus_to_winsock_error(NTSTATUS status) {
}
}
}
/*
 * This function provides a workaround for a bug in the winsock implementation
 * of WSARecv. The problem is that when SetFileCompletionNotificationModes is
 * used to avoid IOCP notifications of completed reads, WSARecv does not
 * reliably indicate whether we can expect a completion packet to be posted
 * when the receive buffer is smaller than the received datagram.
 *
 * However it is desirable to use SetFileCompletionNotificationModes because
 * it yields a massive performance increase.
 *
 * This function provides a workaround for that bug, but it only works for the
 * specific case that we need it for. E.g. it assumes that the "avoid iocp"
 * bit has been set, and supports only overlapped operation. It also requires
 * the user to use the default msafd driver, and doesn't work when other LSPs
 * are stacked on top of it.
 *
 * The parameters mirror those of WSARecv, with these restrictions:
 *   - `overlapped` must not be NULL and `completion_routine` must be NULL,
 *     otherwise the call fails with WSAEINVAL;
 *   - `flags` must not be NULL, otherwise the call fails with WSAEFAULT;
 *   - `bytes` may be NULL, in which case the byte count is not reported.
 *
 * Returns 0 when the receive completed immediately. Otherwise returns
 * SOCKET_ERROR and sets the winsock last-error; WSA_IO_PENDING means the
 * request was queued, WSAEMSGSIZE means the buffer was too small.
 */
int WSAAPI uv_wsarecv_workaround(SOCKET socket, WSABUF* buffers,
    DWORD buffer_count, DWORD* bytes, DWORD* flags, WSAOVERLAPPED *overlapped,
    LPWSAOVERLAPPED_COMPLETION_ROUTINE completion_routine) {
  NTSTATUS status;
  void* apc_context;
  IO_STATUS_BLOCK* iosb;
  AFD_RECV_INFO info;
  DWORD error;

  /* Validate the arguments before any pointer is derived from them. */
  if (overlapped == NULL || completion_routine != NULL) {
    WSASetLastError(WSAEINVAL);
    return SOCKET_ERROR;
  }

  if (flags == NULL) {
    WSASetLastError(WSAEFAULT);
    return SOCKET_ERROR;
  }

  /* The leading fields of the OVERLAPPED double as the I/O status block. */
  iosb = (IO_STATUS_BLOCK*) &overlapped->Internal;

  info.BufferArray = buffers;
  info.BufferCount = buffer_count;
  info.AfdFlags = AFD_OVERLAPPED;
  info.TdiFlags = TDI_RECEIVE_NORMAL;

  /* Translate the winsock receive flags to their TDI counterparts. */
  if (*flags & MSG_PEEK) {
    info.TdiFlags |= TDI_RECEIVE_PEEK;
  }

  if (*flags & MSG_PARTIAL) {
    info.TdiFlags |= TDI_RECEIVE_PARTIAL;
  }

  /* Pass the overlapped struct as the APC context unless the low-order bit */
  /* of hEvent is set (presumably the documented way for a caller to */
  /* suppress the completion port notification). */
  if (!((intptr_t) overlapped->hEvent & 1)) {
    apc_context = (void*) overlapped;
  } else {
    apc_context = NULL;
  }

  iosb->Status = STATUS_PENDING;
  iosb->Pointer = 0;

  status = pNtDeviceIoControlFile((HANDLE) socket,
                                  overlapped->hEvent,
                                  NULL,
                                  apc_context,
                                  iosb,
                                  IOCTL_AFD_RECEIVE,
                                  &info,
                                  sizeof(info),
                                  NULL,
                                  0);

  *flags = 0;
  if (bytes != NULL) {
    *bytes = (DWORD) iosb->Information;
  }

  /* Map the NT status to a winsock error code and output flags. */
  switch (status) {
    case STATUS_SUCCESS:
      error = ERROR_SUCCESS;
      break;

    case STATUS_PENDING:
      error = WSA_IO_PENDING;
      break;

    case STATUS_BUFFER_OVERFLOW:
      error = WSAEMSGSIZE;
      break;

    case STATUS_RECEIVE_EXPEDITED:
      error = ERROR_SUCCESS;
      *flags = MSG_OOB;
      break;

    case STATUS_RECEIVE_PARTIAL_EXPEDITED:
      error = ERROR_SUCCESS;
      *flags = MSG_PARTIAL | MSG_OOB;
      break;

    case STATUS_RECEIVE_PARTIAL:
      error = ERROR_SUCCESS;
      *flags = MSG_PARTIAL;
      break;

    default:
      error = uv_ntstatus_to_winsock_error(status);
      break;
  }

  WSASetLastError(error);

  if (error == ERROR_SUCCESS) {
    return 0;
  } else {
    return SOCKET_ERROR;
  }
}
/*
 * See description of uv_wsarecv_workaround.
 *
 * The parameters mirror those of WSARecvFrom, with these restrictions:
 *   - `overlapped`, `addr` and `addr_len` must not be NULL and
 *     `completion_routine` must be NULL, otherwise the call fails with
 *     WSAEINVAL;
 *   - `flags` must not be NULL, otherwise the call fails with WSAEFAULT;
 *   - `bytes` may be NULL, in which case the byte count is not reported.
 *
 * Returns 0 when the receive completed immediately. Otherwise returns
 * SOCKET_ERROR and sets the winsock last-error; WSA_IO_PENDING means the
 * request was queued, WSAEMSGSIZE means the buffer was too small.
 */
int WSAAPI uv_wsarecvfrom_workaround(SOCKET socket, WSABUF* buffers,
    DWORD buffer_count, DWORD* bytes, DWORD* flags, struct sockaddr* addr,
    int* addr_len, WSAOVERLAPPED *overlapped,
    LPWSAOVERLAPPED_COMPLETION_ROUTINE completion_routine) {
  NTSTATUS status;
  void* apc_context;
  IO_STATUS_BLOCK* iosb;
  AFD_RECV_DATAGRAM_INFO info;
  DWORD error;

  /* Validate the arguments before any pointer is derived from them. */
  if (overlapped == NULL || addr == NULL || addr_len == NULL ||
      completion_routine != NULL) {
    WSASetLastError(WSAEINVAL);
    return SOCKET_ERROR;
  }

  if (flags == NULL) {
    WSASetLastError(WSAEFAULT);
    return SOCKET_ERROR;
  }

  /* The leading fields of the OVERLAPPED double as the I/O status block. */
  iosb = (IO_STATUS_BLOCK*) &overlapped->Internal;

  info.BufferArray = buffers;
  info.BufferCount = buffer_count;
  info.AfdFlags = AFD_OVERLAPPED;
  info.TdiFlags = TDI_RECEIVE_NORMAL;
  info.Address = addr;
  info.AddressLength = addr_len;

  /* Translate the winsock receive flags to their TDI counterparts. */
  if (*flags & MSG_PEEK) {
    info.TdiFlags |= TDI_RECEIVE_PEEK;
  }

  if (*flags & MSG_PARTIAL) {
    info.TdiFlags |= TDI_RECEIVE_PARTIAL;
  }

  /* Pass the overlapped struct as the APC context unless the low-order bit */
  /* of hEvent is set (presumably the documented way for a caller to */
  /* suppress the completion port notification). */
  if (!((intptr_t) overlapped->hEvent & 1)) {
    apc_context = (void*) overlapped;
  } else {
    apc_context = NULL;
  }

  iosb->Status = STATUS_PENDING;
  iosb->Pointer = 0;

  status = pNtDeviceIoControlFile((HANDLE) socket,
                                  overlapped->hEvent,
                                  NULL,
                                  apc_context,
                                  iosb,
                                  IOCTL_AFD_RECEIVE_DATAGRAM,
                                  &info,
                                  sizeof(info),
                                  NULL,
                                  0);

  *flags = 0;
  if (bytes != NULL) {
    *bytes = (DWORD) iosb->Information;
  }

  /* Map the NT status to a winsock error code and output flags. */
  switch (status) {
    case STATUS_SUCCESS:
      error = ERROR_SUCCESS;
      break;

    case STATUS_PENDING:
      error = WSA_IO_PENDING;
      break;

    case STATUS_BUFFER_OVERFLOW:
      error = WSAEMSGSIZE;
      break;

    case STATUS_RECEIVE_EXPEDITED:
      error = ERROR_SUCCESS;
      *flags = MSG_OOB;
      break;

    case STATUS_RECEIVE_PARTIAL_EXPEDITED:
      error = ERROR_SUCCESS;
      *flags = MSG_PARTIAL | MSG_OOB;
      break;

    case STATUS_RECEIVE_PARTIAL:
      error = ERROR_SUCCESS;
      *flags = MSG_PARTIAL;
      break;

    default:
      error = uv_ntstatus_to_winsock_error(status);
      break;
  }

  WSASetLastError(error);

  if (error == ERROR_SUCCESS) {
    return 0;
  } else {
    return SOCKET_ERROR;
  }
}

View File

@ -39,14 +39,66 @@
#define IPV6_V6ONLY 27
#endif
/* Whether ipv6 is supported */
extern int uv_allow_ipv6;
/*
* TDI defines that are only in the DDK.
* We only need receive flags so far.
*/
#ifndef TDI_RECEIVE_NORMAL
#define TDI_RECEIVE_BROADCAST 0x00000004
#define TDI_RECEIVE_MULTICAST 0x00000008
#define TDI_RECEIVE_PARTIAL 0x00000010
#define TDI_RECEIVE_NORMAL 0x00000020
#define TDI_RECEIVE_EXPEDITED 0x00000040
#define TDI_RECEIVE_PEEK 0x00000080
#define TDI_RECEIVE_NO_RESPONSE_EXP 0x00000100
#define TDI_RECEIVE_COPY_LOOKAHEAD 0x00000200
#define TDI_RECEIVE_ENTIRE_MESSAGE 0x00000400
#define TDI_RECEIVE_AT_DISPATCH_LEVEL 0x00000800
#define TDI_RECEIVE_CONTROL_INFO 0x00001000
#define TDI_RECEIVE_FORCE_INDICATION 0x00002000
#define TDI_RECEIVE_NO_PUSH 0x00004000
#endif
BOOL uv_get_acceptex_function(SOCKET socket, LPFN_ACCEPTEX* target);
BOOL uv_get_connectex_function(SOCKET socket, LPFN_CONNECTEX* target);
/*
* The "Ancillary Function Driver" is the Windows kernel-mode driver that does
* TCP, UDP etc. Winsock is just a layer that dispatches requests to it.
* Having these definitions allows us to bypass winsock and make an AFD kernel
* call directly, avoiding a bug in winsock's recvfrom implementation.
*/
/* Ip address used to bind to any port at any interface */
extern struct sockaddr_in uv_addr_ip4_any_;
extern struct sockaddr_in6 uv_addr_ip6_any_;
#define AFD_NO_FAST_IO 0x00000001
#define AFD_OVERLAPPED 0x00000002
#define AFD_IMMEDIATE 0x00000004
/* Input structure passed with IOCTL_AFD_RECEIVE_DATAGRAM. */
/* NOTE(review): the field order is presumably dictated by the AFD driver; */
/* do not reorder without confirming. */
typedef struct _AFD_RECV_DATAGRAM_INFO {
/* Array of receive buffers and the number of entries in it. */
LPWSABUF BufferArray;
ULONG BufferCount;
/* AFD_* flags, e.g. AFD_OVERLAPPED. */
ULONG AfdFlags;
/* TDI_RECEIVE_* flags. */
ULONG TdiFlags;
/* Caller-supplied storage for the sender address and its length. */
struct sockaddr* Address;
int* AddressLength;
} AFD_RECV_DATAGRAM_INFO, *PAFD_RECV_DATAGRAM_INFO;
/* Input structure passed with IOCTL_AFD_RECEIVE. */
/* NOTE(review): the field order is presumably dictated by the AFD driver; */
/* do not reorder without confirming. */
typedef struct _AFD_RECV_INFO {
/* Array of receive buffers and the number of entries in it. */
LPWSABUF BufferArray;
ULONG BufferCount;
/* AFD_* flags, e.g. AFD_OVERLAPPED. */
ULONG AfdFlags;
/* TDI_RECEIVE_* flags. */
ULONG TdiFlags;
} AFD_RECV_INFO, *PAFD_RECV_INFO;
/*
 * Builds an AFD I/O control code from an operation number and a transfer
 * method. Every argument is parenthesized so that expression arguments
 * (e.g. `a | b`) expand with the intended precedence.
 */
#define _AFD_CONTROL_CODE(operation, method) \
  (((FSCTL_AFD_BASE) << 12) | ((operation) << 2) | (method))

#define FSCTL_AFD_BASE FILE_DEVICE_NETWORK

/* AFD operation numbers. */
#define AFD_RECEIVE 5
#define AFD_RECEIVE_DATAGRAM 6

/* Control codes used to issue receive requests directly to AFD. */
#define IOCTL_AFD_RECEIVE \
  _AFD_CONTROL_CODE(AFD_RECEIVE, METHOD_NEITHER)
#define IOCTL_AFD_RECEIVE_DATAGRAM \
  _AFD_CONTROL_CODE(AFD_RECEIVE_DATAGRAM, METHOD_NEITHER)
#endif /* UV_WIN_WINSOCK_H_ */