diff --git a/include/uv-private/uv-win.h b/include/uv-private/uv-win.h index 81693ea8..86a05115 100644 --- a/include/uv-private/uv-win.h +++ b/include/uv-private/uv-win.h @@ -104,6 +104,27 @@ DWORD dwFlags); #endif +typedef int (WSAAPI* LPFN_WSARECV) + (SOCKET socket, + LPWSABUF buffers, + DWORD buffer_count, + LPDWORD bytes, + LPDWORD flags, + LPWSAOVERLAPPED overlapped, + LPWSAOVERLAPPED_COMPLETION_ROUTINE + completion_routine); + +typedef int (WSAAPI* LPFN_WSARECVFROM) + (SOCKET socket, + LPWSABUF buffers, + DWORD buffer_count, + LPDWORD bytes, + LPDWORD flags, + struct sockaddr* addr, + LPINT addr_len, + LPWSAOVERLAPPED overlapped, + LPWSAOVERLAPPED_COMPLETION_ROUTINE completion_routine); + /** * It should be possible to cast uv_buf_t[] to WSABUF[] @@ -236,7 +257,9 @@ RB_HEAD(uv_timer_tree_s, uv_timer_s); struct sockaddr_storage recv_from; \ int recv_from_len; \ uv_udp_recv_cb recv_cb; \ - uv_alloc_cb alloc_cb; + uv_alloc_cb alloc_cb; \ + LPFN_WSARECV func_wsarecv; \ + LPFN_WSARECVFROM func_wsarecvfrom; #define uv_pipe_server_fields \ uv_pipe_accept_t accept_reqs[4]; \ diff --git a/include/uv.h b/include/uv.h index 05147c5d..3428f56e 100644 --- a/include/uv.h +++ b/include/uv.h @@ -1061,7 +1061,7 @@ int uv_fs_lstat(uv_loop_t* loop, uv_fs_t* req, const char* path, uv_fs_cb cb); int uv_fs_link(uv_loop_t* loop, uv_fs_t* req, const char* path, const char* new_path, uv_fs_cb cb); -/* +/* * This flag can be used with uv_fs_symlink on Windows * to specify whether path argument points to a directory. 
*/ diff --git a/src/win/internal.h b/src/win/internal.h index 4bb36ad8..1f8eb837 100644 --- a/src/win/internal.h +++ b/src/win/internal.h @@ -307,14 +307,40 @@ uv_err_code uv_translate_sys_error(int sys_errno); /* - * Initialization for the windows and winsock api + * Winapi and ntapi utility functions */ void uv_winapi_init(); + + +/* + * Winsock utility functions + */ void uv_winsock_init(); + int uv_ntstatus_to_winsock_error(NTSTATUS status); +BOOL uv_get_acceptex_function(SOCKET socket, LPFN_ACCEPTEX* target); +BOOL uv_get_connectex_function(SOCKET socket, LPFN_CONNECTEX* target); -/* Threads and synchronization */ +int WSAAPI uv_wsarecv_workaround(SOCKET socket, WSABUF* buffers, + DWORD buffer_count, DWORD* bytes, DWORD* flags, WSAOVERLAPPED *overlapped, + LPWSAOVERLAPPED_COMPLETION_ROUTINE completion_routine); +int WSAAPI uv_wsarecvfrom_workaround(SOCKET socket, WSABUF* buffers, + DWORD buffer_count, DWORD* bytes, DWORD* flags, struct sockaddr* addr, + int* addr_len, WSAOVERLAPPED *overlapped, + LPWSAOVERLAPPED_COMPLETION_ROUTINE completion_routine); + +/* Whether ipv6 is supported */ +extern int uv_allow_ipv6; + +/* Ip address used to bind to any port at any interface */ +extern struct sockaddr_in uv_addr_ip4_any_; +extern struct sockaddr_in6 uv_addr_ip6_any_; + + +/* + * Threads and synchronization + */ typedef struct uv_once_s { unsigned char ran; /* The actual event handle must be aligned to sizeof(HANDLE), so in */ diff --git a/src/win/udp.c b/src/win/udp.c index cba80e5b..07082ddb 100644 --- a/src/win/udp.c +++ b/src/win/udp.c @@ -24,9 +24,8 @@ #include "uv.h" #include "../uv-common.h" #include "internal.h" -#include -#if 0 + /* * Threshold of active udp streams for which to preallocate udp read buffers. 
*/ @@ -34,7 +33,6 @@ const unsigned int uv_active_udp_streams_threshold = 0; /* A zero-size buffer for use by uv_udp_read */ static char uv_zero_[] = ""; -#endif /* Counter to keep track of active udp streams */ static unsigned int active_udp_streams = 0; @@ -63,6 +61,8 @@ int uv_udp_getsockname(uv_udp_t* handle, struct sockaddr* name, static int uv_udp_set_socket(uv_loop_t* loop, uv_udp_t* handle, SOCKET socket) { DWORD yes = 1; + WSAPROTOCOL_INFOW info; + int opt_len; assert(handle->socket == INVALID_SOCKET); @@ -89,14 +89,33 @@ static int uv_udp_set_socket(uv_loop_t* loop, uv_udp_t* handle, } if (pSetFileCompletionNotificationModes) { - if (pSetFileCompletionNotificationModes((HANDLE)socket, - FILE_SKIP_SET_EVENT_ON_HANDLE | - FILE_SKIP_COMPLETION_PORT_ON_SUCCESS)) { - handle->flags |= UV_HANDLE_SYNC_BYPASS_IOCP; - } else if (GetLastError() != ERROR_INVALID_FUNCTION) { + /* All known Windows versions that support SetFileCompletionNotificationModes */ + /* have a bug that makes it impossible to use this function in */ + /* conjunction with datagram sockets. We can work around that but only */ + /* if the user is using the default UDP driver (AFD) and has no other */ + /* LSPs stacked on top. Here we check whether that is the case. 
*/ + opt_len = (int) sizeof info; + if (getsockopt(socket, + SOL_SOCKET, + SO_PROTOCOL_INFOW, + (char*) &info, + &opt_len) == SOCKET_ERROR) { uv__set_sys_error(loop, GetLastError()); return -1; } + + if (info.ProtocolChain.ChainLen == 1) { /* base provider, no LSP chain */ + if (pSetFileCompletionNotificationModes((HANDLE)socket, + FILE_SKIP_SET_EVENT_ON_HANDLE | + FILE_SKIP_COMPLETION_PORT_ON_SUCCESS)) { + handle->flags |= UV_HANDLE_SYNC_BYPASS_IOCP; + handle->func_wsarecv = uv_wsarecv_workaround; + handle->func_wsarecvfrom = uv_wsarecvfrom_workaround; + } else if (GetLastError() != ERROR_INVALID_FUNCTION) { + uv__set_sys_error(loop, GetLastError()); + return -1; + } + } } handle->socket = socket; @@ -111,6 +130,8 @@ int uv_udp_init(uv_loop_t* loop, uv_udp_t* handle) { handle->reqs_pending = 0; handle->loop = loop; handle->flags = 0; + handle->func_wsarecv = WSARecv; + handle->func_wsarecvfrom = WSARecvFrom; uv_req_init(loop, (uv_req_t*) &(handle->recv_req)); handle->recv_req.type = UV_UDP_RECV; @@ -248,10 +269,9 @@ static void uv_udp_queue_recv(uv_loop_t* loop, uv_udp_t* handle) { * Preallocate a read buffer if the number of active streams is below * the threshold. */ -#if 0 if (active_udp_streams < uv_active_udp_streams_threshold) { handle->flags &= ~UV_HANDLE_ZERO_READ; -#endif + handle->recv_buffer = handle->alloc_cb((uv_handle_t*) handle, 65536); assert(handle->recv_buffer.len > 0); @@ -260,15 +280,15 @@ static void uv_udp_queue_recv(uv_loop_t* loop, uv_udp_t* handle) { handle->recv_from_len = sizeof handle->recv_from; flags = 0; - result = WSARecvFrom(handle->socket, - (WSABUF*) &buf, - 1, - &bytes, - &flags, - (struct sockaddr*) &handle->recv_from, - &handle->recv_from_len, - &req->overlapped, - NULL); + result = handle->func_wsarecvfrom(handle->socket, + (WSABUF*) &buf, + 1, + &bytes, + &flags, + (struct sockaddr*) &handle->recv_from, + &handle->recv_from_len, + &req->overlapped, + NULL); if (UV_SUCCEEDED_WITHOUT_IOCP(result == 0)) { /* Process the req without IOCP. 
*/ @@ -286,21 +306,21 @@ static void uv_udp_queue_recv(uv_loop_t* loop, uv_udp_t* handle) { uv_insert_pending_req(loop, req); handle->reqs_pending++; } -#if 0 + } else { handle->flags |= UV_HANDLE_ZERO_READ; buf.base = (char*) uv_zero_; buf.len = 0; - flags = MSG_PARTIAL; + flags = MSG_PEEK; - result = WSARecv(handle->socket, - (WSABUF*) &buf, - 1, - &bytes, - &flags, - &req->overlapped, - NULL); + result = handle->func_wsarecv(handle->socket, + (WSABUF*) &buf, + 1, + &bytes, + &flags, + &req->overlapped, + NULL); if (UV_SUCCEEDED_WITHOUT_IOCP(result == 0)) { /* Process the req without IOCP. */ @@ -319,7 +339,6 @@ static void uv_udp_queue_recv(uv_loop_t* loop, uv_udp_t* handle) { handle->reqs_pending++; } } -#endif } @@ -448,34 +467,27 @@ void uv_process_udp_recv_req(uv_loop_t* loop, uv_udp_t* handle, handle->flags &= ~UV_HANDLE_READ_PENDING; if (!REQ_SUCCESS(req) && - GET_REQ_STATUS(req) != STATUS_RECEIVE_EXPEDITED) { + GET_REQ_SOCK_ERROR(req) != WSAEMSGSIZE) { /* An error occurred doing the read. */ - if ((handle->flags & UV_HANDLE_READING)) { - uv__set_sys_error(loop, GET_REQ_SOCK_ERROR(req)); + if (handle->flags & UV_HANDLE_READING) { + uv__set_sys_error(loop, GET_REQ_SOCK_ERROR(req)); uv_udp_recv_stop(handle); -#if 0 buf = (handle->flags & UV_HANDLE_ZERO_READ) ? uv_buf_init(NULL, 0) : handle->recv_buffer; -#else - buf = handle->recv_buffer; -#endif handle->recv_cb(handle, -1, buf, NULL, 0); } goto done; } -#if 0 if (!(handle->flags & UV_HANDLE_ZERO_READ)) { -#endif /* Successful read */ - partial = (GET_REQ_STATUS(req) == STATUS_RECEIVE_EXPEDITED); + partial = !REQ_SUCCESS(req); handle->recv_cb(handle, req->overlapped.InternalHigh, handle->recv_buffer, (struct sockaddr*) &handle->recv_from, partial ? 
UV_UDP_PARTIAL : 0); -#if 0 - } else { + } else if (handle->flags & UV_HANDLE_READING) { DWORD bytes, err, flags; struct sockaddr_storage from; int from_len; @@ -487,7 +499,8 @@ void uv_process_udp_recv_req(uv_loop_t* loop, uv_udp_t* handle, memset(&from, 0, sizeof from); from_len = sizeof from; - flags = MSG_PARTIAL; + + flags = 0; if (WSARecvFrom(handle->socket, (WSABUF*)&buf, @@ -500,14 +513,18 @@ void uv_process_udp_recv_req(uv_loop_t* loop, uv_udp_t* handle, NULL) != SOCKET_ERROR) { /* Message received */ - handle->recv_cb(handle, - bytes, - buf, - (struct sockaddr*) &from, - (flags & MSG_PARTIAL) ? UV_UDP_PARTIAL : 0); + handle->recv_cb(handle, bytes, buf, (struct sockaddr*) &from, 0); } else { err = WSAGetLastError(); - if (err == WSAEWOULDBLOCK) { + if (err == WSAEMSGSIZE) { + /* Message truncated */ + handle->recv_cb(handle, + bytes, + buf, + (struct sockaddr*) &from, + UV_UDP_PARTIAL); + } else if (err == WSAEWOULDBLOCK) { + /* Kernel buffer empty */ uv__set_sys_error(loop, WSAEWOULDBLOCK); handle->recv_cb(handle, 0, buf, NULL, 0); } else { @@ -517,7 +534,6 @@ void uv_process_udp_recv_req(uv_loop_t* loop, uv_udp_t* handle, } } } -#endif done: /* Post another read if still reading and not closing. 
*/ diff --git a/src/win/winapi.c b/src/win/winapi.c index 4f8597cc..cc21361b 100644 --- a/src/win/winapi.c +++ b/src/win/winapi.c @@ -27,6 +27,7 @@ sRtlNtStatusToDosError pRtlNtStatusToDosError; +sNtDeviceIoControlFile pNtDeviceIoControlFile; sNtQueryInformationFile pNtQueryInformationFile; sNtSetInformationFile pNtSetInformationFile; sGetQueuedCompletionStatusEx pGetQueuedCompletionStatusEx; @@ -57,6 +58,13 @@ void uv_winapi_init() { uv_fatal_error(GetLastError(), "GetProcAddress"); } + pNtDeviceIoControlFile = (sNtDeviceIoControlFile) GetProcAddress( + ntdll_module, + "NtDeviceIoControlFile"); + if (pNtDeviceIoControlFile == NULL) { + uv_fatal_error(GetLastError(), "GetProcAddress"); + } + pNtSetInformationFile = (sNtSetInformationFile) GetProcAddress( ntdll_module, "NtSetInformationFile"); diff --git a/src/win/winapi.h b/src/win/winapi.h index 78ffe165..8434363e 100644 --- a/src/win/winapi.h +++ b/src/win/winapi.h @@ -4270,9 +4270,26 @@ typedef enum _FILE_INFORMATION_CLASS { FILE_SPECIAL_ACCESS) #endif +typedef VOID (NTAPI *PIO_APC_ROUTINE) + (PVOID ApcContext, + PIO_STATUS_BLOCK IoStatusBlock, + ULONG Reserved); + typedef ULONG (NTAPI *sRtlNtStatusToDosError) (NTSTATUS Status); +typedef NTSTATUS (NTAPI *sNtDeviceIoControlFile) + (HANDLE FileHandle, + HANDLE Event, + PIO_APC_ROUTINE ApcRoutine, + PVOID ApcContext, + PIO_STATUS_BLOCK IoStatusBlock, + ULONG IoControlCode, + PVOID InputBuffer, + ULONG InputBufferLength, + PVOID OutputBuffer, + ULONG OutputBufferLength); + typedef NTSTATUS (NTAPI *sNtQueryInformationFile) (HANDLE FileHandle, PIO_STATUS_BLOCK IoStatusBlock, @@ -4325,6 +4342,7 @@ typedef BOOLEAN (WINAPI* sCreateSymbolicLinkW) /* Ntapi function pointers */ extern sRtlNtStatusToDosError pRtlNtStatusToDosError; +extern sNtDeviceIoControlFile pNtDeviceIoControlFile; extern sNtQueryInformationFile pNtQueryInformationFile; extern sNtSetInformationFile pNtSetInformationFile; diff --git a/src/win/winsock.c b/src/win/winsock.c index e37a60a9..5309f1ee 100644 
--- a/src/win/winsock.c +++ b/src/win/winsock.c @@ -216,3 +216,210 @@ int uv_ntstatus_to_winsock_error(NTSTATUS status) { } } } + + +/* + * This function provides a workaround for a bug in the winsock implementation + * of WSARecv. The problem is that when SetFileCompletionNotificationModes is + * used to avoid IOCP notifications of completed reads, WSARecv does not + * reliably indicate whether we can expect a completion packet to be posted + * when the receive buffer is smaller than the received datagram. + * + * However it is desirable to use SetFileCompletionNotificationModes because + * it yields a massive performance increase. + * + * This function provides a workaround for that bug, but it only works for the + * specific case that we need it for. E.g. it assumes that the "avoid iocp" + * bit has been set, and supports only overlapped operation. It also requires + * the user to use the default msafd driver; it does not work when other LSPs + * are stacked on top of it. + */ +int WSAAPI uv_wsarecv_workaround(SOCKET socket, WSABUF* buffers, + DWORD buffer_count, DWORD* bytes, DWORD* flags, WSAOVERLAPPED *overlapped, + LPWSAOVERLAPPED_COMPLETION_ROUTINE completion_routine) { + NTSTATUS status; + void* apc_context; + IO_STATUS_BLOCK* iosb = (IO_STATUS_BLOCK*) &overlapped->Internal; + AFD_RECV_INFO info; + DWORD error; + + if (overlapped == NULL || completion_routine != NULL) { + WSASetLastError(WSAEINVAL); + return SOCKET_ERROR; + } + + info.BufferArray = buffers; + info.BufferCount = buffer_count; + info.AfdFlags = AFD_OVERLAPPED; + info.TdiFlags = TDI_RECEIVE_NORMAL; + + if (*flags & MSG_PEEK) { + info.TdiFlags |= TDI_RECEIVE_PEEK; + } + + if (*flags & MSG_PARTIAL) { + info.TdiFlags |= TDI_RECEIVE_PARTIAL; + } + + if (!((intptr_t) overlapped->hEvent & 1)) { + apc_context = (void*) overlapped; + } else { + apc_context = NULL; + } + + iosb->Status = STATUS_PENDING; + iosb->Pointer = 0; + + status = pNtDeviceIoControlFile((HANDLE) socket, + overlapped->hEvent, + 
NULL, + apc_context, + iosb, + IOCTL_AFD_RECEIVE, + &info, + sizeof(info), + NULL, + 0); + + *flags = 0; + *bytes = (DWORD) iosb->Information; + + switch (status) { + case STATUS_SUCCESS: + error = ERROR_SUCCESS; + break; + + case STATUS_PENDING: + error = WSA_IO_PENDING; + break; + + case STATUS_BUFFER_OVERFLOW: + error = WSAEMSGSIZE; + break; + + case STATUS_RECEIVE_EXPEDITED: + error = ERROR_SUCCESS; + *flags = MSG_OOB; + break; + + case STATUS_RECEIVE_PARTIAL_EXPEDITED: + error = ERROR_SUCCESS; + *flags = MSG_PARTIAL | MSG_OOB; + break; + + case STATUS_RECEIVE_PARTIAL: + error = ERROR_SUCCESS; + *flags = MSG_PARTIAL; + break; + + default: + error = uv_ntstatus_to_winsock_error(status); + break; + } + + WSASetLastError(error); + + if (error == ERROR_SUCCESS) { + return 0; + } else { + return SOCKET_ERROR; + } +} + + +/* See description of uv_wsarecv_workaround. */ +int WSAAPI uv_wsarecvfrom_workaround(SOCKET socket, WSABUF* buffers, + DWORD buffer_count, DWORD* bytes, DWORD* flags, struct sockaddr* addr, + int* addr_len, WSAOVERLAPPED *overlapped, + LPWSAOVERLAPPED_COMPLETION_ROUTINE completion_routine) { + NTSTATUS status; + void* apc_context; + IO_STATUS_BLOCK* iosb = (IO_STATUS_BLOCK*) &overlapped->Internal; + AFD_RECV_DATAGRAM_INFO info; + DWORD error; + + if (overlapped == NULL || addr == NULL || addr_len == NULL || + completion_routine != NULL) { + WSASetLastError(WSAEINVAL); + return SOCKET_ERROR; + } + + info.BufferArray = buffers; + info.BufferCount = buffer_count; + info.AfdFlags = AFD_OVERLAPPED; + info.TdiFlags = TDI_RECEIVE_NORMAL; + info.Address = addr; + info.AddressLength = addr_len; + + if (*flags & MSG_PEEK) { + info.TdiFlags |= TDI_RECEIVE_PEEK; + } + + if (*flags & MSG_PARTIAL) { + info.TdiFlags |= TDI_RECEIVE_PARTIAL; + } + + if (!((intptr_t) overlapped->hEvent & 1)) { + apc_context = (void*) overlapped; + } else { + apc_context = NULL; + } + + iosb->Status = STATUS_PENDING; + iosb->Pointer = 0; + + status = pNtDeviceIoControlFile((HANDLE) 
socket, + overlapped->hEvent, + NULL, + apc_context, + iosb, + IOCTL_AFD_RECEIVE_DATAGRAM, + &info, + sizeof(info), + NULL, + 0); + + *flags = 0; + *bytes = (DWORD) iosb->Information; + + switch (status) { + case STATUS_SUCCESS: + error = ERROR_SUCCESS; + break; + + case STATUS_PENDING: + error = WSA_IO_PENDING; + break; + + case STATUS_BUFFER_OVERFLOW: + error = WSAEMSGSIZE; + break; + + case STATUS_RECEIVE_EXPEDITED: + error = ERROR_SUCCESS; + *flags = MSG_OOB; + break; + + case STATUS_RECEIVE_PARTIAL_EXPEDITED: + error = ERROR_SUCCESS; + *flags = MSG_PARTIAL | MSG_OOB; + break; + + case STATUS_RECEIVE_PARTIAL: + error = ERROR_SUCCESS; + *flags = MSG_PARTIAL; + break; + + default: + error = uv_ntstatus_to_winsock_error(status); + break; + } + + WSASetLastError(error); + + if (error == ERROR_SUCCESS) { + return 0; + } else { + return SOCKET_ERROR; + } +} diff --git a/src/win/winsock.h b/src/win/winsock.h index 1927d656..18978cf3 100644 --- a/src/win/winsock.h +++ b/src/win/winsock.h @@ -39,14 +39,66 @@ #define IPV6_V6ONLY 27 #endif -/* Whether ipv6 is supported */ -extern int uv_allow_ipv6; +/* + * TDI defines that are only in the DDK. + * We only need receive flags so far. 
+ */ +#ifndef TDI_RECEIVE_NORMAL + #define TDI_RECEIVE_BROADCAST 0x00000004 + #define TDI_RECEIVE_MULTICAST 0x00000008 + #define TDI_RECEIVE_PARTIAL 0x00000010 + #define TDI_RECEIVE_NORMAL 0x00000020 + #define TDI_RECEIVE_EXPEDITED 0x00000040 + #define TDI_RECEIVE_PEEK 0x00000080 + #define TDI_RECEIVE_NO_RESPONSE_EXP 0x00000100 + #define TDI_RECEIVE_COPY_LOOKAHEAD 0x00000200 + #define TDI_RECEIVE_ENTIRE_MESSAGE 0x00000400 + #define TDI_RECEIVE_AT_DISPATCH_LEVEL 0x00000800 + #define TDI_RECEIVE_CONTROL_INFO 0x00001000 + #define TDI_RECEIVE_FORCE_INDICATION 0x00002000 + #define TDI_RECEIVE_NO_PUSH 0x00004000 +#endif -BOOL uv_get_acceptex_function(SOCKET socket, LPFN_ACCEPTEX* target); -BOOL uv_get_connectex_function(SOCKET socket, LPFN_CONNECTEX* target); +/* + * The "Auxiliary Function Driver" is the windows kernel-mode driver that does + * TCP, UDP etc. Winsock is just a layer that dispatches requests to it. + * Having these definitions allows us to bypass winsock and make an AFD kernel + * call directly, avoiding a bug in winsock's recvfrom implementation. 
+ */ -/* Ip address used to bind to any port at any interface */ -extern struct sockaddr_in uv_addr_ip4_any_; -extern struct sockaddr_in6 uv_addr_ip6_any_; +#define AFD_NO_FAST_IO 0x00000001 +#define AFD_OVERLAPPED 0x00000002 +#define AFD_IMMEDIATE 0x00000004 + +typedef struct _AFD_RECV_DATAGRAM_INFO { + LPWSABUF BufferArray; + ULONG BufferCount; + ULONG AfdFlags; + ULONG TdiFlags; + struct sockaddr* Address; + int* AddressLength; +} AFD_RECV_DATAGRAM_INFO, *PAFD_RECV_DATAGRAM_INFO; + +typedef struct _AFD_RECV_INFO { + LPWSABUF BufferArray; + ULONG BufferCount; + ULONG AfdFlags; + ULONG TdiFlags; +} AFD_RECV_INFO, *PAFD_RECV_INFO; + + +#define _AFD_CONTROL_CODE(operation, method) \ + ((FSCTL_AFD_BASE) << 12 | (operation << 2) | method) + +#define FSCTL_AFD_BASE FILE_DEVICE_NETWORK + +#define AFD_RECEIVE 5 +#define AFD_RECEIVE_DATAGRAM 6 + +#define IOCTL_AFD_RECEIVE \ + _AFD_CONTROL_CODE(AFD_RECEIVE, METHOD_NEITHER) + +#define IOCTL_AFD_RECEIVE_DATAGRAM \ + _AFD_CONTROL_CODE(AFD_RECEIVE_DATAGRAM, METHOD_NEITHER) #endif /* UV_WIN_WINSOCK_H_ */