/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8                                :vi│
╚──────────────────────────────────────────────────────────────────────────────╝
│                                                                              │
│  wepoll                                                                      │
│  https://github.com/piscisaureus/wepoll                                      │
│                                                                              │
│  Copyright 2012-2020, Bert Belder <bertbelder@gmail.com>                     │
│  All rights reserved.                                                        │
│                                                                              │
│  Redistribution and use in source and binary forms, with or without          │
│  modification, are permitted provided that the following conditions are      │
│  met:                                                                        │
│                                                                              │
│    * Redistributions of source code must retain the above copyright          │
│      notice, this list of conditions and the following disclaimer.           │
│                                                                              │
│    * Redistributions in binary form must reproduce the above copyright       │
│      notice, this list of conditions and the following disclaimer in the     │
│      documentation and/or other materials provided with the distribution.    │
│                                                                              │
│  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS         │
│  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT           │
│  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR       │
│  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT        │
│  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,       │
│  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT            │
│  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,       │
│  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY       │
│  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         │
│  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE       │
│  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.        │
│                                                                              │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/assert.h"
#include "libc/calls/internal.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/limits.h"
#include "libc/macros.h"
#include "libc/mem/mem.h"
#include "libc/nt/enum/accessmask.h"
#include "libc/nt/enum/afd.h"
#include "libc/nt/enum/filesharemode.h"
#include "libc/nt/enum/ioctl.h"
#include "libc/nt/enum/keyedevent.h"
#include "libc/nt/enum/sio.h"
#include "libc/nt/enum/status.h"
#include "libc/nt/enum/wait.h"
#include "libc/nt/errors.h"
#include "libc/nt/files.h"
#include "libc/nt/iocp.h"
#include "libc/nt/nt/file.h"
#include "libc/nt/nt/key.h"
#include "libc/nt/ntdll.h"
#include "libc/nt/process.h"
#include "libc/nt/runtime.h"
#include "libc/nt/struct/afd.h"
#include "libc/nt/struct/criticalsection.h"
#include "libc/nt/struct/objectattributes.h"
#include "libc/nt/struct/overlappedentry.h"
#include "libc/nt/struct/unicodestring.h"
#include "libc/nt/synchronization.h"
#include "libc/nt/winsock.h"
#include "libc/runtime/runtime.h"
#include "libc/sock/epoll.h"
#include "libc/sock/internal.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/epoll.h"
#include "libc/sysv/errfuns.h"

/**
 * @fileoverview epoll
 *
 * This is an alternative to poll() that's popular for event driven
 * network servers that want >10,000 sockets per machine and don't do
 * cpu bound computations that would otherwise block the event loop.
 *
 * This works on Linux and is polyfilled on Windows. It's worth noting
 * that these polyfills depend on Microsoft's internal APIs. However
 * these particular NTDLL APIs are also used by libuv, nodejs, etc. so
 * we're reasonably certain Microsoft has compatibility policies in
 * place where they've promised not to break them.
 *
 * TODO(jart): Polyfill kqueue for XNU/FreeBSD/OpenBSD.
 */

asm(".ident\t\"\\n\\n\
wepoll (BSD-2)\\n\
Copyright 2012-2020 Bert Belder\\n\
https://github.com/piscisaureus/wepoll\"");
asm(".include \"libc/disclaimer.inc\"");

#define MAX_GROUP_SIZE 32

#define REFLOCK__REF          0x00000001
#define REFLOCK__REF_MASK     0x0fffffff
#define REFLOCK__DESTROY      0x10000000
#define REFLOCK__DESTROY_MASK 0xf0000000
#define REFLOCK__POISON       0x300dead0

#define KNOWN_EVENTS                                                   \
  (EPOLLIN | EPOLLPRI | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLRDNORM | \
   EPOLLRDBAND | EPOLLWRNORM | EPOLLWRBAND | EPOLLMSG | EPOLLRDHUP)

#define RTL_CONSTANT_STRING(s) \
  { sizeof(s) - sizeof((s)[0]), sizeof(s), s }

#define RTL_CONSTANT_OBJECT_ATTRIBUTES(ObjectName, Attributes) \
  { sizeof(struct NtObjectAttributes), 0, ObjectName, Attributes, NULL, NULL }

#define RETURN_MAP_ERROR(value) \
  do {                          \
    err_map_win_error();        \
    return value;               \
  } while (0)

#define RETURN_SET_ERROR(value, error) \
  do {                                 \
    err_set_win_error(error);          \
    return value;                      \
  } while (0)

#define CONTAINOF(ptr, type, member) \
  ((type *)((uintptr_t)(ptr)-offsetof(type, member)))

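/*
 * CONTAINOF() recovers the address of an enclosing struct from a pointer
 * to one of its members, which is how this file maps intrusive queue and
 * tree nodes (and io status blocks) back to their SockState. A minimal
 * sketch of the idea; struct Example and the variable names here are
 * hypothetical and not part of this file:
 *
 *   struct Example {
 *     int x;
 *     struct QueueNode node;
 *   };
 *
 *   struct Example ex;
 *   struct QueueNode *q = &ex.node;
 *   struct Example *e = CONTAINOF(q, struct Example, node);
 *   assert(e == &ex);
 *
 * sock_state_from_queue_node() and sock_feed_event() below rely on this
 * same pattern.
 */
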
#define TREE__ROTATE(cis, trans)       \
  struct TreeNode *p = node;           \
  struct TreeNode *q = node->trans;    \
  struct TreeNode *parent = p->parent; \
  if (parent) {                        \
    if (parent->left == p)             \
      parent->left = q;                \
    else                               \
      parent->right = q;               \
  } else {                             \
    tree->root = q;                    \
  }                                    \
  q->parent = parent;                  \
  p->parent = q;                       \
  p->trans = q->cis;                   \
  if (p->trans) p->trans->parent = p;  \
  q->cis = p;

#define TREE__INSERT_OR_DESCEND(side) \
  if (parent->side) {                 \
    parent = parent->side;            \
  } else {                            \
    parent->side = node;              \
    break;                            \
  }

#define TREE__REBALANCE_AFTER_INSERT(cis, trans) \
  struct TreeNode *grandparent = parent->parent; \
  struct TreeNode *uncle = grandparent->trans;   \
  if (uncle && uncle->red) {                     \
    parent->red = uncle->red = false;            \
    grandparent->red = true;                     \
    node = grandparent;                          \
  } else {                                       \
    if (node == parent->trans) {                 \
      tree__rotate_##cis(tree, parent);          \
      node = parent;                             \
      parent = node->parent;                     \
    }                                            \
    parent->red = false;                         \
    grandparent->red = true;                     \
    tree__rotate_##trans(tree, grandparent);     \
  }

#define TREE__REBALANCE_AFTER_REMOVE(cis, trans)   \
  struct TreeNode *sibling = parent->trans;        \
  if (sibling->red) {                              \
    sibling->red = false;                          \
    parent->red = true;                            \
    tree__rotate_##cis(tree, parent);              \
    sibling = parent->trans;                       \
  }                                                \
  if ((sibling->left && sibling->left->red) ||     \
      (sibling->right && sibling->right->red)) {   \
    if (!sibling->trans || !sibling->trans->red) { \
      sibling->cis->red = false;                   \
      sibling->red = true;                         \
      tree__rotate_##trans(tree, sibling);         \
      sibling = parent->trans;                     \
    }                                              \
    sibling->red = parent->red;                    \
    parent->red = sibling->trans->red = false;     \
    tree__rotate_##cis(tree, parent);              \
    node = tree->root;                             \
    break;                                         \
  }                                                \
  sibling->red = true;

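/*
 * The macros above are each expanded twice, once per direction, so the
 * red-black tree code below only has to be written once. For example,
 * tree__rotate_left() is, roughly, the TREE__ROTATE body with cis=left
 * and trans=right substituted (a sketch, not a literal expansion):
 *
 *   static void tree__rotate_left(struct Tree *tree, struct TreeNode *node) {
 *     struct TreeNode *p = node;
 *     struct TreeNode *q = node->right;
 *     ... reparent q in p's place ...
 *     p->right = q->left;
 *     if (p->right) p->right->parent = p;
 *     q->left = p;
 *   }
 */
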
#define tree_root(t) (t)->root
#define port_state_to_handle_tree_node(p) (&(p)->handle_tree_node)
#define sock_state_from_queue_node(q) CONTAINOF(q, struct SockState, queue_node)
#define sock_state_to_queue_node(s) (&(s)->queue_node)
#define sock_state_from_tree_node(t) CONTAINOF(t, struct SockState, tree_node)
#define sock_state_to_tree_node(s) (&(s)->tree_node)
#define poll_group_from_queue_node(q) CONTAINOF(q, struct PollGroup, queue_node)
#define poll_group_get_afd_device_handle(pg) (pg)->afd_device_handle

enum PollStatus {
  kPollIdle,
  kPollPending,
  kPollCancelled,
};

struct RefLock {
  int state;
};

struct TreeNode {
  struct TreeNode *left;
  struct TreeNode *right;
  struct TreeNode *parent;
  uintptr_t key;
  bool red;
};

struct Tree {
  struct TreeNode *root;
};

struct TsTree {
  struct Tree tree;
  intptr_t lock;
};

struct TsTreeNode {
  struct TreeNode tree_node;
  struct RefLock reflock;
};

struct QueueNode {
  struct QueueNode *prev;
  struct QueueNode *next;
};

struct Queue {
  struct QueueNode head;
};

struct PortState {
  int64_t iocp_handle;
  struct Tree sock_tree;
  struct Queue sock_update_queue;
  struct Queue sock_deleted_queue;
  struct Queue poll_group_queue;
  struct TsTreeNode handle_tree_node;
  struct NtCriticalSection lock;
  size_t active_poll_count;
};

struct PollGroup {
  struct PortState *port_state;
  struct QueueNode queue_node;
  int64_t afd_device_handle;
  size_t group_size;
};

struct SockState {
  struct NtIoStatusBlock io_status_block;
  struct NtAfdPollInfo poll_info;
  struct QueueNode queue_node;
  struct TreeNode tree_node;
  struct PollGroup *poll_group;
  int64_t base_socket;
  epoll_data_t user_data;
  uint32_t user_events;
  uint32_t pending_events;
  enum PollStatus poll_status;
  bool delete_pending;
};

static const struct NtUnicodeString afd__device_name =
    RTL_CONSTANT_STRING(u"\\Device\\Afd\\Wepoll");

static const struct NtObjectAttributes afd__device_attributes =
    RTL_CONSTANT_OBJECT_ATTRIBUTES(&afd__device_name, 0);

static int64_t reflock__keyed_event;
static struct TsTree epoll__handle_tree;

static textwindows void err_map_win_error(void) {
  errno = MapDosErrorToErrno(GetLastError());
}

static textwindows void err_set_win_error(uint32_t error) {
  SetLastError(error);
  errno = MapDosErrorToErrno(error);
}

static textwindows int err_check_handle(int64_t handle) {
  uint32_t flags;
  /* GetHandleInformation() succeeds when passed INVALID_HANDLE_VALUE,
     so check for this condition explicitly. */
  if (handle == kNtInvalidHandleValue) {
    RETURN_SET_ERROR(-1, kNtErrorInvalidHandle);
  }
  if (!GetHandleInformation(handle, &flags)) {
    RETURN_MAP_ERROR(-1);
  }
  return 0;
}

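/*
 * The helpers above funnel Win32 and NTSTATUS failures into errno so the
 * public epoll functions can fail the same way they do on Linux. A
 * hypothetical sketch of the calling pattern used throughout this file
 * (SomeWin32Call is a placeholder, not a real API):
 *
 *   if (!SomeWin32Call(...)) {
 *     RETURN_MAP_ERROR(-1);  // errno derived from GetLastError()
 *   }
 *   if (status != kNtStatusSuccess) {
 *     RETURN_SET_ERROR(-1, RtlNtStatusToDosError(status));
 *   }
 */
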
static textwindows void tree_init(struct Tree *tree) {
  memset(tree, 0, sizeof *tree);
}

static textwindows void ts_tree_init(struct TsTree *ts_tree) {
  tree_init(&ts_tree->tree);
  InitializeSRWLock(&ts_tree->lock);
}

static textwindows int reflock_global_init(void) {
  NtStatus status;
  if ((status = NtCreateKeyedEvent(&reflock__keyed_event,
                                   kNtKeyedeventAllAccess, NULL, 0)) !=
      kNtStatusSuccess) {
    RETURN_SET_ERROR(-1, RtlNtStatusToDosError(status));
  }
  return 0;
}

static textwindows int epoll_global_init(void) {
  ts_tree_init(&epoll__handle_tree);
  return 0;
}

static textwindows int wepoll_init(void) {
  static bool once;
  static int result;
  if (!once) {
    if (reflock_global_init() < 0 || epoll_global_init() < 0) {
      result = -1;
    } else {
      result = 0;
    }
    once = true;
  }
  return result;
}

static textwindows int afd_create_device_handle(
|
|
int64_t iocp_handle, int64_t *afd_device_handle_out) {
|
|
NtStatus status;
|
|
int64_t afd_device_handle;
|
|
struct NtIoStatusBlock iosb;
|
|
  /* By opening \Device\Afd without specifying any extended attributes,
     we'll get a handle that lets us talk to the AFD driver, but that
     doesn't have an associated endpoint (so it's not a socket). */
status = NtCreateFile(&afd_device_handle, kNtSynchronize,
|
|
&afd__device_attributes, &iosb, NULL, 0,
|
|
kNtFileShareRead | kNtFileShareWrite, 1, 0, NULL, 0);
|
|
if (status != kNtStatusSuccess) {
|
|
RETURN_SET_ERROR(-1, RtlNtStatusToDosError(status));
|
|
}
|
|
if (!CreateIoCompletionPort(afd_device_handle, iocp_handle, 0, 0)) {
|
|
goto error;
|
|
}
|
|
if (!SetFileCompletionNotificationModes(afd_device_handle,
|
|
kNtFileSkipSetEventOnHandle)) {
|
|
goto error;
|
|
}
|
|
*afd_device_handle_out = afd_device_handle;
|
|
return 0;
|
|
error:
|
|
CloseHandle(afd_device_handle);
|
|
RETURN_MAP_ERROR(-1);
|
|
}
|
|
|
|
static textwindows int afd_poll(int64_t afd_device_handle,
|
|
struct NtAfdPollInfo *poll_info,
|
|
struct NtIoStatusBlock *io_status_block) {
|
|
NtStatus status;
|
|
/* Blocking operation is not supported.*/
|
|
assert(io_status_block);
|
|
io_status_block->Status = kNtStatusPending;
|
|
status =
|
|
NtDeviceIoControlFile(afd_device_handle, 0, NULL, io_status_block,
|
|
io_status_block, kNtIoctlAfdPoll, poll_info,
|
|
sizeof(*poll_info), poll_info, sizeof(*poll_info));
|
|
if (status == kNtStatusSuccess) {
|
|
return 0;
|
|
} else if (status == kNtStatusPending) {
|
|
RETURN_SET_ERROR(-1, kNtErrorIoPending);
|
|
} else {
|
|
RETURN_SET_ERROR(-1, RtlNtStatusToDosError(status));
|
|
}
|
|
}
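
/*
 * afd_poll() is the Windows analogue of installing an interest set: it
 * issues IOCTL_AFD_POLL on the shared \Device\Afd handle with the socket
 * and AFD event mask filled into NtAfdPollInfo. The call usually returns
 * pending, and the kernel later posts a completion packet for the given
 * io status block to the IOCP that afd_create_device_handle() associated
 * with the device handle. A rough sketch of how sock_update() below
 * drives it (illustrative, not literal code from this file):
 *
 *   poll_info.NumberOfHandles = 1;
 *   poll_info.Timeout = INT64_MAX;
 *   poll_info.Handles[0].Handle = base_socket;
 *   poll_info.Handles[0].Events = sock__epoll_events_to_afd_events(events);
 *   if (afd_poll(afd_device_handle, &poll_info, &iosb) < 0 &&
 *       GetLastError() != kNtErrorIoPending) {
 *     ... hard failure, propagate to caller ...
 *   }
 */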
|
|
|
|
static textwindows int afd_cancel_poll(
|
|
int64_t afd_device_handle, struct NtIoStatusBlock *io_status_block) {
|
|
NtStatus cancel_status;
|
|
struct NtIoStatusBlock cancel_iosb;
|
|
/* If the poll operation has already completed or has been cancelled
|
|
earlier, there's nothing left for us to do. */
|
|
if (io_status_block->Status != kNtStatusPending) return 0;
|
|
cancel_status =
|
|
NtCancelIoFileEx(afd_device_handle, io_status_block, &cancel_iosb);
|
|
/* NtCancelIoFileEx() may return STATUS_NOT_FOUND if the operation completed
|
|
just before calling NtCancelIoFileEx(). This is not an error. */
|
|
if (cancel_status == kNtStatusSuccess || cancel_status == kNtStatusNotFound) {
|
|
return 0;
|
|
} else {
|
|
RETURN_SET_ERROR(-1, RtlNtStatusToDosError(cancel_status));
|
|
}
|
|
}
|
|
|
|
static textwindows void queue_node_init(struct QueueNode *node) {
|
|
node->prev = node;
|
|
node->next = node;
|
|
}
|
|
|
|
static textwindows void queue_init(struct Queue *queue) {
|
|
queue_node_init(&queue->head);
|
|
}
|
|
|
|
static textwindows void queue__detach_node(struct QueueNode *node) {
|
|
node->prev->next = node->next;
|
|
node->next->prev = node->prev;
|
|
}
|
|
|
|
static textwindows bool queue_is_enqueued(const struct QueueNode *node) {
|
|
return node->prev != node;
|
|
}
|
|
|
|
static textwindows bool queue_is_empty(const struct Queue *queue) {
|
|
return !queue_is_enqueued(&queue->head);
|
|
}
|
|
|
|
static textwindows struct QueueNode *queue_first(const struct Queue *queue) {
|
|
return !queue_is_empty(queue) ? queue->head.next : NULL;
|
|
}
|
|
|
|
static textwindows struct QueueNode *queue_last(const struct Queue *queue) {
|
|
return !queue_is_empty(queue) ? queue->head.prev : NULL;
|
|
}
|
|
|
|
static textwindows void queue_prepend(struct Queue *queue,
|
|
struct QueueNode *node) {
|
|
node->next = queue->head.next;
|
|
node->prev = &queue->head;
|
|
node->next->prev = node;
|
|
queue->head.next = node;
|
|
}
|
|
|
|
static textwindows void queue_append(struct Queue *queue,
|
|
struct QueueNode *node) {
|
|
node->next = &queue->head;
|
|
node->prev = queue->head.prev;
|
|
node->prev->next = node;
|
|
queue->head.prev = node;
|
|
}
|
|
|
|
static textwindows void queue_move_to_start(struct Queue *queue,
|
|
struct QueueNode *node) {
|
|
queue__detach_node(node);
|
|
queue_prepend(queue, node);
|
|
}
|
|
|
|
static textwindows void queue_move_to_end(struct Queue *queue,
|
|
struct QueueNode *node) {
|
|
queue__detach_node(node);
|
|
queue_append(queue, node);
|
|
}
|
|
|
|
static textwindows void queue_remove(struct QueueNode *node) {
|
|
queue__detach_node(node);
|
|
queue_node_init(node);
|
|
}
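
/*
 * The queue is a circular, intrusive, doubly linked list: an empty queue's
 * head points at itself, and a detached node points at itself, which is
 * what makes queue_is_enqueued() a single comparison. A sketch of the
 * intended usage (variable names here are illustrative only):
 *
 *   struct Queue q;
 *   queue_init(&q);
 *   queue_append(&q, sock_state_to_queue_node(ss));
 *   while (!queue_is_empty(&q)) {
 *     struct SockState *s = sock_state_from_queue_node(queue_first(&q));
 *     queue_remove(sock_state_to_queue_node(s));
 *     ... process s ...
 *   }
 */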
|
|
|
|
static textwindows struct PortState *port__alloc(void) {
|
|
struct PortState *port_state = malloc(sizeof *port_state);
|
|
if (!port_state) RETURN_SET_ERROR(NULL, kNtErrorNotEnoughMemory);
|
|
return port_state;
|
|
}
|
|
|
|
static textwindows int64_t port__create_iocp(void) {
|
|
int64_t iocp_handle = CreateIoCompletionPort(kNtInvalidHandleValue, 0, 0, 0);
|
|
if (!iocp_handle) RETURN_MAP_ERROR(0);
|
|
return iocp_handle;
|
|
}
|
|
|
|
static textwindows int port__close_iocp(struct PortState *port_state) {
|
|
int64_t iocp_handle = port_state->iocp_handle;
|
|
port_state->iocp_handle = 0;
|
|
if (!CloseHandle(iocp_handle)) RETURN_MAP_ERROR(-1);
|
|
return 0;
|
|
}
|
|
|
|
static textwindows void tree_node_init(struct TreeNode *node) {
|
|
memset(node, 0, sizeof *node);
|
|
}
|
|
|
|
static textwindows void reflock_init(struct RefLock *reflock) {
|
|
reflock->state = 0;
|
|
}
|
|
|
|
static textwindows void ts_tree_node_init(struct TsTreeNode *node) {
|
|
tree_node_init(&node->tree_node);
|
|
reflock_init(&node->reflock);
|
|
}
|
|
|
|
static textwindows void tree__rotate_left(struct Tree *tree,
|
|
struct TreeNode *node) {
|
|
TREE__ROTATE(left, right)
|
|
}
|
|
|
|
static textwindows void tree__rotate_right(struct Tree *tree,
|
|
struct TreeNode *node) {
|
|
TREE__ROTATE(right, left)
|
|
}
|
|
|
|
static textwindows int tree_add(struct Tree *tree, struct TreeNode *node,
|
|
uintptr_t key) {
|
|
struct TreeNode *parent;
|
|
parent = tree->root;
|
|
if (parent) {
|
|
for (;;) {
|
|
if (key < parent->key) {
|
|
TREE__INSERT_OR_DESCEND(left)
|
|
} else if (key > parent->key) {
|
|
TREE__INSERT_OR_DESCEND(right)
|
|
} else {
|
|
return -1;
|
|
}
|
|
}
|
|
} else {
|
|
tree->root = node;
|
|
}
|
|
node->key = key;
|
|
node->left = node->right = NULL;
|
|
node->parent = parent;
|
|
node->red = true;
|
|
for (; parent && parent->red; parent = node->parent) {
|
|
if (parent == parent->parent->left) {
|
|
TREE__REBALANCE_AFTER_INSERT(left, right)
|
|
} else {
|
|
TREE__REBALANCE_AFTER_INSERT(right, left)
|
|
}
|
|
}
|
|
tree->root->red = false;
|
|
return 0;
|
|
}
|
|
|
|
static textwindows int ts_tree_add(struct TsTree *ts_tree,
|
|
struct TsTreeNode *node, uintptr_t key) {
|
|
int r;
|
|
AcquireSRWLockExclusive(&ts_tree->lock);
|
|
r = tree_add(&ts_tree->tree, &node->tree_node, key);
|
|
ReleaseSRWLockExclusive(&ts_tree->lock);
|
|
return r;
|
|
}
|
|
|
|
static textwindows void port__free(struct PortState *port) {
|
|
assert(port != NULL);
|
|
free(port);
|
|
}
|
|
|
|
static textwindows struct PortState *port_new(int64_t *iocp_handle_out) {
|
|
struct PortState *port_state;
|
|
int64_t iocp_handle;
|
|
port_state = port__alloc();
|
|
if (!port_state) goto err1;
|
|
iocp_handle = port__create_iocp();
|
|
if (!iocp_handle) goto err2;
|
|
memset(port_state, 0, sizeof *port_state);
|
|
port_state->iocp_handle = iocp_handle;
|
|
tree_init(&port_state->sock_tree);
|
|
queue_init(&port_state->sock_update_queue);
|
|
queue_init(&port_state->sock_deleted_queue);
|
|
queue_init(&port_state->poll_group_queue);
|
|
ts_tree_node_init(&port_state->handle_tree_node);
|
|
InitializeCriticalSection(&port_state->lock);
|
|
*iocp_handle_out = iocp_handle;
|
|
return port_state;
|
|
err2:
|
|
port__free(port_state);
|
|
err1:
|
|
return NULL;
|
|
}
|
|
|
|
static textwindows int sock__cancel_poll(struct SockState *sock_state) {
|
|
assert(sock_state->poll_status == kPollPending);
|
|
if (afd_cancel_poll(poll_group_get_afd_device_handle(sock_state->poll_group),
|
|
&sock_state->io_status_block) < 0) {
|
|
return -1;
|
|
}
|
|
sock_state->poll_status = kPollCancelled;
|
|
sock_state->pending_events = 0;
|
|
return 0;
|
|
}
|
|
|
|
static textwindows void port_cancel_socket_update(
|
|
struct PortState *port_state, struct SockState *sock_state) {
|
|
if (!queue_is_enqueued(sock_state_to_queue_node(sock_state))) return;
|
|
queue_remove(sock_state_to_queue_node(sock_state));
|
|
}
|
|
|
|
static textwindows struct TreeNode *tree_find(const struct Tree *tree,
|
|
uintptr_t key) {
|
|
struct TreeNode *node = tree->root;
|
|
while (node) {
|
|
if (key < node->key) {
|
|
node = node->left;
|
|
} else if (key > node->key) {
|
|
node = node->right;
|
|
} else {
|
|
return node;
|
|
}
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static textwindows struct TsTreeNode *ts_tree__find_node(struct TsTree *ts_tree,
|
|
uintptr_t key) {
|
|
struct TreeNode *tree_node = tree_find(&ts_tree->tree, key);
|
|
if (!tree_node) return NULL;
|
|
return CONTAINOF(tree_node, struct TsTreeNode, tree_node);
|
|
}
|
|
|
|
static textwindows void tree_del(struct Tree *tree, struct TreeNode *node) {
|
|
bool red;
|
|
struct TreeNode *parent, *left, *right, *next;
|
|
parent = node->parent;
|
|
left = node->left;
|
|
right = node->right;
|
|
if (!left) {
|
|
next = right;
|
|
} else if (!right) {
|
|
next = left;
|
|
} else {
|
|
next = right;
|
|
while (next->left) next = next->left;
|
|
}
|
|
if (parent) {
|
|
if (parent->left == node) {
|
|
parent->left = next;
|
|
} else {
|
|
parent->right = next;
|
|
}
|
|
} else {
|
|
tree->root = next;
|
|
}
|
|
if (left && right) {
|
|
red = next->red;
|
|
next->red = node->red;
|
|
next->left = left;
|
|
left->parent = next;
|
|
if (next != right) {
|
|
parent = next->parent;
|
|
next->parent = node->parent;
|
|
node = next->right;
|
|
parent->left = node;
|
|
next->right = right;
|
|
right->parent = next;
|
|
} else {
|
|
next->parent = parent;
|
|
parent = next;
|
|
node = next->right;
|
|
}
|
|
} else {
|
|
red = node->red;
|
|
node = next;
|
|
}
|
|
if (node) node->parent = parent;
|
|
if (red) return;
|
|
if (node && node->red) {
|
|
node->red = false;
|
|
return;
|
|
}
|
|
do {
|
|
if (node == tree->root) break;
|
|
if (node == parent->left) {
|
|
TREE__REBALANCE_AFTER_REMOVE(left, right)
|
|
} else {
|
|
TREE__REBALANCE_AFTER_REMOVE(right, left)
|
|
}
|
|
node = parent;
|
|
parent = parent->parent;
|
|
} while (!node->red);
|
|
if (node) node->red = false;
|
|
}
|
|
|
|
static textwindows void reflock__signal_event(void *address) {
|
|
NtStatus status =
|
|
NtReleaseKeyedEvent(reflock__keyed_event, address, false, NULL);
|
|
if (status != kNtStatusSuccess) abort();
|
|
}
|
|
|
|
static textwindows void reflock__await_event(void *address) {
|
|
NtStatus status =
|
|
NtWaitForKeyedEvent(reflock__keyed_event, address, false, NULL);
|
|
if (status != kNtStatusSuccess) abort();
|
|
}
|
|
|
|
static textwindows void reflock_ref(struct RefLock *reflock) {
|
|
long state = InterlockedAdd(&reflock->state, REFLOCK__REF);
|
|
  /* Verify that the counter didn't overflow and the lock isn't destroyed. */
assert((state & REFLOCK__DESTROY_MASK) == 0);
|
|
}
|
|
|
|
static textwindows void reflock_unref(struct RefLock *reflock) {
|
|
long state = InterlockedAdd(&reflock->state, -REFLOCK__REF);
|
|
/* Verify that the lock was referenced and not already destroyed.*/
|
|
assert((state & REFLOCK__DESTROY_MASK & ~REFLOCK__DESTROY) == 0);
|
|
if (state == REFLOCK__DESTROY) reflock__signal_event(reflock);
|
|
}
|
|
|
|
static textwindows struct TsTreeNode *ts_tree_del_and_ref(
|
|
struct TsTree *ts_tree, uintptr_t key) {
|
|
struct TsTreeNode *ts_tree_node;
|
|
AcquireSRWLockExclusive(&ts_tree->lock);
|
|
ts_tree_node = ts_tree__find_node(ts_tree, key);
|
|
if (ts_tree_node != NULL) {
|
|
tree_del(&ts_tree->tree, &ts_tree_node->tree_node);
|
|
reflock_ref(&ts_tree_node->reflock);
|
|
}
|
|
ReleaseSRWLockExclusive(&ts_tree->lock);
|
|
return ts_tree_node;
|
|
}
|
|
|
|
static textwindows struct TsTreeNode *ts_tree_find_and_ref(
|
|
struct TsTree *ts_tree, uintptr_t key) {
|
|
struct TsTreeNode *ts_tree_node;
|
|
AcquireSRWLockShared(&ts_tree->lock);
|
|
ts_tree_node = ts_tree__find_node(ts_tree, key);
|
|
if (ts_tree_node != NULL) reflock_ref(&ts_tree_node->reflock);
|
|
ReleaseSRWLockShared(&ts_tree->lock);
|
|
return ts_tree_node;
|
|
}
|
|
|
|
static textwindows void ts_tree_node_unref(struct TsTreeNode *node) {
|
|
reflock_unref(&node->reflock);
|
|
}
|
|
|
|
static textwindows void reflock_unref_and_destroy(struct RefLock *reflock) {
|
|
long state, ref_count;
|
|
state = InterlockedAdd(&reflock->state, REFLOCK__DESTROY - REFLOCK__REF);
|
|
ref_count = state & REFLOCK__REF_MASK;
|
|
/* Verify that the lock was referenced and not already destroyed. */
|
|
assert((state & REFLOCK__DESTROY_MASK) == REFLOCK__DESTROY);
|
|
if (ref_count != 0) reflock__await_event(reflock);
|
|
state = InterlockedExchange(&reflock->state, REFLOCK__POISON);
|
|
assert(state == REFLOCK__DESTROY);
|
|
}
|
|
|
|
static textwindows void ts_tree_node_unref_and_destroy(
|
|
struct TsTreeNode *node) {
|
|
reflock_unref_and_destroy(&node->reflock);
|
|
}
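
/*
 * RefLock packs a reference count and a destroy flag into one atomic int.
 * ts_tree_find_and_ref() takes a reference while holding the tree lock,
 * and the closing thread uses reflock_unref_and_destroy() to wait on an
 * NT keyed event until every outstanding reference has been dropped, so
 * the PortState can't be freed while another thread is still using it.
 * The lifecycle, roughly:
 *
 *   node = ts_tree_find_and_ref(&epoll__handle_tree, handle);  // +1 ref
 *   ... use the port ...
 *   ts_tree_node_unref(node);                                  // -1 ref
 *
 *   node = ts_tree_del_and_ref(&epoll__handle_tree, handle);   // on close
 *   ts_tree_node_unref_and_destroy(node);  // blocks until refs reach zero
 */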
|
|
|
|
static textwindows void port_unregister_socket(struct PortState *port_state,
|
|
struct SockState *sock_state) {
|
|
tree_del(&port_state->sock_tree, sock_state_to_tree_node(sock_state));
|
|
}
|
|
|
|
static textwindows void port_remove_deleted_socket(
|
|
struct PortState *port_state, struct SockState *sock_state) {
|
|
if (!queue_is_enqueued(sock_state_to_queue_node(sock_state))) return;
|
|
queue_remove(sock_state_to_queue_node(sock_state));
|
|
}
|
|
|
|
static textwindows struct Queue *port_get_poll_group_queue(
|
|
struct PortState *port_state) {
|
|
return &port_state->poll_group_queue;
|
|
}
|
|
|
|
static textwindows void poll_group_release(struct PollGroup *poll_group) {
|
|
struct PortState *port_state = poll_group->port_state;
|
|
struct Queue *poll_group_queue = port_get_poll_group_queue(port_state);
|
|
poll_group->group_size--;
|
|
assert(poll_group->group_size < MAX_GROUP_SIZE);
|
|
queue_move_to_end(poll_group_queue, &poll_group->queue_node);
|
|
/* Poll groups are currently only freed when the epoll port is closed. */
|
|
}
|
|
|
|
static textwindows void sock__free(struct SockState *sock_state) {
|
|
assert(sock_state != NULL);
|
|
free(sock_state);
|
|
}
|
|
|
|
static textwindows void port_add_deleted_socket(struct PortState *port_state,
|
|
struct SockState *sock_state) {
|
|
if (queue_is_enqueued(sock_state_to_queue_node(sock_state))) return;
|
|
queue_append(&port_state->sock_deleted_queue,
|
|
sock_state_to_queue_node(sock_state));
|
|
}
|
|
|
|
static textwindows int sock__delete(struct PortState *port_state,
|
|
struct SockState *sock_state, bool force) {
|
|
if (!sock_state->delete_pending) {
|
|
if (sock_state->poll_status == kPollPending) {
|
|
sock__cancel_poll(sock_state);
|
|
}
|
|
port_cancel_socket_update(port_state, sock_state);
|
|
port_unregister_socket(port_state, sock_state);
|
|
sock_state->delete_pending = true;
|
|
}
|
|
/* If the poll request still needs to complete, the sock_state object
|
|
can't be free'd yet. `sock_feed_event()` or `port_close()` will
|
|
take care of this later. */
|
|
if (force || sock_state->poll_status == kPollIdle) {
|
|
port_remove_deleted_socket(port_state, sock_state);
|
|
poll_group_release(sock_state->poll_group);
|
|
sock__free(sock_state);
|
|
} else {
|
|
/* Free the socket later.*/
|
|
port_add_deleted_socket(port_state, sock_state);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static textwindows void sock_delete(struct PortState *port_state,
|
|
struct SockState *sock_state) {
|
|
sock__delete(port_state, sock_state, false);
|
|
}
|
|
|
|
static textwindows void sock_force_delete(struct PortState *port_state,
|
|
struct SockState *sock_state) {
|
|
sock__delete(port_state, sock_state, true);
|
|
}
|
|
|
|
static textwindows void poll_group_delete(struct PollGroup *poll_group) {
|
|
assert(poll_group->group_size == 0);
|
|
CloseHandle(poll_group->afd_device_handle);
|
|
queue_remove(&poll_group->queue_node);
|
|
free(poll_group);
|
|
}
|
|
|
|
static textwindows int port_delete(struct PortState *port_state) {
|
|
struct TreeNode *tree_node;
|
|
struct QueueNode *queue_node;
|
|
struct SockState *sock_state;
|
|
struct PollGroup *poll_group;
|
|
/* At this point the IOCP port should have been closed.*/
|
|
assert(!port_state->iocp_handle);
|
|
while ((tree_node = tree_root(&port_state->sock_tree)) != NULL) {
|
|
sock_state = sock_state_from_tree_node(tree_node);
|
|
sock_force_delete(port_state, sock_state);
|
|
}
|
|
while ((queue_node = queue_first(&port_state->sock_deleted_queue)) != NULL) {
|
|
sock_state = sock_state_from_queue_node(queue_node);
|
|
sock_force_delete(port_state, sock_state);
|
|
}
|
|
while ((queue_node = queue_first(&port_state->poll_group_queue)) != NULL) {
|
|
poll_group = poll_group_from_queue_node(queue_node);
|
|
poll_group_delete(poll_group);
|
|
}
|
|
assert(queue_is_empty(&port_state->sock_update_queue));
|
|
DeleteCriticalSection(&port_state->lock);
|
|
port__free(port_state);
|
|
return 0;
|
|
}
|
|
|
|
static textwindows int64_t port_get_iocp_handle(struct PortState *port_state) {
|
|
assert(port_state->iocp_handle);
|
|
return port_state->iocp_handle;
|
|
}
|
|
|
|
static textwindows struct PollGroup *poll_group__new(
|
|
struct PortState *port_state) {
|
|
int64_t iocp_handle = port_get_iocp_handle(port_state);
|
|
struct Queue *poll_group_queue = port_get_poll_group_queue(port_state);
|
|
struct PollGroup *poll_group = malloc(sizeof *poll_group);
|
|
if (!poll_group) RETURN_SET_ERROR(NULL, kNtErrorNotEnoughMemory);
|
|
memset(poll_group, 0, sizeof *poll_group);
|
|
queue_node_init(&poll_group->queue_node);
|
|
poll_group->port_state = port_state;
|
|
if (afd_create_device_handle(iocp_handle, &poll_group->afd_device_handle) <
|
|
0) {
|
|
free(poll_group);
|
|
return NULL;
|
|
}
|
|
queue_append(poll_group_queue, &poll_group->queue_node);
|
|
return poll_group;
|
|
}
|
|
|
|
static textwindows struct PollGroup *poll_group_acquire(
|
|
struct PortState *port_state) {
|
|
struct Queue *poll_group_queue = port_get_poll_group_queue(port_state);
|
|
struct PollGroup *poll_group = !queue_is_empty(poll_group_queue)
|
|
? CONTAINOF(queue_last(poll_group_queue),
|
|
struct PollGroup, queue_node)
|
|
: NULL;
|
|
if (!poll_group || poll_group->group_size >= MAX_GROUP_SIZE)
|
|
poll_group = poll_group__new(port_state);
|
|
if (!poll_group) return NULL;
|
|
if (++poll_group->group_size == MAX_GROUP_SIZE)
|
|
queue_move_to_start(poll_group_queue, &poll_group->queue_node);
|
|
return poll_group;
|
|
}
|
|
|
|
static textwindows int port_close(struct PortState *port_state) {
|
|
int result;
|
|
EnterCriticalSection(&port_state->lock);
|
|
result = port__close_iocp(port_state);
|
|
LeaveCriticalSection(&port_state->lock);
|
|
return result;
|
|
}
|
|
|
|
static textwindows uint32_t sock__epoll_events_to_afd_events(uint32_t e) {
|
|
/* Always monitor for kNtAfdPollLocalClose, which is triggered when
|
|
the socket is closed with closesocket() or CloseHandle(). */
|
|
uint32_t a = kNtAfdPollLocalClose;
|
|
if (e & (EPOLLIN | EPOLLRDNORM)) a |= kNtAfdPollReceive | kNtAfdPollAccept;
|
|
if (e & (EPOLLPRI | EPOLLRDBAND)) a |= kNtAfdPollReceiveExpedited;
|
|
if (e & (EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND)) a |= kNtAfdPollSend;
|
|
if (e & (EPOLLIN | EPOLLRDNORM | EPOLLRDHUP)) a |= kNtAfdPollDisconnect;
|
|
if (e & EPOLLHUP) a |= kNtAfdPollAbort;
|
|
if (e & EPOLLERR) a |= kNtAfdPollConnectFail;
|
|
return a;
|
|
}
|
|
|
|
static textwindows uint32_t sock__afd_events_to_epoll_events(uint32_t a) {
|
|
uint32_t e = 0;
|
|
if (a & (kNtAfdPollReceive | kNtAfdPollAccept)) e |= EPOLLIN | EPOLLRDNORM;
|
|
if (a & kNtAfdPollReceiveExpedited) e |= EPOLLPRI | EPOLLRDBAND;
|
|
if (a & kNtAfdPollSend) e |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
|
|
if (a & kNtAfdPollDisconnect) e |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
|
|
if (a & kNtAfdPollAbort) e |= EPOLLHUP;
|
|
if (a & kNtAfdPollConnectFail) {
|
|
/* Linux reports all these events after connect() has failed. */
|
|
e |= EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLRDNORM | EPOLLWRNORM | EPOLLRDHUP;
|
|
}
|
|
return e;
|
|
}
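
/*
 * The two translations above are intentionally lossy in both directions:
 * several epoll bits map onto one AFD bit and vice versa. For example,
 * assuming a socket registered with EPOLLIN | EPOLLOUT:
 *
 *   sock__epoll_events_to_afd_events(EPOLLIN | EPOLLOUT)
 *     == kNtAfdPollLocalClose | kNtAfdPollReceive | kNtAfdPollAccept |
 *        kNtAfdPollSend | kNtAfdPollDisconnect
 *
 *   sock__afd_events_to_epoll_events(kNtAfdPollSend)
 *     == EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND
 *
 * which is why sock_feed_event() below masks the result with user_events
 * before reporting anything to the caller.
 */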
|
|
|
|
static textwindows int sock_update(struct PortState *port_state,
|
|
struct SockState *sock_state) {
|
|
assert(!sock_state->delete_pending);
|
|
if ((sock_state->poll_status == kPollPending) &&
|
|
!(sock_state->user_events & KNOWN_EVENTS & ~sock_state->pending_events)) {
|
|
/* All the events the user is interested in are already being
|
|
monitored by the pending poll operation. It might spuriously
|
|
complete because of an event that we're no longer interested in;
|
|
when that happens we'll submit a new poll operation with the
|
|
updated event mask. */
|
|
} else if (sock_state->poll_status == kPollPending) {
|
|
    /* A poll operation is already pending, but it's not monitoring for
       all the events that the user is interested in. Therefore, cancel
       the pending poll operation; when we receive its completion
       package, a new poll operation will be submitted with the correct
       event mask. */
if (sock__cancel_poll(sock_state) < 0) return -1;
|
|
} else if (sock_state->poll_status == kPollCancelled) {
|
|
    /* The poll operation has already been cancelled, we're still waiting
       for it to return. For now, there's nothing that needs to be done. */
} else if (sock_state->poll_status == kPollIdle) {
|
|
/* No poll operation is pending; start one. */
|
|
sock_state->poll_info.Exclusive = false;
|
|
sock_state->poll_info.NumberOfHandles = 1;
|
|
sock_state->poll_info.Timeout = INT64_MAX;
|
|
sock_state->poll_info.Handles[0].Handle = (int64_t)sock_state->base_socket;
|
|
sock_state->poll_info.Handles[0].Status = 0;
|
|
sock_state->poll_info.Handles[0].Events =
|
|
sock__epoll_events_to_afd_events(sock_state->user_events);
|
|
if (afd_poll(poll_group_get_afd_device_handle(sock_state->poll_group),
|
|
&sock_state->poll_info, &sock_state->io_status_block) < 0) {
|
|
switch (GetLastError()) {
|
|
case kNtErrorIoPending:
|
|
/* Overlapped poll operation in progress; this is expected. */
|
|
break;
|
|
case kNtErrorInvalidHandle:
|
|
/* Socket closed; it'll be dropped from the epoll set. */
|
|
return sock__delete(port_state, sock_state, false);
|
|
default:
|
|
/* Other errors are propagated to the caller. */
|
|
RETURN_MAP_ERROR(-1);
|
|
}
|
|
}
|
|
/* The poll request was successfully submitted.*/
|
|
sock_state->poll_status = kPollPending;
|
|
sock_state->pending_events = sock_state->user_events;
|
|
} else {
|
|
unreachable;
|
|
}
|
|
port_cancel_socket_update(port_state, sock_state);
|
|
return 0;
|
|
}
|
|
|
|
static textwindows int port__update_events(struct PortState *port_state) {
|
|
struct QueueNode *queue_node;
|
|
struct SockState *sock_state;
|
|
struct Queue *sock_update_queue = &port_state->sock_update_queue;
|
|
/* Walk queue, submitting new poll requests for sockets needing it */
|
|
while (!queue_is_empty(sock_update_queue)) {
|
|
queue_node = queue_first(sock_update_queue);
|
|
sock_state = sock_state_from_queue_node(queue_node);
|
|
if (sock_update(port_state, sock_state) < 0) return -1;
|
|
/* sock_update() removes the socket from the update queue.*/
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static textwindows void port__update_events_if_polling(
|
|
struct PortState *port_state) {
|
|
if (port_state->active_poll_count > 0) port__update_events(port_state);
|
|
}
|
|
|
|
static textwindows void port_request_socket_update(
|
|
struct PortState *port_state, struct SockState *sock_state) {
|
|
if (queue_is_enqueued(sock_state_to_queue_node(sock_state))) return;
|
|
queue_append(&port_state->sock_update_queue,
|
|
sock_state_to_queue_node(sock_state));
|
|
}
|
|
|
|
static textwindows int sock_feed_event(struct PortState *port_state,
|
|
struct NtIoStatusBlock *io_status_block,
|
|
struct epoll_event *ev) {
|
|
uint32_t epoll_events;
|
|
struct SockState *sock_state;
|
|
struct NtAfdPollInfo *poll_info;
|
|
epoll_events = 0;
|
|
sock_state = CONTAINOF(io_status_block, struct SockState, io_status_block);
|
|
poll_info = &sock_state->poll_info;
|
|
sock_state->poll_status = kPollIdle;
|
|
sock_state->pending_events = 0;
|
|
if (sock_state->delete_pending) {
|
|
/* Socket has been deleted earlier and can now be freed.*/
|
|
return sock__delete(port_state, sock_state, false);
|
|
} else if (io_status_block->Status == kNtStatusCancelled) {
|
|
/* The poll request was cancelled by CancelIoEx.*/
|
|
} else if (!NtSuccess(io_status_block->Status)) {
|
|
/* The overlapped request itself failed in an unexpected way.*/
|
|
epoll_events = EPOLLERR;
|
|
} else if (poll_info->NumberOfHandles < 1) {
|
|
/* This poll operation succeeded but didn't report any socket events. */
|
|
} else if (poll_info->Handles[0].Events & kNtAfdPollLocalClose) {
|
|
/* The poll operation reported that the socket was closed.*/
|
|
return sock__delete(port_state, sock_state, false);
|
|
} else {
|
|
/* Events related to our socket were reported.*/
|
|
epoll_events =
|
|
sock__afd_events_to_epoll_events(poll_info->Handles[0].Events);
|
|
}
|
|
/* Requeue the socket so a new poll request will be submitted.*/
|
|
port_request_socket_update(port_state, sock_state);
|
|
/* Filter out events that the user didn't ask for. */
|
|
epoll_events &= sock_state->user_events;
|
|
/* Return if there are no epoll events to report.*/
|
|
if (epoll_events == 0) return 0;
|
|
  /* If the socket has the EPOLLONESHOT flag set, unmonitor all
     events, even EPOLLERR and EPOLLHUP. But always keep looking for
     closed sockets. */
if (sock_state->user_events & EPOLLONESHOT) {
|
|
sock_state->user_events = 0;
|
|
}
|
|
ev->data = sock_state->user_data;
|
|
ev->events = epoll_events;
|
|
return 1;
|
|
}
|
|
|
|
static textwindows int port__feed_events(struct PortState *port_state,
|
|
struct epoll_event *epoll_events,
|
|
struct NtOverlappedEntry *iocp_events,
|
|
uint32_t iocp_event_count) {
|
|
uint32_t i;
|
|
int epoll_event_count;
|
|
struct epoll_event *ev;
|
|
struct NtIoStatusBlock *io_status_block;
|
|
epoll_event_count = 0;
|
|
for (i = 0; i < iocp_event_count; i++) {
|
|
io_status_block = (struct NtIoStatusBlock *)iocp_events[i].lpOverlapped;
|
|
ev = &epoll_events[epoll_event_count];
|
|
epoll_event_count += sock_feed_event(port_state, io_status_block, ev);
|
|
}
|
|
return epoll_event_count;
|
|
}
|
|
|
|
static textwindows int port__poll(struct PortState *port_state,
|
|
struct epoll_event *epoll_events,
|
|
struct NtOverlappedEntry *iocp_events,
|
|
uint32_t maxevents, uint32_t timeout) {
|
|
bool32 r;
|
|
uint32_t completion_count;
|
|
if (port__update_events(port_state) < 0) return -1;
|
|
port_state->active_poll_count++;
|
|
LeaveCriticalSection(&port_state->lock);
|
|
r = GetQueuedCompletionStatusEx(port_state->iocp_handle, iocp_events,
|
|
maxevents, &completion_count, timeout, false);
|
|
EnterCriticalSection(&port_state->lock);
|
|
port_state->active_poll_count--;
|
|
if (!r) RETURN_MAP_ERROR(-1);
|
|
return port__feed_events(port_state, epoll_events, iocp_events,
|
|
completion_count);
|
|
}
|
|
|
|
static textwindows int port_wait(struct PortState *port_state,
|
|
struct epoll_event *events, int maxevents,
|
|
int timeout) {
|
|
int result;
|
|
uint64_t now, due = 0;
|
|
uint32_t gqcs_timeout;
|
|
struct NtOverlappedEntry *iocp_events;
|
|
struct NtOverlappedEntry stack_iocp_events[64];
|
|
/* Check whether `maxevents` is in range.*/
|
|
if (maxevents <= 0) RETURN_SET_ERROR(-1, kNtErrorInvalidParameter);
|
|
/* Decide whether the IOCP completion list can live on the stack, or
|
|
allocate memory for it on the heap. */
|
|
if ((size_t)maxevents <= ARRAYLEN(stack_iocp_events)) {
|
|
iocp_events = stack_iocp_events;
|
|
} else if ((iocp_events = malloc((size_t)maxevents * sizeof(*iocp_events))) ==
|
|
NULL) {
|
|
iocp_events = stack_iocp_events;
|
|
maxevents = ARRAYLEN(stack_iocp_events);
|
|
}
|
|
/* Compute the timeout for GetQueuedCompletionStatus, and the wait end
|
|
time, if the user specified a timeout other than zero or infinite. */
|
|
if (timeout > 0) {
|
|
due = GetTickCount64() + (uint64_t)timeout;
|
|
gqcs_timeout = (uint32_t)timeout;
|
|
} else if (timeout == 0) {
|
|
gqcs_timeout = 0;
|
|
} else {
|
|
gqcs_timeout = -1;
|
|
}
|
|
EnterCriticalSection(&port_state->lock);
|
|
/* Dequeue completion packets until either at least one interesting
|
|
event has been discovered, or the timeout is reached. */
|
|
for (;;) {
|
|
result = port__poll(port_state, events, iocp_events, (uint32_t)maxevents,
|
|
gqcs_timeout);
|
|
    if (result < 0 || result > 0) break; /* Result, error, or time-out. */
    if (timeout < 0) continue; /* When timeout is negative, never time out. */
/* Update time. */
|
|
now = GetTickCount64();
|
|
/* Do not allow the due time to be in the past. */
|
|
if (now >= due) {
|
|
SetLastError(kNtWaitTimeout);
|
|
break;
|
|
}
|
|
/* Recompute time-out argument for GetQueuedCompletionStatus. */
|
|
gqcs_timeout = (uint32_t)(due - now);
|
|
}
|
|
port__update_events_if_polling(port_state);
|
|
LeaveCriticalSection(&port_state->lock);
|
|
if (iocp_events != stack_iocp_events) {
|
|
free(iocp_events);
|
|
}
|
|
if (result >= 0) {
|
|
return result;
|
|
} else if (GetLastError() == kNtWaitTimeout) {
|
|
return 0;
|
|
} else {
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
static textwindows int64_t ws__ioctl_get_bsp_socket(int64_t socket,
|
|
uint32_t ioctl) {
|
|
uint32_t bytes;
|
|
int64_t bsp_socket;
|
|
if (WSAIoctl(socket, ioctl, NULL, 0, &bsp_socket, sizeof(bsp_socket), &bytes,
|
|
NULL, NULL) != -1) {
|
|
return bsp_socket;
|
|
} else {
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
static textwindows int64_t ws_get_base_socket(int64_t socket) {
|
|
uint32_t error;
|
|
int64_t base_socket;
|
|
for (;;) {
|
|
base_socket = ws__ioctl_get_bsp_socket(socket, kNtSioBaseHandle);
|
|
if (base_socket != -1) {
|
|
return base_socket;
|
|
}
|
|
error = GetLastError();
|
|
if (error == WSAENOTSOCK) {
|
|
RETURN_SET_ERROR(-1, error);
|
|
}
|
|
/*
|
|
* Even though Microsoft documentation clearly states that Layered
|
|
* Spyware Providers must never ever intercept the SIO_BASE_HANDLE
|
|
* ioctl, Komodia LSPs (that Lenovo got sued for preinstalling) do
|
|
* so anyway in order to redirect decrypted https requests through
|
|
* some foreign proxy and inject ads which breaks high-performance
|
|
* network event io. However it doesn't handle SIO_BSP_HANDLE_POLL
|
|
* which will at least let us obtain the socket associated with the
|
|
* next winsock protocol chain entry. If this succeeds, loop around
|
|
* and call SIO_BASE_HANDLE again with the returned BSP socket, to
|
|
* make sure we unwrap all layers and retrieve the real base socket.
|
|
*/
|
|
base_socket = ws__ioctl_get_bsp_socket(socket, kNtSioBspHandlePoll);
|
|
if (base_socket != -1 && base_socket != socket) {
|
|
socket = base_socket;
|
|
} else {
|
|
RETURN_SET_ERROR(-1, error);
|
|
}
|
|
}
|
|
}
|
|
|
|
static textwindows struct SockState *sock__alloc(void) {
|
|
struct SockState *sock_state = malloc(sizeof *sock_state);
|
|
if (!sock_state) RETURN_SET_ERROR(NULL, kNtErrorNotEnoughMemory);
|
|
return sock_state;
|
|
}
|
|
|
|
static textwindows int port_register_socket(struct PortState *port_state,
|
|
struct SockState *sock_state,
|
|
int64_t socket) {
|
|
if (tree_add(&port_state->sock_tree, sock_state_to_tree_node(sock_state),
|
|
socket) < 0) {
|
|
RETURN_SET_ERROR(-1, kNtErrorAlreadyExists);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static textwindows struct SockState *sock_new(struct PortState *port_state,
|
|
int64_t socket) {
|
|
int64_t base_socket;
|
|
struct PollGroup *poll_group;
|
|
struct SockState *sock_state;
|
|
if (socket == 0 || socket == -1) RETURN_SET_ERROR(0, kNtErrorInvalidHandle);
|
|
base_socket = ws_get_base_socket(socket);
|
|
if (base_socket == -1) return NULL;
|
|
poll_group = poll_group_acquire(port_state);
|
|
if (!poll_group) return NULL;
|
|
sock_state = sock__alloc();
|
|
if (!sock_state) goto err1;
|
|
memset(sock_state, 0, sizeof *sock_state);
|
|
sock_state->base_socket = base_socket;
|
|
sock_state->poll_group = poll_group;
|
|
tree_node_init(&sock_state->tree_node);
|
|
queue_node_init(&sock_state->queue_node);
|
|
if (port_register_socket(port_state, sock_state, socket) < 0) goto err2;
|
|
return sock_state;
|
|
err2:
|
|
sock__free(sock_state);
|
|
err1:
|
|
poll_group_release(poll_group);
|
|
return NULL;
|
|
}
|
|
|
|
static textwindows int sock_set_event(struct PortState *port_state,
|
|
struct SockState *sock_state,
|
|
const struct epoll_event *ev) {
|
|
  /* EPOLLERR and EPOLLHUP are always reported, even when not requested
     by the caller. However they are disabled after an event has been
     reported for a socket for which the EPOLLONESHOT flag was set. */
uint32_t events = ev->events | EPOLLERR | EPOLLHUP;
|
|
sock_state->user_events = events;
|
|
sock_state->user_data = ev->data;
|
|
if ((events & KNOWN_EVENTS & ~sock_state->pending_events) != 0) {
|
|
port_request_socket_update(port_state, sock_state);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static textwindows int port__ctl_add(struct PortState *port_state, int64_t sock,
|
|
struct epoll_event *ev) {
|
|
struct SockState *sock_state = sock_new(port_state, sock);
|
|
if (!sock_state) return -1;
|
|
if (sock_set_event(port_state, sock_state, ev) < 0) {
|
|
sock_delete(port_state, sock_state);
|
|
return -1;
|
|
}
|
|
port__update_events_if_polling(port_state);
|
|
return 0;
|
|
}
|
|
|
|
static textwindows struct SockState *port_find_socket(
|
|
struct PortState *port_state, int64_t socket) {
|
|
struct TreeNode *tree_node = tree_find(&port_state->sock_tree, socket);
|
|
if (!tree_node) RETURN_SET_ERROR(NULL, kNtErrorNotFound);
|
|
return sock_state_from_tree_node(tree_node);
|
|
}
|
|
|
|
static textwindows int port__ctl_mod(struct PortState *port_state, int64_t sock,
|
|
struct epoll_event *ev) {
|
|
struct SockState *sock_state = port_find_socket(port_state, sock);
|
|
if (!sock_state) return -1;
|
|
if (sock_set_event(port_state, sock_state, ev) < 0) return -1;
|
|
port__update_events_if_polling(port_state);
|
|
return 0;
|
|
}
|
|
|
|
static textwindows int port__ctl_del(struct PortState *port_state,
|
|
int64_t sock) {
|
|
struct SockState *sock_state = port_find_socket(port_state, sock);
|
|
if (!sock_state) return -1;
|
|
sock_delete(port_state, sock_state);
|
|
return 0;
|
|
}
|
|
|
|
static textwindows int port__ctl_op(struct PortState *port_state, int op,
|
|
int64_t sock, struct epoll_event *ev) {
|
|
switch (op) {
|
|
case EPOLL_CTL_ADD:
|
|
return port__ctl_add(port_state, sock, ev);
|
|
case EPOLL_CTL_MOD:
|
|
return port__ctl_mod(port_state, sock, ev);
|
|
case EPOLL_CTL_DEL:
|
|
return port__ctl_del(port_state, sock);
|
|
default:
|
|
RETURN_SET_ERROR(-1, kNtErrorInvalidParameter);
|
|
}
|
|
}
|
|
|
|
static textwindows int port_ctl(struct PortState *port_state, int op,
|
|
int64_t sock, struct epoll_event *ev) {
|
|
int result;
|
|
EnterCriticalSection(&port_state->lock);
|
|
result = port__ctl_op(port_state, op, sock, ev);
|
|
LeaveCriticalSection(&port_state->lock);
|
|
return result;
|
|
}
|
|
|
|
static textwindows struct PortState *port_state_from_handle_tree_node(
|
|
struct TsTreeNode *tree_node) {
|
|
return CONTAINOF(tree_node, struct PortState, handle_tree_node);
|
|
}
|
|
|
|
static textwindows noinline int epoll_create1$nt(uint32_t flags) {
|
|
int fd;
|
|
int64_t ephnd;
|
|
struct PortState *port_state;
|
|
struct TsTreeNode *tree_node;
|
|
if ((fd = __getemptyfd()) == -1) return -1;
|
|
if (wepoll_init() < 0) return -1;
|
|
port_state = port_new(&ephnd);
|
|
if (!port_state) return -1;
|
|
tree_node = port_state_to_handle_tree_node(port_state);
|
|
if (ts_tree_add(&epoll__handle_tree, tree_node, (uintptr_t)ephnd) < 0) {
|
|
/* This should never happen. */
|
|
port_delete(port_state);
|
|
RETURN_SET_ERROR(-1, kNtErrorAlreadyExists);
|
|
}
|
|
g_fds.p[fd].kind = kFdEpoll;
|
|
g_fds.p[fd].handle = ephnd;
|
|
g_fds.p[fd].flags = flags;
|
|
return fd;
|
|
}
|
|
|
|
static textwindows noinline int epoll_ctl$nt(int epfd, int op, int fd,
|
|
struct epoll_event *ev) {
|
|
int r;
|
|
struct PortState *port_state;
|
|
struct TsTreeNode *tree_node;
|
|
if (!IsWindows()) {
|
|
return epoll_ctl$sysv(epfd, op, fd, ev);
|
|
} else {
|
|
if (wepoll_init() < 0) return -1;
|
|
if (!__isfdopen(fd)) return ebadf();
|
|
if (!__isfdkind(epfd, kFdEpoll)) return ebadf();
|
|
tree_node = ts_tree_find_and_ref(&epoll__handle_tree, g_fds.p[epfd].handle);
|
|
if (!tree_node) {
|
|
err_set_win_error(kNtErrorInvalidParameter);
|
|
goto err;
|
|
}
|
|
port_state = port_state_from_handle_tree_node(tree_node);
|
|
r = port_ctl(port_state, op, g_fds.p[fd].handle, ev);
|
|
ts_tree_node_unref(tree_node);
|
|
if (r < 0) goto err;
|
|
return 0;
|
|
err:
|
|
    /* On Linux, in the case of epoll_ctl(), EBADF takes priority over
       other errors. Wepoll mimics this behavior. */
err_check_handle(g_fds.p[epfd].handle);
|
|
err_check_handle(g_fds.p[fd].handle);
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
static textwindows noinline int epoll_wait$nt(int epfd,
|
|
struct epoll_event *events,
|
|
int maxevents, int timeoutms) {
|
|
int num_events;
|
|
struct PortState *port_state;
|
|
struct TsTreeNode *tree_node;
|
|
if (!__isfdkind(epfd, kFdEpoll)) return ebadf();
|
|
if (maxevents <= 0) return einval();
|
|
if (wepoll_init() < 0) return -1;
|
|
tree_node = ts_tree_find_and_ref(&epoll__handle_tree, g_fds.p[epfd].handle);
|
|
if (!tree_node) {
|
|
err_set_win_error(kNtErrorInvalidParameter);
|
|
goto err;
|
|
}
|
|
port_state = port_state_from_handle_tree_node(tree_node);
|
|
num_events = port_wait(port_state, events, maxevents, timeoutms);
|
|
ts_tree_node_unref(tree_node);
|
|
if (num_events < 0) goto err;
|
|
return num_events;
|
|
err:
|
|
err_check_handle(g_fds.p[epfd].handle);
|
|
return -1;
|
|
}
|
|
|
|
static textwindows noinline int close$epoll$nt(int fd) {
|
|
struct PortState *port_state;
|
|
struct TsTreeNode *tree_node;
|
|
if (wepoll_init() < 0) return -1;
|
|
tree_node = ts_tree_del_and_ref(&epoll__handle_tree, g_fds.p[fd].handle);
|
|
if (!tree_node) {
|
|
err_set_win_error(kNtErrorInvalidParameter);
|
|
goto err;
|
|
}
|
|
port_state = port_state_from_handle_tree_node(tree_node);
|
|
port_close(port_state);
|
|
ts_tree_node_unref_and_destroy(tree_node);
|
|
return port_delete(port_state);
|
|
err:
|
|
err_check_handle(g_fds.p[fd].handle);
|
|
return -1;
|
|
}
|
|
|
|
int close$epoll(int fd) {
  if (!IsWindows()) {
    return close$sysv(fd);
  } else {
    return close$epoll$nt(fd);
  }
}

/**
 * Creates new epoll instance.
 *
 * @param size is ignored but must be greater than zero
 * @return epoll file descriptor, or -1 on failure
 */
int epoll_create(int size) {
  if (size <= 0) return einval();
  return epoll_create1(0);
}

/**
 * Creates new epoll instance.
 *
 * @param flags must be zero or can have O_CLOEXEC
 * @return epoll file descriptor, or -1 on failure
 */
int epoll_create1(int flags) {
  int fd;
  if (flags & ~O_CLOEXEC) return einval();
  if (!IsWindows()) {
    return __ensurefds(fixupnewfd$sysv(epoll_create$sysv(1337), flags));
  } else {
    return epoll_create1$nt(flags);
  }
}

/**
 * Controls which socket events are monitored.
 *
 * It is recommended to always explicitly remove a socket from its epoll
 * set using EPOLL_CTL_DEL before closing it. As on Linux, your closed
 * sockets are automatically removed from the epoll set, but wepoll may
 * not be able to detect that a socket was closed until the next call to
 * epoll_wait().
 *
 * @param epfd is file descriptor created by epoll_create()
 * @param op can be EPOLL_CTL_{ADD,MOD,DEL}
 * @param fd is file descriptor to monitor
 * @param ev is ignored if op is EPOLL_CTL_DEL
 * @param ev->events can have these flags:
 *     - EPOLLIN: trigger on fd readable
 *     - EPOLLOUT: trigger on fd writeable
 *     - EPOLLERR: trigger on fd error (superfluous: always reported)
 *     - EPOLLHUP: trigger on fd remote hangup (superfluous: always reported)
 *     - EPOLLPRI: trigger on fd exceptional conditions, e.g. oob
 *     - EPOLLONESHOT: report event(s) only once
 *     - EPOLLEXCLUSIVE: not supported on windows
 *     - EPOLLWAKEUP: not supported on windows
 *     - EPOLLET: edge triggered mode (not supported on windows)
 *     - EPOLLRDNORM
 *     - EPOLLRDBAND
 *     - EPOLLWRNORM
 *     - EPOLLWRBAND
 *     - EPOLLRDHUP
 *     - EPOLLMSG
 * @error ENOTSOCK on Windows if fd isn't a socket :(
 * @return 0 on success, or -1 w/ errno
 */
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *ev) {
  if (!IsWindows()) {
    return epoll_ctl$sysv(epfd, op, fd, ev);
  } else {
    return epoll_ctl$nt(epfd, op, fd, ev);
  }
}

/**
 * Receives socket events.
 *
 * @param events will receive information about what happened
 * @param maxevents is array length of events
 * @param timeoutms is milliseconds, 0 to not block, or -1 for forever
 * @return number of events stored, 0 on timeout, or -1 w/ errno
 */
int epoll_wait(int epfd, struct epoll_event *events, int maxevents,
               int timeoutms) {
  if (!IsWindows()) {
    return epoll_wait$sysv(epfd, events, maxevents, timeoutms);
  } else {
    return epoll_wait$nt(epfd, events, maxevents, timeoutms);
  }
}
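
/*
 * Example: a minimal event loop against this API. This is an illustrative
 * sketch, not code from this file; it assumes `listener` is a socket that
 * has already been created, bound, and marked listening, and it omits
 * error handling on accept()/read() for brevity. Note that the Windows
 * polyfill only supports sockets, so registering other kinds of file
 * descriptors fails with ENOTSOCK.
 *
 *   struct epoll_event ev, evs[64];
 *   int epfd = epoll_create1(0);
 *   ev.events = EPOLLIN;
 *   ev.data.fd = listener;
 *   epoll_ctl(epfd, EPOLL_CTL_ADD, listener, &ev);
 *   for (;;) {
 *     int n = epoll_wait(epfd, evs, 64, -1);
 *     if (n == -1) break;
 *     for (int i = 0; i < n; ++i) {
 *       if (evs[i].data.fd == listener) {
 *         ... accept() and EPOLL_CTL_ADD the new connection ...
 *       } else {
 *         ... read() and, when finished, EPOLL_CTL_DEL before close() ...
 *       }
 *     }
 *   }
 *   close(epfd);
 */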