Unlocking Linux IO Multiplexing: How select, poll, and epoll Really Work
This article explains the Linux kernel's wait‑queue mechanism, the sleep and wake‑up logic behind socket events, and how the IO‑multiplexing models select, poll, and epoll are implemented, including their core code paths, performance trade‑offs, and edge versus level triggering.
Linux Kernel Wait‑Queue Mechanism
The kernel uses a doubly linked list called a wait queue to suspend a blocked task; when the awaited condition becomes true, the task's wait entry is removed from the queue and the task is placed back on the CPU run queue for scheduling.
Sleep Logic
<code>#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) ({
// Core sleep primitive behind every wait_event*() variant: queue the
// current task on wq_head and loop until `condition` holds or (for
// interruptible states) a signal arrives.
// NOTE(review): excerpt — the backslash line continuations of the real
// kernel macro (include/linux/wait.h) are omitted here.
__label__ __out;
struct wait_queue_entry __wq_entry;
long __ret = ret;
// Exclusive waiters are woken one at a time by __wake_up_common().
init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);
for (;;) {
// Enqueue the entry and set the task state; __int is nonzero when an
// interruptible wait was cut short by a pending signal.
long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);
// Re-check the condition AFTER queuing to close the lost-wakeup race.
if (condition)
break;
if (___wait_is_interruptible(state) && __int) {
__ret = __int;
goto __out;
}
cmd; // typically schedule(): give up the CPU until woken
}
// Condition met: restore TASK_RUNNING and unlink from the wait queue.
finish_wait(&wq_head, &__wq_entry);
__out: __ret;
})</code>Wake‑up Logic
<code>static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, int wake_flags, void *key,
wait_queue_entry_t *bookmark)
{
list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
unsigned flags = curr->flags;
int ret;
if (flags & WQ_FLAG_BOOKMARK)
continue;
ret = curr->func(curr, mode, wake_flags, key);
if (ret < 0)
break;
if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
(&next->entry != &wq_head->head)) {
bookmark->flags = WQ_FLAG_BOOKMARK;
list_add_tail(&bookmark->entry, &next->entry);
break;
}
}
return nr_exclusive;
}
// One waiter on a wait queue.
struct wait_queue_entry {
unsigned int flags;     // WQ_FLAG_EXCLUSIVE, WQ_FLAG_BOOKMARK, ...
void *private;          // opaque data for func — a task, or e.g. a
                        // poll_wqueues pointer as used by __pollwake()
wait_queue_func_t func; // wake callback, e.g. autoremove_wake_function
struct list_head entry; // links this entry into wait_queue_head.head
};
// Standard wake callback: wake the task and, only if it was actually
// woken (ret != 0), unlink the entry from the wait queue so finish_wait()
// has nothing left to remove.
int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wq_entry, mode, sync, key);
if (ret)
list_del_init(&wq_entry->entry);
return ret;
}
EXPORT_SYMBOL(autoremove_wake_function);
</code>IO Multiplexing Model Overview
Before diving into select/poll/epoll, the article reviews the concept of IO multiplexing: a single process handling many socket descriptors, sleeping when none are readable and being woken up when at least one becomes readable.
IO Multiplexing Essence
In communication engineering, multiplexing means transmitting multiple signals over a single channel to improve resource utilization; in the kernel it means a process can monitor N sockets and act only when a socket is ready.
Select Implementation
Select monitors a fixed‑size fd set (capped at FD_SETSIZE — 1024 on Linux, a compile‑time constant rather than a tunable default) and on every call copies the entire set between user and kernel space and linearly scans all descriptors, so each invocation pays both a copy and a full O(n) scan.
<code>int select(int maxfd1, fd_set *readset, fd_set *writeset,
fd_set *exceptset, const struct timeval *timeout);
// maxfd1 is the highest-numbered descriptor plus one. Each fd_set is
// both input (fds to watch) and output (fds found ready), so callers
// must re-initialize the sets before every call.
// pselect adds nanosecond timeout resolution and atomically installs
// sigmask for the duration of the wait.
int pselect(int maxfd1, fd_set *readset, fd_set *writeset,
fd_set *exceptset, const struct timespec *timeout,
const sigset_t *sigmask);
</code> <code>static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
struct poll_wqueues table;
poll_table *wait;
// Register this task's wait-queue entries so device poll handlers
// can wake us via __pollwake() when an fd becomes ready.
poll_initwait(&table);
wait = &table.pt;
// ... copy fd set from user, prepare poll, loop over fds ...
// Scan/sleep loop: check every fd, sleep if none ready, rescan on wakeup.
for (;;) {
// check each fd for readiness
if (condition_met) {
// update event mask
fd_sock.event |= POLLIN;
}
// Exit once something is ready, the timeout expired, or a signal hit.
if (retval || timed_out || signal_pending(current))
break;
// Sleep in TASK_INTERRUPTIBLE until a poll wakeup or the timeout;
// a zero return means the timeout elapsed.
if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
// Unregister from every wait queue joined during the scan.
poll_freewait(&table);
// NOTE(review): abridged pseudocode — `condition_met`, `fd_sock`,
// `retval`, `to`, `slack` and the final `return retval` are omitted.
}
</code> <code>static int __pollwake(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
// Wake callback for select/poll waiters: here `private` holds the
// poll_wqueues of the task sleeping in poll_schedule_timeout().
struct poll_wqueues *pwq = wait->private;
DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
// Publish `triggered` before waking so the poller re-scans its fds.
smp_wmb();
pwq->triggered = 1;
return default_wake_function(&dummy_wait, mode, sync, key);
}
</code>Poll Implementation
Poll replaces the fixed‑size fd_set with a caller‑supplied array of pollfd structures (chained into a linked list of pages inside the kernel), which removes the FD_SETSIZE limit — but it still copies every descriptor between user and kernel space and rescans the whole list on each call, so its per‑call cost grows linearly just like select's.
<code>int poll(struct pollfd *fds, unsigned long nfds, int timeout); // timeout in milliseconds; -1 blocks indefinitely
</code> <code>static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec64 *end_time)
{
// copy from user, init wait queue
poll_initwait(&table);
// Same scan/sleep loop as select, but over a pollfd list with no
// FD_SETSIZE cap.
fdcount = do_poll(head, &table, end_time);
poll_freewait(&table);
// NOTE(review): abridged — copying revents back to user space and the
// `return fdcount` are omitted.
}
</code>Epoll Implementation
Epoll removes the per‑call copy overhead and the descriptor limit by keeping the interest set inside the kernel: descriptors are registered once via epoll_ctl into a red‑black tree, and descriptors with pending events are collected on a separate ready list, so epoll_wait only has to look at fds that actually have events instead of rescanning everything.
<code>int epoll_create(int size);  // size must be > 0 but is ignored since Linux 2.6.8
int epoll_create1(int flags);   // e.g. EPOLL_CLOEXEC
// op is EPOLL_CTL_ADD / EPOLL_CTL_MOD / EPOLL_CTL_DEL — registration
// happens once here, not on every wait.
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
// Blocks until ready events are available; fills at most maxevents
// entries and returns the number filled.
int epoll_wait(int epfd, struct epoll_event *events,
int maxevents, int timeout);
</code> <code>static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
struct file *tfile, int fd, int full_check)
{
struct epitem *epi;
// initialize epitem, link to epoll instance
// NOTE(review): abridged — allocation of epi and the initial poll that
// computes `revents` are omitted in this excerpt.
epi->event = *event;
// add to red‑black tree of registered fds (O(log n) insert/lookup)
ep_rbtree_insert(ep, epi);
// if the fd is already ready, add it to the ready list and wake any
// task blocked in epoll_wait() on this instance
if (revents && !ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
wake_up(&ep->wq);
}
return 0;
}
</code> <code>static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
// Sleep until the ready list is non-empty, a signal arrives, or the
// timeout expires — no per-fd scan is needed, unlike select/poll.
for (;;) {
if (ep_events_available(ep))
break;
if (signal_pending(current)) {
res = -EINTR;
break;
}
// High-resolution sleep; a zero return means the timer expired.
if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
timed_out = 1;
break;
}
}
// Copy the ready events out to user space.
ep_send_events(ep, events, maxevents);
// NOTE(review): abridged — wait-queue setup, declarations of `res`,
// `to`, `slack`, and the return value are omitted.
}
</code>Edge vs Level Triggering
Level‑triggered (LT) mode reports an fd on every epoll_wait call for as long as its buffer stays readable/writable, so an unhandled event keeps firing. Edge‑triggered (ET) mode reports only the transition from not‑ready to ready, which reduces wake‑ups — but the application must then drain the fd completely (e.g. read until EAGAIN), or it may never be notified about the remaining data.
Xiaokun's Architecture Exploration Notes
10 years of backend architecture design | AI engineering infrastructure, storage architecture design, and performance optimization | Former senior developer at NetEase, Douyu, Inke, etc.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.