Fundamentals · 24 min read

Unlocking Linux IO Multiplexing: How select, poll, and epoll Really Work

This article explains the Linux kernel's wait‑queue mechanism, the sleep and wake‑up logic behind socket events, and how the IO‑multiplexing models select, poll, and epoll are implemented, including their core code paths, performance trade‑offs, and edge versus level triggering.

Xiaokun's Architecture Exploration Notes
Unlocking Linux IO Multiplexing: How select, poll, and epoll Really Work

Linux Kernel Wait‑Queue Mechanism

The kernel uses a doubly linked list called a wait queue to suspend a blocked task; when the awaited condition becomes true, the task is removed from the queue and placed on the CPU run-queue for scheduling.

Sleep Logic
<code>/*
 * Simplified kernel sleep path: the current task parks itself on wq_head
 * and re-checks `condition` each time it is woken, until the condition
 * holds or a signal interrupts an interruptible sleep.
 * NOTE(review): illustrative excerpt — the real kernel macro uses
 * backslash line continuations, and init_wait_entry(),
 * prepare_to_wait_event(), finish_wait() are defined elsewhere.
 */
#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) ({
    __label__ __out;
    struct wait_queue_entry __wq_entry;    /* this task's entry on the wait queue */
    long __ret = ret;
    /* Exclusive waiters are woken one at a time (avoids a thundering herd). */
    init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);
    for (;;) {
        /* Enqueue (if not already queued) and set the task state; the
         * return value is nonzero when a signal is already pending. */
        long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);
        if (condition)
            break;    /* condition became true: stop waiting */
        if (___wait_is_interruptible(state) && __int) {
            __ret = __int;    /* interrupted by a signal: report it to the caller */
            goto __out;
        }
        cmd; // schedule() — yield the CPU until a wake-up
    }
    /* Back to TASK_RUNNING and unlink from the queue. */
    finish_wait(&wq_head, &__wq_entry);
    __out: __ret;
})</code>
Wake‑up Logic
<code>/*
 * Wake-up side: walk the wait queue and invoke each entry's callback
 * (curr->func, e.g. autoremove_wake_function) until enough exclusive
 * waiters have been woken.
 * NOTE(review): excerpt — the declarations of curr, next, and cnt are
 * elided from the quoted kernel source.
 */
static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
            int nr_exclusive, int wake_flags, void *key,
            wait_queue_entry_t *bookmark)
{
    list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
        unsigned flags = curr->flags;
        int ret;
        if (flags & WQ_FLAG_BOOKMARK)
            continue;    /* placeholder entry, not a real waiter */
        /* Per-entry wake callback; it may unlink the entry from the queue. */
        ret = curr->func(curr, mode, wake_flags, key);
        if (ret < 0)
            break;
        /* Each successful exclusive wake-up consumes one unit of the
         * budget; stop once nr_exclusive waiters have been woken. */
        if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
        /* On very long queues, drop a bookmark and bail out so the
         * caller can resume the scan later from this position. */
        if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
                (&next->entry != &wq_head->head)) {
            bookmark->flags = WQ_FLAG_BOOKMARK;
            list_add_tail(&bookmark->entry, &next->entry);
            break;
        }
    }
    return nr_exclusive;
}

/* One waiter's node on a wait queue. */
struct wait_queue_entry {
    unsigned int flags;        /* WQ_FLAG_EXCLUSIVE, WQ_FLAG_BOOKMARK, ... */
    void *private;             /* presumably the waiting task — see default_wake_function usage */
    wait_queue_func_t func;    /* wake callback invoked by __wake_up_common() */
    struct list_head entry;    /* link into wait_queue_head's list */
};

/*
 * Default wake callback used by ___wait_event-style sleepers: wake the
 * task and, on success, unlink the entry so it cannot be woken twice.
 */
int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
{
    int ret = default_wake_function(wq_entry, mode, sync, key);
    if (ret)
        list_del_init(&wq_entry->entry);    /* woken: drop from the wait queue */
    return ret;
}
EXPORT_SYMBOL(autoremove_wake_function);
</code>

IO Multiplexing Model Overview

Before diving into select/poll/epoll, the article reviews the concept of IO multiplexing: a single process handling many socket descriptors, sleeping when none are readable and being woken up when at least one becomes readable.

IO Multiplexing Essence

In communication engineering, multiplexing means transmitting multiple signals over a single channel to improve resource utilization; in the kernel it means a process can monitor N sockets and act only when a socket is ready.

Select Implementation

Select monitors a fixed‑size fd set (max 1024 by default) and copies the set between user and kernel space on each call, leading to high overhead.

<code>/*
 * select(2): block until a descriptor in one of the three fd_set
 * bitmaps becomes ready, or the timeout expires. maxfd1 is the
 * highest-numbered monitored descriptor plus one.
 */
int select(int maxfd1, fd_set *readset, fd_set *writeset,
           fd_set *exceptset, const struct timeval *timeout);

/* pselect(2): like select, but installs sigmask atomically for the
 * duration of the wait and takes a nanosecond-resolution timespec. */
int pselect(int maxfd1, fd_set *readset, fd_set *writeset,
            fd_set *exceptset, const struct timespec *timeout,
            const sigset_t *sigmask);
</code>
<code>/*
 * Kernel core of select(2) (heavily elided pseudocode): scan every fd
 * up to n on each pass; if none is ready, sleep until a socket wake-up,
 * timeout, or signal, then rescan the whole set.
 * NOTE(review): condition_met / fd_sock / retval / timed_out / to /
 * slack stand in for elided kernel code — they are not declared here.
 */
static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
    struct poll_wqueues table;
    poll_table *wait;
    poll_initwait(&table);    /* installs __pollwake as the wake callback */
    wait = &table.pt;
    // ... copy fd set from user, prepare poll, loop over fds ...
    for (;;) {
        // check each fd for readiness
        if (condition_met) {
            // update event mask for this fd
            fd_sock.event |= POLLIN;
        }
        /* Stop when something is ready, time ran out, or a signal arrived. */
        if (retval || timed_out || signal_pending(current))
            break;
        /* Nothing ready: sleep; a zero return means the timeout fired. */
        if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack))
            timed_out = 1;
    }
    poll_freewait(&table);    /* unhook from every socket's wait queue */
}
</code>
<code>/*
 * Wake callback hooked onto each polled socket's wait queue: flag the
 * poll_wqueues as triggered, then wake the task blocked in select/poll.
 */
static int __pollwake(wait_queue_entry_t *wait, unsigned mode,
                       int sync, void *key)
{
    struct poll_wqueues *pwq = wait->private;
    DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
    /* Write barrier: make earlier stores visible before the poller
     * observes triggered == 1. */
    smp_wmb();
    pwq->triggered = 1;
    return default_wake_function(&dummy_wait, mode, sync, key);
}
</code>

Poll Implementation

Poll replaces the fixed array with a linked list, removing the FD_SETSIZE limit but still suffers from large user‑kernel copies and full‑queue traversal.

<code>/* poll(2): like select but takes an array of pollfd entries, so there
 * is no FD_SETSIZE cap; timeout is in milliseconds.
 * NOTE(review): POSIX declares nfds as nfds_t — confirm against the
 * target libc headers. */
int poll(struct pollfd *fds, unsigned long nfds, int timeout);
</code>
<code>/*
 * Kernel entry for poll(2) (elided): copy the pollfd array in from
 * user space, wait in the shared do_poll loop, then tear down the
 * wait-queue hooks.
 * NOTE(review): the declarations of table, head, and fdcount are
 * elided from this excerpt.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                       struct timespec64 *end_time)
{
    // copy from user, init wait queue
    poll_initwait(&table);
    fdcount = do_poll(head, &table, end_time);
    poll_freewait(&table);
}
</code>

Epoll Implementation

Epoll solves the copy‑overhead and FD limit by creating an epoll instance that stores a single kernel‑side data structure (a red‑black tree) for all registered sockets.

<code>/* Create an epoll instance; `size` is a historical hint (ignored since
 * Linux 2.6.8 but must be > 0). epoll_create1 takes flags instead,
 * e.g. EPOLL_CLOEXEC. */
int epoll_create(int size);
int epoll_create1(int flags);
/* Register, modify, or remove (op) interest in fd on instance epfd. */
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
/* Block until up to maxevents registered events are ready or timeout
 * (milliseconds) elapses; ready events are written into `events`. */
int epoll_wait(int epfd, struct epoll_event *events,
               int maxevents, int timeout);
</code>
<code>/*
 * EPOLL_CTL_ADD path (elided): wrap the target file in an epitem,
 * insert it into the instance's red-black tree, and if the fd is
 * already ready, queue it on the ready list and wake epoll_wait-ers.
 * NOTE(review): epitem allocation and the computation of `revents`
 * are elided from this excerpt.
 */
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
                    struct file *tfile, int fd, int full_check)
{
    struct epitem *epi;
    // initialize epitem, link to epoll instance
    epi->event = *event;
    // add to the red-black tree keyed by (file, fd)
    ep_rbtree_insert(ep, epi);
    // if ready, add to ready list and wake up waiters
    if (revents && !ep_is_linked(epi)) {
        list_add_tail(&epi->rdllink, &ep->rdllist);
        wake_up(&ep->wq);    /* wake tasks blocked in epoll_wait() */
    }
    return 0;
}
</code>
<code>/*
 * Core of epoll_wait(2) (elided): sleep until the ready list is
 * non-empty, a signal arrives, or the timeout fires, then copy ready
 * events to user space. Unlike select/poll, it checks only the ready
 * list rather than rescanning every registered fd.
 * NOTE(review): the declarations of res, to, slack, and timed_out are
 * elided from this excerpt.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                   int maxevents, long timeout)
{
    for (;;) {
        if (ep_events_available(ep))
            break;    /* ready list non-empty: go deliver events */
        if (signal_pending(current)) {
            res = -EINTR;    /* interrupted by a signal */
            break;
        }
        /* Sleep; a zero return means the high-resolution timer expired. */
        if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
            timed_out = 1;
            break;
        }
    }
    ep_send_events(ep, events, maxevents);    /* copy ready events to user space */
}
</code>

Edge vs Level Triggering

Level‑triggered events fire as long as the socket buffer remains readable/writable, causing repeated wake‑ups. Edge‑triggered events fire only when the buffer state changes, reducing the number of wake‑ups.

Level trigger diagram
Edge trigger diagram
Tags: kernel, Linux, epoll, poll, select, IO multiplexing, wait queue
Xiaokun's Architecture Exploration Notes
Written by

Xiaokun's Architecture Exploration Notes

10 years of backend architecture design | AI engineering infrastructure, storage architecture design, and performance optimization | Former senior developer at NetEase, Douyu, Inke, etc.

0 followers
Reader feedback

How this landed with the community

Sign in to like

Rate this article

Was this worth your time?

Sign in to rate
Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.