Unlocking Linux IO Multiplexing: How select, poll, and epoll Really Work
This article explains the Linux kernel's wait‑queue mechanism, the sleep and wake‑up logic behind socket events, and how the IO‑multiplexing models select, poll, and epoll are implemented, including their core code paths, performance trade‑offs, and edge versus level triggering.
Linux Kernel Wait‑Queue Mechanism
The kernel uses a doubly linked list called a wait queue to suspend a blocked task; when the awaited condition becomes true, the task's wait entry is removed from the queue and the task is placed back on the CPU run queue for scheduling.
Sleep Logic
<code>#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) ({
// Core sleep primitive behind every wait_event*() variant: queue the
// current task on wq_head and loop until `condition` holds or (for
// interruptible states) a signal arrives.
// NOTE(review): excerpt — the backslash line continuations of the real
// kernel macro (include/linux/wait.h) are omitted here.
__label__ __out;
struct wait_queue_entry __wq_entry;
long __ret = ret;
// Exclusive waiters are woken one at a time by __wake_up_common().
init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);
for (;;) {
// Enqueue the entry and set the task state; __int is nonzero when an
// interruptible wait was cut short by a pending signal.
long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);
// Re-check the condition AFTER queuing to close the lost-wakeup race.
if (condition)
break;
if (___wait_is_interruptible(state) && __int) {
__ret = __int;
goto __out;
}
cmd; // typically schedule(): give up the CPU until woken
}
// Condition met: restore TASK_RUNNING and unlink from the wait queue.
finish_wait(&wq_head, &__wq_entry);
__out: __ret;
})</code>Wake‑up Logic
<code>static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, int wake_flags, void *key,
wait_queue_entry_t *bookmark)
{
list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
unsigned flags = curr->flags;
int ret;
if (flags & WQ_FLAG_BOOKMARK)
continue;
ret = curr->func(curr, mode, wake_flags, key);
if (ret < 0)
break;
if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
(&next->entry != &wq_head->head)) {
bookmark->flags = WQ_FLAG_BOOKMARK;
list_add_tail(&bookmark->entry, &next->entry);
break;
}
}
return nr_exclusive;
}
// One waiter on a wait queue.
struct wait_queue_entry {
unsigned int flags;     // WQ_FLAG_EXCLUSIVE, WQ_FLAG_BOOKMARK, ...
void *private;          // opaque data for func — a task, or e.g. a
                        // poll_wqueues pointer as used by __pollwake()
wait_queue_func_t func; // wake callback, e.g. autoremove_wake_function
struct list_head entry; // links this entry into wait_queue_head.head
};
// Standard wake callback: wake the task and, only if it was actually
// woken (ret != 0), unlink the entry from the wait queue so finish_wait()
// has nothing left to remove.
int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wq_entry, mode, sync, key);
if (ret)
list_del_init(&wq_entry->entry);
return ret;
}
EXPORT_SYMBOL(autoremove_wake_function);
</code>IO Multiplexing Model Overview
Before diving into select/poll/epoll, the article reviews the concept of IO multiplexing: a single process handling many socket descriptors, sleeping when none are readable and being woken up when at least one becomes readable.
IO Multiplexing Essence
In communication engineering, multiplexing means transmitting multiple signals over a single channel to improve resource utilization; in the kernel it means a process can monitor N sockets and act only when a socket is ready.
Select Implementation
Select monitors a fixed‑size fd set (capped at FD_SETSIZE — 1024 on Linux, a compile‑time constant rather than a tunable default) and on every call copies the entire set between user and kernel space and linearly scans all descriptors, so each invocation pays both a copy and a full O(n) scan.
<code>int select(int maxfd1, fd_set *readset, fd_set *writeset,
fd_set *exceptset, const struct timeval *timeout);
// maxfd1 is the highest-numbered descriptor plus one. Each fd_set is
// both input (fds to watch) and output (fds found ready), so callers
// must re-initialize the sets before every call.
// pselect adds nanosecond timeout resolution and atomically installs
// sigmask for the duration of the wait.
int pselect(int maxfd1, fd_set *readset, fd_set *writeset,
fd_set *exceptset, const struct timespec *timeout,
const sigset_t *sigmask);
</code> <code>static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
struct poll_wqueues table;
poll_table *wait;
// Register this task's wait-queue entries so device poll handlers
// can wake us via __pollwake() when an fd becomes ready.
poll_initwait(&table);
wait = &table.pt;
// ... copy fd set from user, prepare poll, loop over fds ...
// Scan/sleep loop: check every fd, sleep if none ready, rescan on wakeup.
for (;;) {
// check each fd for readiness
if (condition_met) {
// update event mask
fd_sock.event |= POLLIN;
}
// Exit once something is ready, the timeout expired, or a signal hit.
if (retval || timed_out || signal_pending(current))
break;
// Sleep in TASK_INTERRUPTIBLE until a poll wakeup or the timeout;
// a zero return means the timeout elapsed.
if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
// Unregister from every wait queue joined during the scan.
poll_freewait(&table);
// NOTE(review): abridged pseudocode — `condition_met`, `fd_sock`,
// `retval`, `to`, `slack` and the final `return retval` are omitted.
}
</code> <code>static int __pollwake(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
// Wake callback for select/poll waiters: here `private` holds the
// poll_wqueues of the task sleeping in poll_schedule_timeout().
struct poll_wqueues *pwq = wait->private;
DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
// Publish `triggered` before waking so the poller re-scans its fds.
smp_wmb();
pwq->triggered = 1;
return default_wake_function(&dummy_wait, mode, sync, key);
}
</code>Poll Implementation
Poll replaces the fixed‑size fd_set with a caller‑supplied array of pollfd structures (chained into a linked list of pages inside the kernel), which removes the FD_SETSIZE limit — but it still copies every descriptor between user and kernel space and rescans the whole list on each call, so its per‑call cost grows linearly just like select's.
<code>int poll(struct pollfd *fds, unsigned long nfds, int timeout); // timeout in milliseconds; -1 blocks indefinitely
</code> <code>static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec64 *end_time)
{
// copy from user, init wait queue
poll_initwait(&table);
// Same scan/sleep loop as select, but over a pollfd list with no
// FD_SETSIZE cap.
fdcount = do_poll(head, &table, end_time);
poll_freewait(&table);
// NOTE(review): abridged — copying revents back to user space and the
// `return fdcount` are omitted.
}
</code>Epoll Implementation
Epoll removes the per‑call copy overhead and the descriptor limit by keeping the interest set inside the kernel: descriptors are registered once via epoll_ctl into a red‑black tree, and descriptors with pending events are collected on a separate ready list, so epoll_wait only has to look at fds that actually have events instead of rescanning everything.
<code>int epoll_create(int size);  // size must be > 0 but is ignored since Linux 2.6.8
int epoll_create1(int flags);   // e.g. EPOLL_CLOEXEC
// op is EPOLL_CTL_ADD / EPOLL_CTL_MOD / EPOLL_CTL_DEL — registration
// happens once here, not on every wait.
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
// Blocks until ready events are available; fills at most maxevents
// entries and returns the number filled.
int epoll_wait(int epfd, struct epoll_event *events,
int maxevents, int timeout);
</code> <code>static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
struct file *tfile, int fd, int full_check)
{
struct epitem *epi;
// initialize epitem, link to epoll instance
// NOTE(review): abridged — allocation of epi and the initial poll that
// computes `revents` are omitted in this excerpt.
epi->event = *event;
// add to red‑black tree of registered fds (O(log n) insert/lookup)
ep_rbtree_insert(ep, epi);
// if the fd is already ready, add it to the ready list and wake any
// task blocked in epoll_wait() on this instance
if (revents && !ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
wake_up(&ep->wq);
}
return 0;
}
</code> <code>static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
// Sleep until the ready list is non-empty, a signal arrives, or the
// timeout expires — no per-fd scan is needed, unlike select/poll.
for (;;) {
if (ep_events_available(ep))
break;
if (signal_pending(current)) {
res = -EINTR;
break;
}
// High-resolution sleep; a zero return means the timer expired.
if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
timed_out = 1;
break;
}
}
// Copy the ready events out to user space.
ep_send_events(ep, events, maxevents);
// NOTE(review): abridged — wait-queue setup, declarations of `res`,
// `to`, `slack`, and the return value are omitted.
}
</code>Edge vs Level Triggering
Level‑triggered (LT) mode reports an fd on every epoll_wait call for as long as its buffer stays readable/writable, so an unhandled event keeps firing. Edge‑triggered (ET) mode reports only the transition from not‑ready to ready, which reduces wake‑ups — but the application must then drain the fd completely (e.g. read until EAGAIN), or it may never be notified about the remaining data.
Xiaokun's Architecture Exploration Notes
10 years of backend architecture design | AI engineering infrastructure, storage architecture design, and performance optimization | Former senior developer at NetEase, Douyu, Inke, etc.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.