Fundamentals 21 min read

Deep Dive into Linux epoll: Data Structures, Core Functions, and Implementation Details

This article provides a comprehensive analysis of Linux's epoll mechanism, covering its fundamental data structures, key kernel functions, wait‑queue handling, event insertion, wake‑up logic, and the differences between edge‑triggered and level‑triggered modes, while including full source code excerpts for reference.

360 Tech Engineering

Dec 20, 2019

Deep Dive into Linux epoll: Data Structures, Core Functions, and Implementation Details

epoll is Linux's I/O multiplexing implementation that uses more complex kernel data structures than select, but its core principles are straightforward; this article extracts the essential concepts and walks through the implementation details.

Primary Data Structures

The central structure is struct eventpoll, which contains a mutex, wait queues, a red‑black tree for monitored file descriptors, a list for ready events, and various helper fields. The definition is shown below:

struct eventpoll {
    struct mutex mtx;               // protects the structure
    wait_queue_head_t wq;            // wait queue for epoll_wait
    wait_queue_head_t poll_wait;    // poll wait queue for file->poll()
    struct list_head rdllist;       // list of ready fds
    rwlock_t lock;                  // protects rdllist and ovflist
    struct rb_root_cached rbr;      // red‑black tree of monitored fds
    struct epitem *ovflist;         // overflow list for ready fds added during copy
    struct wakeup_source *ws;       // wake‑up source for EPOLLWAKEUP
    struct user_struct *user;      // user who created the epoll descriptor
    struct file *file;               // associated file object
    int visited;
    struct list_head visited_list_link;
#ifdef CONFIG_NET_RX_BUSY_POLL
    unsigned int napi_id;
#endif
};

The struct epitem represents each monitored file descriptor and links it into the red‑black tree and ready‑list:

struct epitem {
    union {
        struct rb_node rbn;   // RB‑tree node
        struct rcu_head rcu;  // for RCU cleanup
    };
    struct list_head rdllink;   // link in rdllist
    struct epitem *next;         // link in ovflist
    struct epoll_filefd ffd;    // fd information
    int nwait;                  // number of wait queues attached
    struct list_head pwqlist;    // list of poll wait queues
    struct eventpoll *ep;        // back‑pointer to eventpoll
    struct list_head fllink;    // link in file's list
    struct wakeup_source __rcu *ws; // wake‑up source if EPOLLWAKEUP
    struct epoll_event event;   // user‑specified events
};

The struct epoll_event passed to epoll_ctl carries the event mask and user data. Two definitions exist: one in the kernel (

struct epoll_event { __poll_t events; __u64 data; } EPOLL_PACKED;

) and one in glibc (

typedef union epoll_data { void *ptr; int fd; uint32_t u32; uint64_t u64; } epoll_data_t; struct epoll_event { uint32_t events; epoll_data_t data; } __EPOLL_PACKED;

Main Functions epoll_create creates an epoll instance, allocating an eventpoll object, obtaining an unused file descriptor, and linking the object to an anonymous inode. The core implementation is:

static int do_epoll_create(int flags)
{
    int error, fd;
    struct eventpoll *ep = NULL;
    struct file *file;
    BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
    if (flags & ~EPOLL_CLOEXEC)
        return -EINVAL;
    error = ep_alloc(&ep);
    if (error < 0)
        return error;
    fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
    if (fd < 0) {
        error = fd;
        goto out_free_ep;
    }
    file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
                 O_RDWR | (flags & O_CLOEXEC));
    if (IS_ERR(file)) {
        error = PTR_ERR(file);
        goto out_free_fd;
    }
    ep->file = file;
    fd_install(fd, file);
    return fd;
out_free_fd:
    put_unused_fd(fd);
out_free_ep:
    ep_free(ep);
    return error;
}

epoll_ctl

adds, modifies, or deletes a file descriptor from an epoll set. It validates arguments, copies the user‑provided epoll_event, checks for poll support, handles EPOLLEXCLUSIVE, and calls helper functions such as ep_insert, ep_remove, or ep_modify:

SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
        struct epoll_event __user *, event)
{
    int error = -EFAULT;
    struct fd f, tf;
    struct eventpoll *ep;
    struct epitem *epi;
    struct epoll_event epds;
    if (ep_op_has_event(op) &&
        copy_from_user(&epds, event, sizeof(struct epoll_event)))
        goto error_return;
    f = fdget(epfd);
    if (!f.file)
        goto error_return;
    tf = fdget(fd);
    if (!tf.file)
        goto error_fput;
    if (!file_can_poll(tf.file)) {
        error = -EPERM;
        goto error_tgt_fput;
    }
    // ... EPOLLEXCLUSIVE handling omitted for brevity ...
    ep = f.file->private_data;
    epi = ep_find(ep, tf.file, fd);
    error = -EINVAL;
    switch (op) {
    case EPOLL_CTL_ADD:
        if (!epi) {
            epds.events |= EPOLLERR | EPOLLHUP;
            error = ep_insert(ep, &epds, tf.file, fd, full_check);
        } else {
            error = -EEXIST;
        }
        break;
    case EPOLL_CTL_DEL:
        error = epi ? ep_remove(ep, epi) : -ENOENT;
        break;
    case EPOLL_CTL_MOD:
        if (epi) {
            if (!(epi->event.events & EPOLLEXCLUSIVE)) {
                epds.events |= EPOLLERR | EPOLLHUP;
                error = ep_modify(ep, epi, &epds);
            }
        } else {
            error = -ENOENT;
        }
        break;
    }
    // ... error handling and return ...
    return error;
}

ep_insert

allocates an epitem, initializes its list heads, sets the event mask, and, if EPOLLWAKEUP is requested, creates a wake‑up source. It then links the item into the red‑black tree and, if the event is already ready, adds it to the ready list.

if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
    return -ENOMEM;
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
if (epi->event.events & EPOLLWAKEUP) {
    error = ep_create_wakeup_source(epi);
    if (error)
        goto error_create_wakeup_source;
} else {
    RCU_INIT_POINTER(epi->ws, NULL);
}

The waiting mechanism relies on Linux wait queues. When epoll_wait (implemented by ep_poll) is called, the current task is added to eventpoll->wq via __add_wait_queue_exclusive. The kernel then enters an infinite loop that sleeps until either a signal arrives, a ready event is detected, or the timeout expires.

if (!waiter) {
    waiter = true;
    init_waitqueue_entry(&wait, current);
    spin_lock_irq(&ep->wq.lock);
    __add_wait_queue_exclusive(&ep->wq, &wait);
    spin_unlock_irq(&ep->wq.lock);
}
for (;;) {
    set_current_state(TASK_INTERRUPTIBLE);
    if (fatal_signal_pending(current)) { res = -EINTR; break; }
    eavail = ep_events_available(ep);
    if (eavail) break;
    if (signal_pending(current)) { res = -EINTR; break; }
    if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
        timed_out = 1; break;
    }
}

When a monitored file descriptor becomes ready, ep_poll_callback is invoked via the wait‑queue entry added by ep_ptable_queue_proc. The callback adds the corresponding epitem to the ready list ( rdllist) and wakes up the sleeping task.

static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
               poll_table *pt)
{
    struct epitem *epi = ep_item_from_epqueue(pt);
    struct eppoll_entry *pwq;
    if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
        init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
        pwq->whead = whead;
        pwq->base = epi;
        if (epi->event.events & EPOLLEXCLUSIVE)
            add_wait_queue_exclusive(whead, &pwq->wait);
        else
            add_wait_queue(whead, &pwq->wait);
        list_add_tail(&pwq->llink, &epi->pwqlist);
        epi->nwait++;
    } else {
        epi->nwait = -1;
    }
}

The kernel tracks whether events are available via ep_events_available, which checks if rdllist is non‑empty or if the overflow list ovflist contains pending items.

static inline int ep_events_available(struct eventpoll *ep)
{
    return !list_empty_careful(&ep->rdllist) ||
           READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}

Finally, ep_send_events copies ready events from the kernel to user space. If the ready list is empty but the overflow list holds items (added while copying), those items are atomically chained onto ovflist using chain_epi_lockless.

static inline bool chain_epi_lockless(struct epitem *epi)
{
    if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
        return false;
    epi->next = xchg(&ep->ovflist, epi);
    return true;
}

In summary, the epoll implementation combines a red‑black tree for fast lookup, per‑fd wait‑queue entries for event notification, and careful handling of edge‑triggered versus level‑triggered semantics to avoid the "thundering herd" problem while providing scalable I/O multiplexing for large numbers of sockets.

Original Source

Signed-in readers can open the original source through BestHub's protected redirect.

Republication Notice

This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contactand we will review it promptly.

Kernel Linux System Programming I/O Multiplexing Event-driven epoll

Written by

360 Tech Engineering

Official tech channel of 360, building the most professional technology aggregation platform for the brand.

0 followers

Reader feedback

How this landed with the community

Rate this article

Was this worth your time?

Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.