How Linux epoll Boosts I/O Performance: Inside the Kernel’s Eventpoll Mechanism
This article explains why traditional select and poll struggle with many file descriptors, how epoll redesigns the workflow by registering descriptors once via epoll_ctl, the kernel data structures (eventpoll, epitem) that manage events, and walks through the core functions epoll_create, epoll_ctl, epoll_wait, and their callbacks.
When monitoring a large number of file descriptors, both select and poll become inefficient because each call must copy all descriptors to the kernel and the kernel does not retain them, leading to high overhead.
Linux epoll solves this by moving the descriptor registration to epoll_ctl, which stores the descriptors inside the kernel once, and then only waits for events, eliminating repeated copying.
epoll Initialization
During system boot, the kernel initializes epoll with the following function:
static int __init eventpoll_init(void)
{
mutex_init(&epmutex);
/* Initialize the structure used to perform safe poll wait head wake ups */
ep_poll_safewake_init(&psw);
epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 0,
SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC, NULL);
pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0,
EPI_SLAB_DEBUG|SLAB_PANIC, NULL);
return 0;
}
fs_initcall(eventpoll_init);The code sets up three lock mechanisms used later: epmutex (mutex): protects the case where a user closes a file descriptor without calling EPOLL_CTL_DEL. ep->mtx (mutex): guards transitions between user and kernel space that may sleep. ep->lock (spinlock): protects the kernel‑side data structures during interrupt handling.
Kernel Data Structures
Two main structures are maintained:
struct epitem {
struct rb_node rbn; /* node in the eventpoll RB tree */
struct list_head rdllink; /* link in ready list */
struct epitem *next;
struct epoll_filefd ffd; /* fd + file, key of the RB tree */
int nwait; /* number of poll wait queues */
struct list_head pwqlist; /* list of poll wait queues */
struct eventpoll *ep; /* back‑pointer to owning eventpoll */
struct list_head fllink; /* link in file->f_ep_links */
struct epoll_event event; /* user‑requested events */
};
struct eventpoll {
spinlock_t lock; /* protects ready list */
struct mutex mtx; /* prevents file removal while in use */
wait_queue_head_t wq; /* wait queue for sys_epoll_wait */
wait_queue_head_t poll_wait; /* wait queue for file->poll */
struct list_head rdllist; /* ready file descriptors */
struct rb_root rbr; /* RB tree of monitored fds */
struct epitem *ovflist; /* overflow list for events */
};Creating an epoll Instance
The system call sys_epoll_create allocates an eventpoll object, creates an anonymous inode, and stores the pointer in the file’s private data:
asmlinkage long sys_epoll_create(int size)
{
int error, fd = -1;
struct eventpoll *ep;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)
", current, size));
error = -EINVAL;
if (size <= 0 || (error = ep_alloc(&ep)) < 0) {
fd = error;
goto error_return;
}
fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep);
if (fd < 0)
ep_free(ep);
error_return:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d
", current, size, fd));
return fd;
}Registering and Modifying File Descriptors (epoll_ctl)
sys_epoll_ctlvalidates arguments, retrieves the target eventpoll and the file to be monitored, and then dispatches based on the operation (ADD, MOD, DEL). The core insertion logic lives in ep_insert:
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
struct file *tfile, int fd)
{
int error, revents, pwake = 0;
unsigned long flags;
struct epitem *epi;
struct ep_pqueue epq;
error = -ENOMEM;
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
goto error_return;
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
revents = tfile->f_op->poll(tfile, &epq.pt);
spin_lock(&tfile->f_ep_lock);
list_add_tail(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_ep_lock);
ep_rbtree_insert(ep, epi);
spin_lock_irqsave(&ep->lock, flags);
if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
if (waitqueue_active(&ep->wq))
wake_up_locked(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
spin_unlock_irqrestore(&ep->lock, flags);
if (pwake)
ep_poll_safewake(&psw, &ep->poll_wait);
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)
", current, ep, tfile, fd));
return 0;
error_return:
return error;
}The insertion registers a poll callback via init_poll_funcptr, which stores ep_ptable_queue_proc in the poll_table_struct used by the file’s poll method.
Poll Callback Registration
static void ep_ptable_queue_proc(struct file *file,
wait_queue_head_t *whead,
poll_table *pt)
{
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;
if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
} else {
epi->nwait = -1; /* allocation error */
}
}The callback ep_poll_callback is invoked when the device’s wait queue wakes up, moving the associated epitem to the ready list and waking any processes sleeping in epoll_wait:
static int ep_poll_callback(wait_queue_t *wait, unsigned mode,
int sync, void *key)
{
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
unsigned long flags;
int pwake = 0;
spin_lock_irqsave(&ep->lock, flags);
if (!(epi->event.events & ~EP_PRIVATE_BITS))
goto out_unlock;
if (ep_is_linked(&epi->rdllink))
goto is_linked;
list_add_tail(&epi->rdllink, &ep->rdllist);
is_linked:
if (waitqueue_active(&ep->wq))
wake_up_locked(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
out_unlock:
spin_unlock_irqrestore(&ep->lock, flags);
if (pwake)
ep_poll_safewake(&psw, &ep->poll_wait);
return 1;
}Waiting for Events (epoll_wait)
The system call sys_epoll_wait validates parameters, obtains the eventpoll object, and delegates to ep_poll:
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, int timeout)
{
int error;
struct file *file;
struct eventpoll *ep;
if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
return -EINVAL;
if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
error = -EFAULT;
goto error_return;
}
file = fget(epfd);
if (!file)
goto error_return;
if (!is_file_epoll(file)) {
error = -EINVAL;
goto error_fput;
}
ep = file->private_data;
error = ep_poll(ep, events, maxevents, timeout);
error_fput:
fput(file);
error_return:
return error;
}Core Poll Loop (ep_poll)
ep_pollconverts the timeout from milliseconds to jiffies, then either returns ready events immediately or sleeps on the eventpoll wait queue until an event arrives or the timeout expires. It handles spurious wake‑ups, signal interruptions, and retries if events become ready after a wake‑up.
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
int res = 0, eavail;
unsigned long flags;
long jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
wait_queue_t wait;
retry:
spin_lock_irqsave(&ep->lock, flags);
if (list_empty(&ep->rdllist)) {
init_waitqueue_entry(&wait, current);
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue(&ep->wq, &wait);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&ep->rdllist) || !jtimeout)
break;
if (signal_pending(current)) {
res = -EINTR;
break;
}
spin_unlock_irqrestore(&ep->lock, flags);
jtimeout = schedule_timeout(jtimeout);
spin_lock_irqsave(&ep->lock, flags);
}
__remove_wait_queue(&ep->wq, &wait);
set_current_state(TASK_RUNNING);
}
eavail = !list_empty(&ep->rdllist);
spin_unlock_irqrestore(&ep->lock, flags);
if (!res && eavail && !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
goto retry;
return res;
}Sending Events to Userspace (ep_send_events)
ep_send_eventspackages ready events from the rdllist (or overflow list) into a temporary structure and copies them to the user buffer supplied to epoll_wait. It then clears the ready list entries that have been delivered.
Overall, the epoll subsystem replaces the per‑call copying model of select / poll with a persistent kernel‑side registration, a red‑black tree for fast lookup, and a ready‑list mechanism that wakes waiting processes only when necessary, providing scalable I/O event notification for large numbers of file descriptors.
Signed-in readers can open the original source through BestHub's protected redirect.
This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contactand we will review it promptly.
Liangxu Linux
Liangxu, a self‑taught IT professional now working as a Linux development engineer at a Fortune 500 multinational, shares extensive Linux knowledge—fundamentals, applications, tools, plus Git, databases, Raspberry Pi, etc. (Reply “Linux” to receive essential resources.)
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
