Mastering Linux Socket I/O: Blocking, Non‑Blocking, and Epoll Explained
This article explains the two‑stage socket I/O process in Linux, compares blocking, non‑blocking, and I/O multiplexing techniques, details the select, poll, and epoll APIs with their advantages and drawbacks, and provides complete C code examples for a high‑performance TCP server and client.
Socket I/O Processing
Each socket I/O operation consists of two stages: the data‑preparation stage, where the kernel receives the packet and makes it ready, and the data‑copy stage, where the kernel copies the data from kernel space into the application's user‑space buffer.
Blocking I/O
Blocking means the calling thread is suspended until the operation completes, preventing it from doing other work. By default, socket I/O is blocking.
During the preparation stage, the application calling recvfrom() waits for enough data, causing the thread to block. In the copy stage, the kernel copies data to the application buffer, after which the thread becomes ready.
Drawbacks of blocking I/O :
High concurrency requires many threads, consuming large system resources.
After a TCP connection is established, if no data is available, the thread blocks on recvfrom(), wasting thread resources.
Non‑Blocking I/O
Non‑blocking I/O returns immediately without waiting for data readiness.
Set the socket to non‑blocking mode with fcntl() and the O_NONBLOCK flag.
Data‑preparation stage : If the kernel buffer has no data yet, the call fails immediately, returning -1 with errno set to EAGAIN or EWOULDBLOCK. The application then knows the data is not ready and can continue with other tasks.
Data‑copy stage : Once data is ready, it is copied normally.
Drawbacks of non‑blocking I/O :
The application continuously polls the kernel, wasting CPU cycles.
Frequent read() calls cause many system calls, increasing overhead.
I/O Multiplexing
Multiplexing allows a single thread to monitor many sockets (the listening socket and all client sockets) at once, improving performance by avoiding a blocked thread per connection.
The Linux kernel provides three interfaces: select(), poll(), and epoll, with epoll offering the best performance for large numbers of connections.
select()
Function prototype :
int select(int n, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
Drawbacks :
Limited number of file descriptors: an fd_set can only hold descriptors below FD_SETSIZE (typically 1024), a compile‑time constant that cannot be raised by tuning at run time.
Each call copies all fd sets between user and kernel space, incurring overhead.
The kernel marks each fd, and the application must iterate over all fds to handle events.
poll()
Function prototype :
#include <poll.h>
int poll(struct pollfd *fds, unsigned int nfds, int timeout);
Improvements over select() :
No hard limit on the number of fds.
Event‑driven model allowing per‑fd event specification.
The pollfd structure:
struct pollfd {
int fd; // monitored file descriptor
short events; // events of interest
short revents; // events that occurred
};
Drawbacks :
Still copies large fd sets between user and kernel space.
Application must still iterate over all fds.
epoll
epoll solves the shortcomings of select() and poll() by keeping fd sets inside the kernel and using event‑driven callbacks.
Workflow :
Create an epoll instance with epoll_create() (or epoll_create1()).
Register fds and their events using epoll_ctl() (add, modify, delete).
Wait for events with epoll_wait(), which returns ready events.
epoll working modes
Two modes are available:
LT (Level Trigger) : Default mode; events are reported as long as the condition holds.
ET (Edge Trigger) : Reports events only when the state changes; requires non‑blocking sockets and offers higher efficiency.
epoll_create()
int epoll_create(int size);
epoll_ctl()
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
op values :
EPOLL_CTL_ADD – add fd
EPOLL_CTL_DEL – delete fd
EPOLL_CTL_MOD – modify fd
epoll_event structure
struct epoll_event {
__uint32_t events; // epoll events
epoll_data_t data; // user data
};
Common event flags: EPOLLIN, EPOLLOUT, EPOLLPRI, EPOLLERR, EPOLLHUP, EPOLLET, EPOLLONESHOT.
epoll_wait()
int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);
TCP Socket Optimization Example Using epoll and Non‑Blocking I/O
Server
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <sys/epoll.h>
/*
 * ERR_MSG(err_code): store the current errno in the lvalue err_code, then
 * report it on stderr as a number and as perror() text.
 *
 * errno is captured once up front and restored before perror(), so the
 * fprintf() in between cannot change which error gets described. This also
 * keeps ERR_MSG(errno) well-behaved.
 */
#define ERR_MSG(err_code) do { \
    int err_msg_saved_ = errno; \
    (err_code) = err_msg_saved_; \
    fprintf(stderr, "ERROR code: %d\n", err_msg_saved_); \
    errno = err_msg_saved_; \
    perror("PERROR message"); \
} while (0)
/* Maximum number of ready events fetched by one epoll_wait() call. */
#define MAX_EVENTS 10
/* Size in bytes of the per-request receive buffer. */
#define BUFFER_SIZE 1024
/*
 * Put the descriptor sock_fd into non-blocking mode.
 *
 * Reads the current file status flags with F_GETFL and writes them back with
 * O_NONBLOCK added. Returns 0 on success. If either fcntl() call fails,
 * prints a "fcntl" diagnostic via perror() and returns -1.
 */
static int set_sock_non_blocking(int sock_fd) {
    const int current = fcntl(sock_fd, F_GETFL, 0);

    /* Short-circuit: the F_SETFL call is only attempted if F_GETFL worked. */
    if (current != -1 && fcntl(sock_fd, F_SETFL, current | O_NONBLOCK) != -1) {
        return 0;
    }

    perror("fcntl");
    return -1;
}
int main(void) {
struct sockaddr_in srv_sock_addr;
memset(&srv_sock_addr, 0, sizeof(srv_sock_addr));
srv_sock_addr.sin_family = AF_INET;
srv_sock_addr.sin_addr.s_addr = htonl(INADDR_ANY);
srv_sock_addr.sin_port = htons(8086);
int srv_socket_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
if (srv_socket_fd == -1) { printf("Create socket file descriptor ERROR.
"); ERR_MSG(errno); exit(EXIT_FAILURE); }
int optval = 1;
if (setsockopt(srv_socket_fd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)) < 0) {
printf("Set socket options ERROR.
"); ERR_MSG(errno); exit(EXIT_FAILURE);
}
if (bind(srv_socket_fd, (struct sockaddr *)&srv_sock_addr, sizeof(srv_sock_addr)) == -1) {
printf("Bind socket ERROR.
"); ERR_MSG(errno); exit(EXIT_FAILURE);
}
if (listen(srv_socket_fd, 10) == -1) { printf("Listen socket ERROR.
"); ERR_MSG(errno); exit(EXIT_FAILURE); }
if (set_sock_non_blocking(srv_socket_fd) == -1) { printf("set_sock_non_blocking() error."); ERR_MSG(errno); close(srv_socket_fd); }
int epoll_fd = epoll_create1(0);
if (epoll_fd == -1) { printf("epoll_create error."); ERR_MSG(errno); exit(EXIT_FAILURE); }
struct epoll_event event, events[MAX_EVENTS];
event.data.fd = srv_socket_fd;
event.events = EPOLLIN | EPOLLET;
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, srv_socket_fd, &event) == -1) { printf("epoll_ctl error."); ERR_MSG(errno); exit(EXIT_FAILURE); }
printf("Starting TCP server.
");
while (1) {
int event_cnt = epoll_wait(epoll_fd, events, MAX_EVENTS, -1);
if (event_cnt == -1) { printf("epoll_wait error.
"); ERR_MSG(errno); exit(EXIT_FAILURE); }
for (int i = 0; i < event_cnt; ++i) {
if (srv_socket_fd == events[i].data.fd) {
printf("Accepted client connection request.
");
while (1) {
struct sockaddr cli_sock_addr;
memset(&cli_sock_addr, 0, sizeof(cli_sock_addr));
socklen_t cli_sockaddr_len = sizeof(cli_sock_addr);
int cli_socket_fd = accept(srv_socket_fd, &cli_sock_addr, &cli_sockaddr_len);
if (cli_socket_fd == -1) {
if (errno == EAGAIN || errno == EWOULDBLOCK) break; else { printf("Accept connection from client ERROR.
"); break; }
}
if (set_sock_non_blocking(cli_socket_fd) == -1) { printf("set_sock_non_blocking() error."); close(cli_socket_fd); break; }
event.data.fd = cli_socket_fd;
event.events = EPOLLIN | EPOLLET;
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, cli_socket_fd, &event) == -1) { printf("epoll_ctl() error."); close(cli_socket_fd); break; }
}
} else if (events[i].events & EPOLLIN) {
int cli_socket_fd = events[i].data.fd;
char buff[BUFFER_SIZE];
int recv_len = recv(cli_socket_fd, buff, BUFFER_SIZE, 0);
if (recv_len < 0) { printf("Receive from client ERROR.
"); close(cli_socket_fd); break; }
printf("Recevice data from client: %s
", buff);
send(cli_socket_fd, buff, recv_len, 0);
printf("Send data to client: %s
", buff);
close(cli_socket_fd);
memset(buff, 0, BUFFER_SIZE);
} else if (events[i].events & (EPOLLERR | EPOLLHUP | !(events[i].events & EPOLLIN))) {
printf("epoll error."); close(events[i].data.fd); break;
}
}
}
close(srv_socket_fd);
return EXIT_SUCCESS;
}Client
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
/*
 * ERR_MSG(err_code): store the current errno in the lvalue err_code, then
 * report it on stderr as a number and as perror() text.
 *
 * errno is captured once up front and restored before perror(), so the
 * fprintf() in between cannot change which error gets described. This also
 * keeps ERR_MSG(errno) well-behaved.
 */
#define ERR_MSG(err_code) do { \
    int err_msg_saved_ = errno; \
    (err_code) = err_msg_saved_; \
    fprintf(stderr, "ERROR code: %d\n", err_msg_saved_); \
    errno = err_msg_saved_; \
    perror("PERROR message"); \
} while (0)
/*
 * Size in bytes of the send and receive buffers. An enum constant is a true
 * integer constant expression in C; a `const int` is not, and arrays sized
 * with it would be variable-length arrays.
 */
enum { BUF_LEN = 100 };
int main(void) {
struct sockaddr_in srv_sock_addr;
memset(&srv_sock_addr, 0, sizeof(srv_sock_addr));
srv_sock_addr.sin_family = AF_INET;
srv_sock_addr.sin_addr.s_addr = inet_addr("192.168.1.3");
srv_sock_addr.sin_port = htons(8086);
int cli_socket_fd;
char send_buff[BUF_LEN];
char recv_buff[BUF_LEN];
while (1) {
if ((cli_socket_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == -1) { printf("Create socket ERROR.
"); ERR_MSG(errno); exit(EXIT_FAILURE); }
if (connect(cli_socket_fd, (struct sockaddr *)&srv_sock_addr, sizeof(srv_sock_addr)) == -1) { printf("Connect to server ERROR.
"); ERR_MSG(errno); exit(EXIT_FAILURE); }
fputs("Send to server> ", stdout);
fgets(send_buff, BUF_LEN, stdin);
send(cli_socket_fd, send_buff, BUF_LEN, 0);
memset(send_buff, 0, BUF_LEN);
recv(cli_socket_fd, recv_buff, BUF_LEN, 0);
printf("Recevice from server: %s
", recv_buff);
memset(recv_buff, 0, BUF_LEN);
close(cli_socket_fd);
}
return EXIT_SUCCESS;
}Test Commands
$ gcc -g -std=c99 -Wall tcp_server.c -o tcp_server
$ gcc -g -std=c99 -Wall tcp_client.c -o tcp_client
$ ./tcp_server
$ ./tcp_client
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
