How to Supercharge Linux UDP Receive Performance: Kernel Tweaks and Code Optimizations
This article dissects the Linux kernel's UDP receive path, identifies the key bottlenecks (small socket buffers, system‑call overhead, interrupt storms, and slow application‑level processing), and presents concrete remedies: kernel parameter tweaks, a memory‑pool design, multithreaded I/O, and SO_REUSEPORT, each backed by full C code examples, to raise throughput and cut packet loss.
UDP Basics
UDP is a connectionless transport protocol with a fixed 8‑byte header. It trades reliability for low latency and overhead, making it a natural fit for live video, online gaming, IoT telemetry, and other real‑time workloads.
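For reference, the entire header is just four 16‑bit fields. A minimal sketch (the system header <netinet/udp.h> already ships an equivalent struct udphdr; this standalone version only makes the layout explicit):

/* The 8-byte UDP header from RFC 768: four 16-bit fields, all carried
 * in network byte order on the wire. */
#include <stdint.h>

struct udp_header {
    uint16_t src_port;  /* sender's port, 0 if unused                */
    uint16_t dst_port;  /* destination port                          */
    uint16_t length;    /* header + payload length in bytes (>= 8)   */
    uint16_t checksum;  /* covers pseudo-header, header, and payload */
};

_Static_assert(sizeof(struct udp_header) == 8, "UDP header is 8 bytes");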
Receive Path in the Linux Kernel
When a NIC receives a frame, DMA writes the data into a pre‑allocated buffer and raises a hardware interrupt. The driver disables further NIC interrupts, schedules the net_rx_action soft interrupt, and optionally merges packets with Generic Receive Offload (GRO). The packet is handed to __netif_receive_skb_core, passes through IP processing (ip_rcv → ip_rcv_finish), and reaches the UDP layer, where __udp4_lib_lookup_skb finds the matching socket and the packet is enqueued on that socket's receive queue. The application finally reads the data with recvfrom().
Key Performance Bottlenecks
Kernel receive buffer size – net.core.rmem_default is often only a few hundred KiB. Under bursty traffic the buffer overflows and packets are dropped (counted as RcvbufErrors; see the diagnostic sketch after this list).
System‑call overhead – Each recvfrom() incurs a user↔kernel context switch and a copy from kernel to user space.
Interrupt load – High packet rates generate a flood of hardware interrupts, consuming CPU cycles.
Application processing – Slow user‑space handling (parsing, synchronization) leads to queue buildup and drops.
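Before tuning anything, it helps to confirm which bottleneck is actually biting. As a small diagnostic sketch (not from the original article, but using the standard /proc/net/snmp counters): a growing RcvbufErrors value means datagrams are being dropped because a socket receive queue filled up, which points at the buffer‑size and application‑speed items above.

/* Diagnostic sketch: dump the kernel's UDP counters from /proc/net/snmp.
 * The file contains two "Udp:" lines: the first names the fields, the
 * second holds the values; walking them in parallel stays correct across
 * kernel versions. Watch RcvbufErrors grow to confirm buffer overflows. */
#include <stdio.h>
#include <string.h>

int main(void) {
    FILE* f = fopen("/proc/net/snmp", "r");
    if (!f) { perror("fopen"); return 1; }

    char hdr[512] = "", val[512] = "", line[512];
    while (fgets(line, sizeof(line), f)) {
        if (strncmp(line, "Udp:", 4) != 0) continue;
        if (hdr[0] == '\0') { strcpy(hdr, line); }   /* field names  */
        else { strcpy(val, line); break; }           /* field values */
    }
    fclose(f);

    char *hs, *vs;
    char* h = strtok_r(hdr, " \n", &hs);             /* skip "Udp:" tag */
    char* v = strtok_r(val, " \n", &vs);
    while ((h = strtok_r(NULL, " \n", &hs)) && (v = strtok_r(NULL, " \n", &vs)))
        printf("%-14s %s\n", h, v);
    return 0;
}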
Optimization Strategies
1. Kernel Parameter Tuning
Increase the socket receive buffer and the system limits:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>
#define BUFFER_SIZE (16*1024*1024) // 16 MiB
int main() {
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd == -1) { perror("socket"); exit(EXIT_FAILURE); }

    struct sockaddr_in addr = {0};
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(8080);
    if (bind(fd, (struct sockaddr*)&addr, sizeof(addr))) { perror("bind"); exit(EXIT_FAILURE); }

    /* Request a 16 MiB receive buffer; the kernel clamps this to
     * net.core.rmem_max, so raise that sysctl as well (see below). */
    int size = BUFFER_SIZE;
    if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &size, sizeof(size))) {
        perror("setsockopt"); exit(EXIT_FAILURE);
    }

    printf("UDP server started, receive buffer = 16 MiB\n");
    /* receive loop */
    close(fd);
    return 0;
}

Persist the limits system‑wide by adding to /etc/sysctl.conf:
net.core.rmem_max = 16777216
net.core.rmem_default = 16777216

Apply with sysctl -p.
2. Non‑Blocking I/O & Multi‑Threading
Switch the socket to non‑blocking mode and let a pool of worker threads call recvfrom() in a loop. Empty reads return EAGAIN immediately, so each thread backs off briefly and retries, and the socket is drained by all CPU cores in parallel.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>
#include <pthread.h>
#include <fcntl.h>
#include <errno.h>
#define PORT 8080
#define BUF_SIZE 1024
#define WORKERS 10
int server_fd;
void* worker(void* arg) {
    struct sockaddr_in client;
    socklen_t cl_len = sizeof(client);
    char buf[BUF_SIZE];
    while (1) {
        ssize_t n = recvfrom(server_fd, buf, sizeof(buf)-1, 0,
                             (struct sockaddr*)&client, &cl_len);
        if (n == -1) {
            /* Non-blocking socket: back off briefly when the queue is empty. */
            if (errno == EAGAIN || errno == EWOULDBLOCK) { usleep(100); continue; }
            perror("recvfrom"); continue;
        }
        buf[n] = '\0';
        printf("Thread %ld received: %s\n", (long)arg, buf);
    }
    return NULL;
}

int main() {
    struct sockaddr_in addr = {0};
    server_fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (server_fd == -1) { perror("socket"); exit(EXIT_FAILURE); }
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(PORT);
    if (bind(server_fd, (struct sockaddr*)&addr, sizeof(addr))) { perror("bind"); exit(EXIT_FAILURE); }

    /* Switch to non-blocking mode so empty reads return EAGAIN instead of blocking. */
    int flags = fcntl(server_fd, F_GETFL, 0);
    fcntl(server_fd, F_SETFL, flags | O_NONBLOCK);

    pthread_t th[WORKERS];
    for (long i = 0; i < WORKERS; ++i) {
        if (pthread_create(&th[i], NULL, worker, (void*)i)) { perror("pthread_create"); exit(EXIT_FAILURE); }
    }
    for (int i = 0; i < WORKERS; ++i) pthread_join(th[i], NULL);
    close(server_fd);
    return 0;
}

3. Custom Buffer Pool
Allocate a pool of fixed‑size buffers to avoid per‑packet malloc() / free() overhead. The pool is protected by a mutex and can be reused by all worker threads.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>
#include <pthread.h>
#include <stddef.h> // offsetof
#define PORT 8080
#define POOL_BUF_SIZE 65536
#define POOL_CAP 100
typedef struct BufferNode {
    char buf[POOL_BUF_SIZE];   /* fixed-size packet buffer */
    struct BufferNode* next;
} BufferNode;

typedef struct {
    BufferNode* head;          /* free list of reusable buffers */
    pthread_mutex_t mtx;
} BufferPool;

static BufferPool g_pool;

void pool_init() { g_pool.head = NULL; pthread_mutex_init(&g_pool.mtx, NULL); }

char* pool_get() {
    /* Pop a cached buffer if one is available; otherwise allocate a new node. */
    pthread_mutex_lock(&g_pool.mtx);
    BufferNode* n = g_pool.head;
    if (n) { g_pool.head = n->next; pthread_mutex_unlock(&g_pool.mtx); return n->buf; }
    pthread_mutex_unlock(&g_pool.mtx);
    n = malloc(sizeof(BufferNode));
    if (!n) { perror("malloc"); exit(EXIT_FAILURE); }
    return n->buf;
}

void pool_put(char* p) {
    if (!p) return;
    /* Recover the node from the buffer pointer (buf is the first member). */
    BufferNode* n = (BufferNode*)((char*)p - offsetof(BufferNode, buf));
    pthread_mutex_lock(&g_pool.mtx);
    /* Cap the free list at POOL_CAP nodes; release extras back to the heap. */
    int cnt = 0; BufferNode* t = g_pool.head;
    while (t) { cnt++; if (cnt >= POOL_CAP) { pthread_mutex_unlock(&g_pool.mtx); free(n); return; } t = t->next; }
    n->next = g_pool.head; g_pool.head = n;
    pthread_mutex_unlock(&g_pool.mtx);
}

int main() {
    pool_init();
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd == -1) { perror("socket"); exit(EXIT_FAILURE); }

    struct sockaddr_in srv = {0}, cli; socklen_t cli_len = sizeof(cli);
    srv.sin_family = AF_INET; srv.sin_addr.s_addr = htonl(INADDR_ANY); srv.sin_port = htons(PORT);
    if (bind(fd, (struct sockaddr*)&srv, sizeof(srv))) { perror("bind"); exit(EXIT_FAILURE); }

    while (1) {
        char* buf = pool_get();
        ssize_t n = recvfrom(fd, buf, POOL_BUF_SIZE-1, 0, (struct sockaddr*)&cli, &cli_len);
        if (n == -1) { perror("recvfrom"); pool_put(buf); continue; }
        buf[n] = '\0';
        printf("Received: %s\n", buf);
        pool_put(buf);
    }
    close(fd);
    return 0;
}

4. SO_REUSEPORT for Multi‑Process Scaling
Enabling SO_REUSEPORT lets several processes bind the same IP address and port. The kernel hashes each incoming packet's source address and port to pick one of the sockets, distributing the load and providing near‑linear scaling on multi‑core systems.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>
#include <errno.h>
#include <sys/wait.h>
#define PORT 8080
#define BUF_SIZE 1024
#define PROCS 4
int create_socket() {
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd == -1) { perror("socket"); exit(EXIT_FAILURE); }

    /* Must be set on every socket before bind(), in every process. */
    int opt = 1;
    if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt))) { perror("setsockopt"); exit(EXIT_FAILURE); }

    struct sockaddr_in a = {0};
    a.sin_family = AF_INET; a.sin_addr.s_addr = htonl(INADDR_ANY); a.sin_port = htons(PORT);
    if (bind(fd, (struct sockaddr*)&a, sizeof(a))) { perror("bind"); exit(EXIT_FAILURE); }
    return fd;
}

void child_loop(int id) {
    int fd = create_socket();
    char buf[BUF_SIZE];
    struct sockaddr_in cli; socklen_t cli_len = sizeof(cli);
    printf("Child %d listening on %d\n", id, PORT);
    while (1) {
        ssize_t n = recvfrom(fd, buf, sizeof(buf)-1, 0, (struct sockaddr*)&cli, &cli_len);
        if (n == -1) { perror("recvfrom"); continue; }
        buf[n] = '\0';
        printf("Process %d got: %s\n", id, buf);
    }
    close(fd);
}

int main() {
    for (int i = 0; i < PROCS; ++i) {
        pid_t pid = fork();
        if (pid < 0) { perror("fork"); exit(EXIT_FAILURE); }
        if (pid == 0) { child_loop(i); exit(0); }
    }
    while (wait(NULL) > 0);
    return 0;
}

5. Kernel‑Level Enhancements
Generic Receive Offload (GRO) – Merges multiple small UDP packets of the same flow into a larger skb, cutting interrupt frequency and per‑packet processing. Reported benchmarks show CPU usage dropping from ~80 % to ~30 % and throughput increasing four‑fold. (A socket‑level sketch follows this list.)
Dual‑Queue Buffering – Splits the receive path into a sampling queue (filled in interrupt context) and a processing queue (consumed in process context). This limits lock contention to a brief queue swap and, in the cited high‑load game servers, cut average packet‑processing latency from ~10 ms to ~2 ms. (A user‑space sketch of the same pattern follows this list.)
Socket buffer size – Raising SO_RCVBUF from 8 KB to 64 KB can cut loss rates from 10 % to 1 % under burst traffic.
Timeout tuning – Adjust SO_RCVTIMEO / SO_SNDTIMEO so blocked reads and writes give up after a bounded wait, letting the application balance latency against its own retry logic. (A one‑call sketch follows this list.)
Checksum offload to soft‑interrupts – Moves checksum calculation out of the hard‑interrupt path, freeing CPU cycles for other work.
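As a sketch of what GRO looks like from user space (an assumption beyond the article's text, which discusses the in‑kernel path): since Linux 5.0 a UDP socket can opt into coalesced delivery with the UDP_GRO socket option, after which recvmsg() may return several wire datagrams as one buffer plus a control message carrying the original segment size.

/* Sketch: socket-level UDP GRO (Linux 5.0+). The kernel merges consecutive
 * datagrams of one flow; recvmsg() returns the combined payload, and a
 * SOL_UDP/UDP_GRO cmsg reports the original segment size so the application
 * can split the buffer back into records. Constants are defined as a
 * fallback for older libc headers (values from <linux/udp.h>). */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef SOL_UDP
#define SOL_UDP 17
#endif
#ifndef UDP_GRO
#define UDP_GRO 104
#endif

int main(void) {
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd == -1) { perror("socket"); return 1; }

    int on = 1;
    if (setsockopt(fd, SOL_UDP, UDP_GRO, &on, sizeof(on)))
        perror("UDP_GRO (kernel < 5.0?)");   /* fall back to plain reads */

    struct sockaddr_in a = {0};
    a.sin_family = AF_INET; a.sin_addr.s_addr = htonl(INADDR_ANY); a.sin_port = htons(8080);
    if (bind(fd, (struct sockaddr*)&a, sizeof(a))) { perror("bind"); return 1; }

    char buf[65536], ctrl[CMSG_SPACE(sizeof(int))];
    struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
    struct msghdr msg = {0};
    msg.msg_iov = &iov; msg.msg_iovlen = 1;
    msg.msg_control = ctrl; msg.msg_controllen = sizeof(ctrl);

    ssize_t n = recvmsg(fd, &msg, 0);
    if (n == -1) { perror("recvmsg"); return 1; }

    int seg = (int)n;  /* no cmsg means one ordinary datagram */
    for (struct cmsghdr* c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c))
        if (c->cmsg_level == SOL_UDP && c->cmsg_type == UDP_GRO)
            memcpy(&seg, CMSG_DATA(c), sizeof(seg));

    printf("read %zd bytes, original segment size %d\n", n, seg);
    return 0;
}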
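The dual‑queue design itself lives inside the kernel, but the same pattern is easy to apply between a receive thread and a processing thread in user space. A minimal sketch (names and sizes are illustrative, not from the article): the producer appends under a short lock, and the consumer swaps the two queues under the same lock, then drains its queue with no lock held.

/* Sketch of dual-queue (double-buffer) handoff: the producer appends to
 * q[fill] under a short lock; the consumer swaps the roles under the same
 * lock, then drains its queue with no lock held. Contention is limited to
 * the O(1) append and swap, never the drain. */
#include <pthread.h>
#include <stddef.h>

#define QCAP 1024

typedef struct {
    void*  items[QCAP];
    size_t len;
} Queue;

static Queue q[2];
static int fill = 0;                      /* index of the producer's queue */
static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;

/* Producer side (e.g. the thread draining the socket). */
int enqueue(void* pkt) {
    pthread_mutex_lock(&mtx);
    Queue* f = &q[fill];
    int ok = f->len < QCAP;
    if (ok) f->items[f->len++] = pkt;     /* O(1) work under the lock */
    pthread_mutex_unlock(&mtx);
    return ok;
}

/* Consumer side: swap queues, then process without holding the lock. */
void drain(void (*handle)(void*)) {
    pthread_mutex_lock(&mtx);
    Queue* mine = &q[fill];
    fill ^= 1;                            /* producer now fills the other queue */
    pthread_mutex_unlock(&mtx);
    for (size_t i = 0; i < mine->len; i++)
        handle(mine->items[i]);           /* lock-free processing */
    mine->len = 0;
}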
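And for the timeout item, a one‑call sketch (the helper name is illustrative): SO_RCVTIMEO takes a struct timeval, after which a blocked recvfrom() gives up with EAGAIN/EWOULDBLOCK instead of waiting forever.

/* Sketch: bound how long a blocking read may wait. After this call a
 * recvfrom() that sees no data for `ms` milliseconds fails with
 * EAGAIN/EWOULDBLOCK, so the caller can run housekeeping or retry logic. */
#include <sys/socket.h>
#include <sys/time.h>

int set_recv_timeout(int fd, long ms) {
    struct timeval tv = { .tv_sec = ms / 1000, .tv_usec = (ms % 1000) * 1000 };
    return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
}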
Combining these kernel‑level tweaks with the user‑space techniques above yields a high‑throughput, low‑latency UDP service suitable for live streaming, online gaming, real‑time telemetry, DNS, and other latency‑sensitive applications.