
How to Boost Nginx to Over 1 Million QPS: Real‑World Optimization Guide

This guide walks through a complete Nginx high-concurrency tuning process, from basic worker and TCP settings to kernel parameters, caching, SSL, and advanced techniques such as DPDK and LuaJIT, showing how the same hardware goes from roughly 80 k QPS to over a million QPS, with real-world examples and scripts.

Raymond Ops

Performance Overview

With the default configuration, the test hardware delivers roughly 80 k QPS at 125 ms average latency, with 85 % CPU usage and 2.1 GB of RAM. After the successive optimizations below, the same hardware handles more than 1.2 M QPS at 8 ms latency and 35 % CPU.
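
The before-and-after numbers can be reproduced with the same wrk invocation used later in this guide; a minimal sketch (the hostname, thread count, and connection count are placeholders, not the exact test rig):

# Measure QPS and latency for the current configuration
wrk -t32 -c1000 -d60s --latency http://your-domain.com/
# Watch worker CPU and memory while the test runs (pidstat is part of sysstat)
pidstat -u -r -p $(pgrep -d, -f 'nginx: worker') 5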

Stage 1 – Core Nginx Settings

Worker processes

# Auto‑detect CPU cores
worker_processes auto;
worker_cpu_affinity auto;
events {
    worker_connections 65535;
    use epoll;
    multi_accept on;
}
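
A quick sanity check that these settings took effect (a sketch assuming a standard package install):

# Dump the effective config and confirm worker/epoll settings
nginx -T 2>/dev/null | grep -E 'worker_processes|worker_connections|use epoll'
# The number of worker processes should match the CPU core count
nproc
ps -ef | grep -c '[n]ginx: worker'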

TCP parameters

http {
    tcp_nodelay on;
    tcp_nopush on;
    keepalive_timeout 65;
    keepalive_requests 10000;
    client_max_body_size 20m;
    client_body_buffer_size 128k;
    client_header_buffer_size 4k;
    large_client_header_buffers 8 8k;
}

Stage 2 – Kernel & System Limits

sysctl.conf

net.core.somaxconn = 65535
net.core.netdev_max_backlog = 30000
net.ipv4.tcp_max_syn_backlog = 65535
net.ipv4.tcp_tw_reuse = 1
net.ipv4.tcp_fin_timeout = 10
net.core.rmem_max = 67108864
net.core.wmem_max = 67108864
net.ipv4.tcp_rmem = 4096 87380 67108864
net.ipv4.tcp_wmem = 4096 65536 67108864
fs.file-max = 6815744
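
Apply the new values without a reboot and spot-check a few of them (sketch; assumes the settings above were saved to /etc/sysctl.d/99-nginx.conf, a hypothetical filename):

# Load the file and verify
sysctl -p /etc/sysctl.d/99-nginx.conf
sysctl net.core.somaxconn net.ipv4.tcp_tw_reuse fs.file-max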

Limits for the nginx user

# /etc/security/limits.conf
nginx soft nofile 655350
nginx hard nofile 655350
nginx soft nproc 655350
nginx hard nproc 655350

Add LimitNOFILE=655350 and LimitNPROC=655350 to the nginx.service unit.
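
A minimal sketch of that systemd override as a drop-in file (the drop-in path is the standard systemd location, not something specific to this setup):

mkdir -p /etc/systemd/system/nginx.service.d
cat > /etc/systemd/system/nginx.service.d/limits.conf <<'EOF'
[Service]
LimitNOFILE=655350
LimitNPROC=655350
EOF
systemctl daemon-reload && systemctl restart nginx
# Confirm the running workers actually picked up the limit
cat /proc/$(pgrep -f 'nginx: worker' | head -1)/limits | grep 'open files'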

Stage 3 – Caching & Compression

Static file cache

location ~* \.(jpg|jpeg|png|gif|ico|css|js|pdf|txt)$ {
    expires 1y;
    add_header Cache-Control "public, immutable";
    add_header Pragma "cache";
    gzip_static on;
    access_log off;
    sendfile on;
    sendfile_max_chunk 1m;
}
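
To confirm the cache headers are actually being emitted, request any matching asset (the hostname and path are placeholders):

# Expect a far-future Expires header and "Cache-Control: public, immutable"
curl -sI http://your-domain.com/static/app.js | grep -Ei 'expires|cache-control|pragma'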

Dynamic compression

# Gzip
gzip on;
gzip_vary on;
gzip_min_length 1024;
gzip_comp_level 6;
gzip_types text/plain text/css text/xml text/javascript application/json application/javascript application/xml+rss application/atom+xml;

# Brotli (requires compiled module)
brotli on;
brotli_comp_level 6;
brotli_types text/plain text/css application/json application/javascript;
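
Verify which encoding is actually negotiated; Brotli will only appear if the module was compiled in (hostname is a placeholder):

# Should print "content-encoding: br" (or gzip as a fallback)
curl -sI -H 'Accept-Encoding: br, gzip' http://your-domain.com/ | grep -i content-encoding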

Stage 4 – Advanced Optimizations

Upstream connection pool

upstream backend {
    least_conn;
    server 192.168.1.10:8080 max_fails=3 fail_timeout=30s;
    server 192.168.1.11:8080 max_fails=3 fail_timeout=30s;
    server 192.168.1.12:8080 max_fails=3 fail_timeout=30s;
    keepalive 300;
    keepalive_requests 1000;
    keepalive_timeout 60s;
}
server {
    location / {
        proxy_pass http://backend;
        proxy_http_version 1.1;
        proxy_set_header Connection "";
        proxy_buffering on;
        proxy_buffer_size 128k;
        proxy_buffers 8 128k;
        proxy_busy_buffers_size 256k;
        proxy_connect_timeout 5s;
        proxy_send_timeout 10s;
        proxy_read_timeout 10s;
    }
}
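
One rough way to confirm the keepalive pool is working: with pooling, established connections to the backends should hover near the keepalive value instead of churning through TIME_WAIT (the port matches the upstream above):

# Established and TIME_WAIT connections from Nginx to the 8080 backends
ss -tn state established '( dport = :8080 )' | tail -n +2 | wc -l
ss -tn state time-wait '( dport = :8080 )' | tail -n +2 | wc -l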

SSL/TLS tuning

ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256;
ssl_prefer_server_ciphers off;
ssl_session_cache shared:SSL:50m;
ssl_session_timeout 1d;
ssl_session_tickets off;
ssl_stapling on;
ssl_stapling_verify on;
ssl_buffer_size 4k;
# Optional hardware acceleration (ssl_engine is a main-context directive,
# set at the top of nginx.conf rather than inside the http block)
ssl_engine qat;
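
Session resumption can be checked from a client with openssl; repeated "Reused" lines mean the shared session cache is being hit (domain is a placeholder; TLS 1.2 is forced because -reconnect relies on session IDs):

# Five reconnects; count how many handshakes were resumed
echo | openssl s_client -connect your-domain.com:443 -tls1_2 -reconnect 2>/dev/null | grep -c Reused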

Memory pool

connection_pool_size 512;
request_pool_size 8k;
large_client_header_buffers 8 16k;
proxy_temp_file_write_size 256k;
# proxy_temp_path only takes a path plus subdirectory levels; zones belong to proxy_cache_path
proxy_temp_path /var/cache/nginx/proxy_temp 1 2;

Stage 5 – Monitoring & Benchmarking

stub_status endpoint

location /nginx_status {
    stub_status on;
    access_log off;
    allow 127.0.0.1;
    deny all;
}

Example monitoring script

#!/bin/bash
# nginx_monitor.sh
curl -s http://localhost/nginx_status | awk '
/Active connections/ {print "active_connections " $3}
/^ *[0-9]+ +[0-9]+ +[0-9]+/ {print "accepts " $1; print "handled " $2; print "requests " $3}
/Reading/ {print "reading " $2; print "writing " $4; print "waiting " $6}
' | while read metric value; do
    # Ship each metric as a StatsD gauge; -w1 keeps nc from hanging on the UDP send
    echo "nginx.${metric}:${value}|g" | nc -u -w1 localhost 8125
done
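
Run it on an interval, for example from a simple loop, cron, or a systemd timer (sketch):

# Emit metrics every 10 seconds
while true; do ./nginx_monitor.sh; sleep 10; done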

Load‑test commands

# wrk -t32 -c1000 -d60s --latency http://your-domain.com/
# ab -n 100000 -c 1000 http://your-domain.com/
# wrk -t32 -c1000 -d60s -s post.lua http://your-domain.com/api

Extreme Optimization – Million QPS

DPDK kernel bypass

# Build Nginx with DPDK support (not available in mainline Nginx; requires a
# DPDK-enabled fork or patch set such as F-Stack)
./configure --with-dpdk=/path/to/dpdk
# Pin NIC interrupts to specific CPU cores (hex bitmask: 2 = CPU1, 4 = CPU2)
echo 2 > /proc/irq/24/smp_affinity
echo 4 > /proc/irq/25/smp_affinity

LuaJIT (OpenResty)

location /api {
    content_by_lua_block {
        ngx.header.content_type = "application/json"
        ngx.say('{"status":"ok"}')
    }
}
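
Because the response is generated inside the worker, no upstream is involved; a quick check (hostname is a placeholder):

curl -s http://your-domain.com/api
# {"status":"ok"}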

Zero‑copy I/O

# splice is not a mainline Nginx directive; it requires a third-party patch
splice on;
aio threads;
aio_write on;
directio 4m;            # files larger than 4 MB bypass the page cache
directio_alignment 512;

Real‑World Case – Flash‑Sale System

Target: 1.5 M QPS, < 50 ms latency, 99.99 % availability. Achieved 1.68 M QPS with 32 ms average response.

upstream seckill_backend {
    hash $remote_addr consistent;
    server 10.0.1.10:8080 weight=3 max_conns=3000;
    server 10.0.1.11:8080 weight=3 max_conns=3000;
    server 10.0.1.12:8080 weight=4 max_conns=4000;
    keepalive 1000;
}
limit_req_zone $binary_remote_addr zone=seckill:100m rate=100r/s;
limit_conn_zone $binary_remote_addr zone=conn_seckill:100m;
server {
    location /seckill {
        limit_req zone=seckill burst=200 nodelay;
        limit_conn conn_seckill 10;
        # seckill_cache must be declared with proxy_cache_path in the http block
        proxy_cache seckill_cache;
        proxy_cache_valid 200 302 5s;
        proxy_cache_valid 404 1m;
        proxy_connect_timeout 1s;
        proxy_send_timeout 2s;
        proxy_read_timeout 2s;
        proxy_pass http://seckill_backend;
    }
}
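
To see the limiter in action, fire more requests than the 100 r/s rate plus the 200-request burst allows and count the status codes; the excess should come back as 503 (hostname is a placeholder):

# Count response codes for 500 rapid requests from one client
for i in $(seq 1 500); do
    curl -s -o /dev/null -w '%{http_code}\n' http://your-domain.com/seckill
done | sort | uniq -c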

Optimization Checklist

Set worker_processes auto and enable epoll.

Increase worker_connections and tune keep‑alive.

Apply sysctl TCP tweaks and raise fs.file-max.

Configure per‑user limits for nofile and nproc.

Use upstream keepalive pools and least_conn balancing.

Enable gzip and Brotli compression.

Harden SSL/TLS settings and enable the session cache.

Deploy stub_status and external metrics collection.

Common Pitfalls

Too many worker_processes

Excessive context switches degrade performance. Use worker_processes auto.

Missing upstream keepalive

Without pooled upstream connections, every proxied request pays backend TCP (and TLS) setup cost. Set appropriate keepalive values.

SSL handshake cost

Enable session cache, OCSP stapling, and hardware acceleration.

Verbose logging

Disk I/O spikes. Disable unnecessary access logs or use asynchronous logging.

Future Directions

HTTP/3 & QUIC

listen 443 quic reuseport;
listen 443 ssl http2;
add_header Alt-Svc 'h3=":443"; ma=86400';
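
Confirm that HTTP/3 is being advertised, and negotiated if your curl build supports it (domain is a placeholder):

# The Alt-Svc header advertises h3 to clients
curl -sI https://your-domain.com/ | grep -i alt-svc
# Requires a curl built with HTTP/3 support
curl --http3 -s -o /dev/null -w '%{http_version}\n' https://your-domain.com/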

Edge computing

Running Nginx on edge nodes (for example, 5G MEC sites) puts it close enough to users to target sub-millisecond latency for latency-sensitive workloads.

AI‑driven auto‑tuning

Upcoming versions may embed machine‑learning models that adjust worker counts, buffers, and cache policies in real time.

Tags: Optimization, Performance Tuning, Linux, High Concurrency, Nginx, Web Server
Written by Raymond Ops. Linux ops automation, cloud-native, Kubernetes, SRE, DevOps, Python, Golang and related tech discussions.