Why TCP’s Three‑Way Handshake Matters: Deep Dive into States, Tuning, and Real‑World Pitfalls
This article explains the TCP three‑way handshake in depth, covering the state machine, kernel‑level packet analysis, performance tuning, security hardening, real‑world case studies such as SYN‑Flood mitigation and TIME_WAIT overload, and provides complete C and Python examples, monitoring metrics, troubleshooting steps, and backup procedures for production environments.
TCP Three‑Way Handshake Overview
The TCP three‑way handshake establishes a reliable connection by exchanging three packets:
Client (CLOSED) Server (LISTEN)
| |
| SYN, seq=x |
|------------------------------>|
| | (SYN_RCVD)
| SYN+ACK, seq=y, ack=x+1 |
|<------------------------------|
| ACK, seq=x+1, ack=y+1 |
|------------------------------>|
Key fields:
SYN: SYN=1, ACK=0, seq=ISN_client
SYN+ACK: SYN=1, ACK=1, seq=ISN_server, ack=ISN_client+1
ACK: SYN=0, ACK=1, seq=ISN_client+1, ack=ISN_server+1
State Machine
Typical state transitions for a client and a server are:
Client: CLOSED → SYN_SENT → ESTABLISHED
Server: LISTEN → SYN_RCVD → ESTABLISHED
After data exchange, the side that closes first (the active closer) moves through FIN_WAIT_1 → FIN_WAIT_2 → TIME_WAIT → CLOSED. The TIME_WAIT state lasts for 2 MSL (maximum segment lifetime) so that delayed segments from the old connection expire and a lost final ACK can still be retransmitted.
Kernel Parameters for Handshake Tuning
# /etc/sysctl.d/99-tcp-tuning.conf
net.ipv4.tcp_max_syn_backlog = 8192 # half‑open queue size
net.core.somaxconn = 4096 # max listen backlog
net.ipv4.tcp_syn_retries = 3 # client SYN retries
net.ipv4.tcp_synack_retries = 2 # server SYN+ACK retries
net.ipv4.tcp_syncookies = 1 # enable SYN‑Cookies (DDoS protection)
net.ipv4.tcp_tw_reuse = 1 # allow reuse of TIME_WAIT sockets (client side)
net.ipv4.tcp_tw_recycle = 0 # removed in Linux 4.12 (it broke clients behind NAT); delete this line on newer kernels, keep it at 0 on older ones
net.ipv4.tcp_fin_timeout = 30 # FIN_WAIT_2 timeout (seconds)
net.ipv4.tcp_fastopen = 3 # enable TCP Fast Open for both client (1) and server (2) roles
Apply the settings with:
sudo sysctl -p /etc/sysctl.d/99-tcp-tuning.conf
C Example: Simple TCP Server
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <errno.h>
#define PORT 8080
#define BACKLOG 128
int main() {
int listen_fd = socket(AF_INET, SOCK_STREAM, 0);
if (listen_fd < 0) { perror("socket failed"); exit(EXIT_FAILURE); }
int reuse = 1;
setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse));
int defer = 5; // TCP_DEFER_ACCEPT 5 s
setsockopt(listen_fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &defer, sizeof(defer));
struct sockaddr_in server_addr = {0};
server_addr.sin_family = AF_INET;
server_addr.sin_addr.s_addr = INADDR_ANY;
server_addr.sin_port = htons(PORT);
bind(listen_fd, (struct sockaddr*)&server_addr, sizeof(server_addr));
listen(listen_fd, BACKLOG);
printf("Server listening on port %d (backlog=%d)
", PORT, BACKLOG);
while (1) {
struct sockaddr_in client_addr;
socklen_t client_len = sizeof(client_addr);
int conn_fd = accept(listen_fd, (struct sockaddr*)&client_addr, &client_len);
if (conn_fd < 0) { perror("accept failed"); continue; }
printf("Connection from %s:%d
", inet_ntoa(client_addr.sin_addr), ntohs(client_addr.sin_port));
char buffer[1024];
ssize_t n = read(conn_fd, buffer, sizeof(buffer)-1);
if (n > 0) { buffer[n] = '\0'; printf("Received: %s
", buffer); write(conn_fd, "ACK
", 4); }
close(conn_fd);
}
close(listen_fd);
return 0;
}C Example: Simple TCP Client with Timing
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <errno.h>
#include <sys/time.h>
#define SERVER_IP "127.0.0.1"
#define SERVER_PORT 8080
int main() {
int sock_fd = socket(AF_INET, SOCK_STREAM, 0);
if (sock_fd < 0) { perror("socket failed"); exit(EXIT_FAILURE); }
int nodelay = 1;
setsockopt(sock_fd, IPPROTO_TCP, TCP_NODELAY, &nodelay, sizeof(nodelay));
int quickack = 1;
setsockopt(sock_fd, IPPROTO_TCP, TCP_QUICKACK, &quickack, sizeof(quickack));
struct sockaddr_in server_addr = {0};
server_addr.sin_family = AF_INET;
server_addr.sin_port = htons(SERVER_PORT);
inet_pton(AF_INET, SERVER_IP, &server_addr.sin_addr);
struct timeval start, end;
gettimeofday(&start, NULL);
if (connect(sock_fd, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) { perror("connect failed"); exit(EXIT_FAILURE); }
gettimeofday(&end, NULL);
double elapsed = (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0;
printf("Connection established in %.2f ms
", elapsed);
const char *msg = "Hello, TCP!";
write(sock_fd, msg, strlen(msg));
char buffer[1024];
ssize_t n = read(sock_fd, buffer, sizeof(buffer)-1);
if (n > 0) { buffer[n] = '\0'; printf("Server response: %s
", buffer); }
close(sock_fd);
return 0;
}Python TCP Fast Open Example
import socket
# Server side (requires Linux kernel >= 3.7)
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server.setsockopt(socket.IPPROTO_TCP, socket.TCP_FASTOPEN, 5) # backlog for TFO cookies
server.bind(('0.0.0.0', 8080))
server.listen(128)
# Client side – send data together with SYN using MSG_FASTOPEN
client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
client.sendto(b'GET / HTTP/1.1
Host: example.com
',
socket.MSG_FASTOPEN,
('127.0.0.1', 8080))SYN Flood Mitigation
Typical production steps:
Enable SYN Cookies: sudo sysctl -w net.ipv4.tcp_syncookies=1
Enlarge the half‑open queue:
sudo sysctl -w net.ipv4.tcp_max_syn_backlog=16384
Reduce SYN+ACK retries to free resources quickly: sudo sysctl -w net.ipv4.tcp_synack_retries=1
Rate‑limit new SYN packets with iptables:
sudo iptables -A INPUT -p tcp --syn -m limit --limit 10/s --limit-burst 20 -j ACCEPT
sudo iptables -A INPUT -p tcp --syn -j DROP
TIME_WAIT Exhaustion Handling
High‑frequency short connections can fill the TIME_WAIT table, causing "Cannot assign requested address" errors. Mitigation strategies:
Enable reuse on the client side: sudo sysctl -w net.ipv4.tcp_tw_reuse=1
Make sure timestamps are enabled (required for reuse): sudo sysctl -w net.ipv4.tcp_timestamps=1
Reduce FIN_WAIT_2 timeout: sudo sysctl -w net.ipv4.tcp_fin_timeout=15 (this only shortens orphaned FIN_WAIT_2 sockets; the TIME_WAIT interval itself is fixed at 60 s on Linux)
Use connection pooling at the application layer (e.g., requests.adapters.HTTPAdapter in Python) to keep long‑lived sockets.
Performance Monitoring (Prometheus / Node Exporter)
# Example queries
node_netstat_Tcp_CurrEstab # ESTABLISHED connections
# NOTE(review): AttemptFails may not be exported by default; check the exporter's
# --collector.netstat.fields setting before relying on the next two queries.
node_netstat_Tcp_AttemptFails # connection attempts that failed
rate(node_netstat_Tcp_AttemptFails[5m]) / rate(node_netstat_Tcp_ActiveOpens[5m]) > 0.01 # failure‑rate alert
node_netstat_TcpExt_TCPSynRetrans # SYN retransmissions
node_netstat_TcpExt_ListenOverflows # listen queue overflows
# CurrEstab is a single gauge with no "state" label, so TIME_WAIT has to come from
# the sockstat collector instead.
node_sockstat_TCP_tw > 30000 # TIME_WAIT overload
Troubleshooting Workflow
Measure handshake latency: time nc -zv 192.168.1.100 80
Capture packets with tcpdump and verify SYN/SYN+ACK/ACK sequence:
sudo tcpdump -i any -nn 'tcp[tcpflags] & tcp-syn != 0' -w handshake.pcap
Check kernel logs for TCP errors:
sudo dmesg | grep -i tcp
sudo journalctl -k | grep -i tcp
Inspect socket states with ss:
ss -tanp state close-wait # detect lingering CLOSE_WAIT sockets
ss -tan state time-wait | wc -l # count TIME_WAIT sockets
Use strace to trace system calls of the server or client if needed.
Backup and Restore Script
#!/bin/bash
# backup_tcp_config.sh
# Writes a dated snapshot of TCP-related sysctl values, the iptables rule set
# and the nginx configuration into BACKUP_DIR, then prunes snapshots older
# than 30 days.
BACKUP_DIR="/data/backups/tcp_config"
DATE=$(date +%Y%m%d)
# Stop early if the target directory is unusable; every step below writes into it.
mkdir -p "$BACKUP_DIR" || exit 1
# Backup sysctl TCP parameters
sysctl -a | grep -E "tcp|net.core" > "$BACKUP_DIR/sysctl_$DATE.conf"
# Backup iptables rules
sudo iptables-save > "$BACKUP_DIR/iptables_$DATE.rules"
# Backup application config (example: nginx)
tar -czf "$BACKUP_DIR/nginx_$DATE.tar.gz" /etc/nginx/
# Cleanup old backups (keep 30 days). Match every file type written above,
# not only the sysctl dumps, so .rules and .tar.gz files do not accumulate.
find "$BACKUP_DIR" -type f \( -name "*.conf" -o -name "*.rules" -o -name "*.tar.gz" \) -mtime +30 -delete
Restore steps:
Reload sysctl parameters (read-only keys captured in the dump will report errors that can be ignored):
sudo sysctl -p /data/backups/tcp_config/sysctl_20240115.conf
Restore firewall rules:
sudo iptables-restore < /data/backups/tcp_config/iptables_20240115.rules
Verify connectivity:
nc -zv localhost 80
ab -n 10000 -c 100 http://localhost/
Best‑Practice Checklist
Size tcp_max_syn_backlog and somaxconn according to expected concurrent connections.
Enable tcp_syncookies during DDoS attacks; disable when not needed to keep TCP options.
Turn on tcp_fastopen for latency‑sensitive services.
Prefer long‑lived connections or connection pools to reduce handshake overhead.
Monitor the key metrics listed above and set alerts for failure‑rate, SYN retransmissions, listen‑queue overflow, and TIME_WAIT count.
Raymond Ops
Linux ops automation, cloud-native, Kubernetes, SRE, DevOps, Python, Golang and related tech discussions.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
