Backend Development 14 min read

Zero‑Downtime Restarts: Transfer Open Sockets via Unix Domain Sockets in Go

This article explains how to achieve graceful, zero‑downtime restarts for long‑living TCP services by passing open file descriptors through a Unix domain socket, allowing new processes to inherit existing connections without interrupting clients, and provides a complete Go demo implementation.

360 Zhihui Cloud Developer
360 Zhihui Cloud Developer
360 Zhihui Cloud Developer
Zero‑Downtime Restarts: Transfer Open Sockets via Unix Domain Sockets in Go

Background

While working on long‑living connections, the author found that traditional deployments are painful because each restart forces all connections to be closed, leading to long downtime. The goal is to perform an elegant restart where clients remain unaware of the deployment.

Challenges

How to keep receiving new connections without interruption.

How to preserve already‑established connections during a restart.

Solution Overview

The key is to use a Unix domain socket to transfer open file descriptors (FDs) from the old process to the new one. Unix domain sockets only connect processes on the same host and are addressed through the file system; crucially, they can carry open FDs across process boundaries (as SCM_RIGHTS ancillary data), much like the descriptor inheritance that happens after a fork.

Linux bind Insight

Examining the Linux 1.0 source shows that sock_bind looks up the socket structure from the file descriptor, then calls the protocol-specific bind function. The sock_array is a chained hash table that stores sockets by port, and bind checks the address/port for validity and for conflicts with already-bound sockets (returning EADDRINUSE on conflict).

<code>static int sock_bind(int fd, struct sockaddr *umyaddr, int addrlen)
{
    /* Linux 1.0: syscall-level entry for bind(2). Resolves the fd to a
     * struct socket, then delegates to the protocol-specific bind
     * handler (e.g. inet_bind for AF_INET, shown below). */
    struct socket *sock;
    int i;
    DPRINTF((net_debug, "NET: sock_bind: fd = %d\n", fd));
    /* Reject out-of-range or unopened descriptors. */
    if (fd < 0 || fd >= NR_OPEN || current->filp[fd] == NULL)
        return -EBADF;
    /* The descriptor must actually refer to a socket. */
    if (!(sock = sockfd_lookup(fd, NULL)))
        return -ENOTSOCK;
    /* Protocol-specific bind performs the real address/port checks. */
    if ((i = sock->ops->bind(sock, umyaddr, addrlen)) < 0)
        return i;
    return 0;
}

static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
    ...
    /* Walk the hash chain for this port number in sock_array looking
     * for a conflicting socket already bound to the same port. */
    for (sk2 = sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE-1)];
         sk2 != NULL; sk2 = sk2->next) {
        if (sk2->num != snum) continue;  /* different port: no conflict */
        /* Dead socket still on the chain: reap it and retry. */
        if (sk2->dead) { destroy_sock(sk2); goto outside_loop; }
        /* Conflict unless BOTH sockets set the reuse flag. */
        if (!sk->reuse) return -EADDRINUSE;
        if (sk2->saddr != sk->saddr) continue;  /* different local address */
        if (!sk2->reuse) return -EADDRINUSE;
    }
    ...
}
</code>

Achieving Non‑Interrupting Accept

The sock_array check during bind is exactly what would make a second bind to the same port fail with EADDRINUSE. Because the new process inherits the already-bound listening socket FD instead of calling bind again, it sidesteps that conflict entirely and can keep accepting new connections with no gap in service.

Preserving Existing Connections

The old process sends its open connection FDs and their state through a Unix domain socket to the new process. The new process reconstructs net.Conn objects from the received FDs, allowing ongoing communication to continue.

<code>#include &lt;sys/types.h&gt;
#include &lt;sys/socket.h&gt;

/* sendmsg/recvmsg carry ancillary (control) data in msghdr.msg_control;
 * an SCM_RIGHTS control message transfers open file descriptors between
 * processes over a unix domain socket. */
ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags);
ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags);
</code>

Unix sockets are used for inter‑process communication on a single host, and sendmsg / recvmsg can transfer file descriptors.

Demo Implementation (Go)

The following Go program demonstrates a server that can gracefully restart by transferring its active connections to a newly spawned process via a Unix socket. It handles signals, forks a new binary, passes FDs, and shuts down the old listener.

<code>package main

import (
    "flag"
    "fmt"
    "golang.org/x/sys/unix"
    "log"
    "net"
    "os"
    "os/signal"
    "path/filepath"
    "sync"
    "syscall"
    "time"
)

// Package-level state shared between the accept loop, the connection
// handlers, and the graceful-restart machinery.
var (
	workSpace string      // working directory for server.log and conn.sock
	logger    *log.Logger // file-backed logger, wired up in init

	writeTimeout = time.Second * 5 // per-write deadline on client connections
	readTimeout  = time.Second * 5 // per-read deadline on client connections

	// BUG FIX: signal.Notify requires a buffered channel; with an
	// unbuffered one a signal delivered while the receiver is busy
	// is silently dropped.
	signalChan = make(chan os.Signal, 1)

	connFiles      sync.Map     // *os.File -> bool: dup'd FDs of live connections
	serverListener net.Listener // TCP listener on :7000

	// NOTE(review): isUpdate is read and written from several
	// goroutines without synchronization — a data race; confirm and
	// consider sync/atomic if this pattern is kept.
	isUpdate = false
)

// init parses the -w workspace flag, wires up the file-backed logger,
// and launches the FD-handover receiver plus the signal watcher.
func init() {
	flag.StringVar(&workSpace, "w", ".", "Usage:\n ./server -w=workspace")
	flag.Parse()

	logPath := filepath.Join(workSpace, "server.log")
	file, err := os.OpenFile(logPath, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0777)
	if err != nil {
		panic(err)
	}
	logger = log.New(file, "", log.LstdFlags)

	go beforeStart()
	go signalHandler()
}

// main listens on :7000 and hands each accepted TCP connection to
// connectionHandler. While a graceful restart is in progress
// (isUpdate) it pauses accepting instead of racing the handover.
func main() {
	var err error
	serverListener, err = net.Listen("tcp", ":7000")
	if err != nil {
		panic(err)
	}
	for {
		if isUpdate {
			// BUG FIX: the original `continue` here busy-spun at
			// 100% CPU for the whole handover; yield instead.
			time.Sleep(50 * time.Millisecond)
			continue
		}
		conn, err := serverListener.Accept()
		if err != nil {
			// Include the actual error instead of a bare "conn error".
			logger.Println("conn error: " + err.Error())
			continue
		}
		c := conn.(*net.TCPConn)
		go connectionHandler(c)
	}
}

func connectionHandler(conn *net.TCPConn) {
    file, _ := conn.File()
    connFiles.Store(file, true)
    logger.Printf("conn fd %d\n", file.Fd())
    defer func(){ connFiles.Delete(file); _ = conn.Close() }()
    for {
        if isUpdate { continue }
        if err := conn.SetReadDeadline(time.Now().Add(readTimeout)); err != nil { logger.Println(err.Error()); return }
        rBuf := make([]byte, 4)
        if _, err := conn.Read(rBuf); err != nil { logger.Println(err.Error()); return }
        if string(rBuf) != "ping" { logger.Println("failed to parse the message " + string(rBuf)); return }
        if err := conn.SetWriteDeadline(time.Now().Add(writeTimeout)); err != nil { logger.Println(err.Error()); return }
        if _, err := conn.Write([]byte(`pong`)); err != nil { logger.Println(err.Error()); return }
    }
}

// beforeStart runs in the freshly exec'd process. It dials the
// predecessor's unix socket (conn.sock) and receives one dup'd
// connection FD per message (payload byte 0 + SCM_RIGHTS ancillary
// data); a message whose payload byte is non-zero marks the end of the
// transfer. Each received FD is rewrapped as a *net.TCPConn and served
// normally. If no predecessor is listening, Dial fails and this is
// simply a cold start.
func beforeStart() {
	connInterface, err := net.Dial("unix", filepath.Join(workSpace, "conn.sock"))
	if err != nil {
		logger.Println(err.Error())
		return
	}
	defer connInterface.Close()
	unixConn := connInterface.(*net.UnixConn)
	b := make([]byte, 1)
	oob := make([]byte, 32)
	for {
		// BUG FIX: the original set a *write* deadline here before a
		// blocking read, so ReadMsgUnix could hang forever.
		if err = unixConn.SetReadDeadline(time.Now().Add(time.Minute * 3)); err != nil {
			// fmt retained so the file's fmt import stays in use.
			fmt.Println(err.Error())
			return
		}
		n, oobn, _, _, err := unixConn.ReadMsgUnix(b, oob)
		if err != nil {
			logger.Println(err.Error())
			return
		}
		if n != 1 || b[0] != 0 {
			return // finish marker (payload 1) or malformed message
		}
		scms, err := unix.ParseSocketControlMessage(oob[0:oobn])
		if err != nil {
			logger.Println(err.Error())
			return
		}
		if len(scms) != 1 {
			logger.Printf("recv fd num != 1 : %d\n", len(scms))
			return
		}
		fds, err := unix.ParseUnixRights(&scms[0])
		if err != nil {
			logger.Println(err.Error())
			return
		}
		if len(fds) != 1 {
			logger.Printf("recv fd num != 1 : %d\n", len(fds))
			return
		}
		logger.Printf("recv fd %d\n", fds[0])
		file := os.NewFile(uintptr(fds[0]), "fd-from-old")
		conn, err := net.FileConn(file)
		// BUG FIX: net.FileConn dups the descriptor, so close our
		// wrapper either way to avoid leaking one FD per connection.
		_ = file.Close()
		if err != nil {
			logger.Println(err.Error())
			return
		}
		go connectionHandler(conn.(*net.TCPConn))
	}
}

// signalHandler subscribes to SIGUSR2 and triggers a graceful restart
// each time one arrives. The channel is never closed, so the range
// loop blocks exactly like a bare receive.
func signalHandler() {
	signal.Notify(signalChan, syscall.SIGUSR2)
	for sig := range signalChan {
		if sig == syscall.SIGUSR2 {
			gracefulExit()
		}
	}
}

// gracefulExit hands this process's live connections to a freshly
// exec'd copy of the binary, then exits.
//
// Wire protocol on workSpace/conn.sock (one *net.UnixConn):
//   - for every tracked connection: a 1-byte payload {0} plus the
//     dup'd FD as SCM_RIGHTS ancillary data (syscall.UnixRights)
//   - finally a 1-byte payload {1} with no FD, meaning "done"
//
// The listening socket itself is NOT transferred; serverListener is
// closed below and the new process re-binds :7000 itself.
// NOTE(review): isUpdate is set without synchronization (data race),
// and WriteMsgUnix errors are logged but not retried — confirm both
// are acceptable for this demo.
func gracefulExit() {
	var connWait sync.WaitGroup
	// Remove any stale socket file from a previous run before listening.
	_ = syscall.Unlink(filepath.Join(workSpace, "conn.sock"))
	listenerInterface, err := net.Listen("unix", filepath.Join(workSpace, "conn.sock"))
	if err != nil { logger.Println(err.Error()); return }
	defer listenerInterface.Close()
	unixListener := listenerInterface.(*net.UnixListener)
	connWait.Add(1)
	go func() {
		defer connWait.Done()
		// Block until the new process (beforeStart) dials in, then
		// stream every tracked FD to it.
		unixConn, err := unixListener.AcceptUnix()
		if err != nil { logger.Println(err.Error()); return }
		defer unixConn.Close()
		connFiles.Range(func(key, value interface{}) bool {
			if key == nil || value == nil { return false }
			file := key.(*os.File)
			// defer inside the Range callback: the dup is closed when
			// this callback returns, i.e. right after the FD is sent.
			defer file.Close()
			buf := []byte{0}
			rights := syscall.UnixRights(int(file.Fd()))
			_, _, err := unixConn.WriteMsgUnix(buf, rights, nil)
			if err != nil { logger.Println(err.Error()) }
			logger.Printf("send fd %d\n", file.Fd())
			return true
		})
		// Payload byte 1 tells the receiver the transfer is complete.
		finish := []byte{1}
		_, _, err = unixConn.WriteMsgUnix(finish, nil, nil)
		if err != nil { logger.Println(err.Error()) }
	}()
	// Pause the accept/read loops before spawning the replacement.
	isUpdate = true
	execSpec := &syscall.ProcAttr{Env: os.Environ(), Files: []uintptr{os.Stdin.Fd(), os.Stdout.Fd(), os.Stderr.Fd()}}
	pid, err := syscall.ForkExec(os.Args[0], os.Args, execSpec)
	if err != nil { logger.Println(err.Error()); return }
	logger.Printf("old process %d new process %d\n", os.Getpid(), pid)
	// Free :7000 so the new process can bind it, wait for the FD
	// transfer goroutine to finish, then exit the old process.
	_ = serverListener.Close()
	connWait.Wait()
	os.Exit(0)
}
</code>

A minimal client can be used to verify the server behavior:

<code>package main

import (
    "fmt"
    "net"
    "time"
)

var (
    writeTimeout = time.Second * 5
    readTimeout  = time.Second * 5
)

// main dials the demo server and sends "ping" once per second,
// expecting "pong" back. Any I/O failure terminates the loop — the
// original kept looping after a read error, printing a stale buffer
// forever once the server went away.
func main() {
	conn, err := net.Dial("tcp", "127.0.0.1:7000")
	if err != nil {
		panic(err)
	}
	defer conn.Close()
	for {
		time.Sleep(time.Second)
		if err := conn.SetWriteDeadline(time.Now().Add(writeTimeout)); err != nil {
			fmt.Println(err.Error())
			break
		}
		fmt.Println("send ping")
		if _, err = conn.Write([]byte(`ping`)); err != nil {
			fmt.Println(err.Error())
			break
		}
		if err := conn.SetReadDeadline(time.Now().Add(readTimeout)); err != nil {
			fmt.Println(err.Error())
			break
		}
		rBuf := make([]byte, 4)
		if _, err = conn.Read(rBuf); err != nil {
			fmt.Println(err.Error())
			// BUG FIX: the original fell through here and spun forever.
			break
		}
		fmt.Println("recv " + string(rBuf))
	}
}
</code>

By using the Unix socket FD transfer technique, the two initial problems—maintaining uninterrupted acceptance of new connections and preserving existing connections across restarts—are solved.

backendGozero downtimeGraceful Restartunix socketfd transfer
360 Zhihui Cloud Developer
Written by

360 Zhihui Cloud Developer

360 Zhihui Cloud is an enterprise open service platform that aims to "aggregate data value and empower an intelligent future," leveraging 360's extensive product and technology resources to deliver platform services to customers.

0 followers
Reader feedback

How this landed with the community

login Sign in to like

Rate this article

Was this worth your time?

Sign in to rate
Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.