Fundamentals 53 min read

Understanding Linux System Calls: Core Logic, Mechanisms, and Practical Examples

This comprehensive guide explains Linux system calls—the sole interface between user‑space programs and the kernel—covering their purpose, core logic for process, file, and memory management, the underlying interrupt mechanisms, parameter handling, and real‑world C code examples.

Deepin Linux
Deepin Linux
Deepin Linux
Understanding Linux System Calls: Core Logic, Mechanisms, and Practical Examples

1. Overview of Linux System Calls

System calls are the only legitimate interface for user‑space programs to request services from the kernel. They provide controlled access to hardware resources, process management, memory allocation, and file operations, ensuring security and stability.

2. Core System Calls

2.1 Process‑Management Calls

fork creates a new task_struct, copies most of the parent’s state, assigns a new PID, and uses copy‑on‑write (COW) for memory pages. In the kernel the work is done by copy_process, which is called from do_fork (renamed _do_fork in Linux 4.2 and kernel_clone in Linux 5.10).

#include <stdio.h>
#include <unistd.h>
/*
 * Demonstrates fork(): after the call, parent and child each print their
 * own view of the process pair.
 *
 * Returns 0 on success, 1 if fork() fails.
 */
int main(void) {
    pid_t pid = fork();

    if (pid < 0) {
        perror("fork failed");
        return 1; /* make the failure visible in the exit status as well */
    }

    if (pid == 0) {
        /* pid_t is cast because %d expects exactly an int. */
        printf("I am the child, PID = %d, parent PID = %d\n",
               (int)getpid(), (int)getppid());
    } else {
        printf("I am the parent, PID = %d, child PID = %d\n",
               (int)getpid(), (int)pid);
    }
    return 0;
}

execve replaces the current program image with a new executable. The kernel validates the path, checks the ELF format, creates a new memory layout, clears the old user space, and sets the entry point to the new program.

#include <stdio.h>
#include <unistd.h>
/*
 * Demonstrates execve(): replaces this program with "/bin/ls -l" using an
 * empty environment.
 *
 * Does not return on success; returns 1 if execve() fails.
 */
int main(void) {
    char *argv[] = {"/bin/ls", "-l", NULL};
    char *envp[] = {NULL};

    printf("Preparing to exec new program...\n");
    /* Anything still sitting in the stdio buffer is discarded together with
     * the old program image, so push it out before calling execve(). */
    fflush(stdout);

    execve("/bin/ls", argv, envp);

    /* Only reached when execve() failed. */
    perror("execve failed");
    return 1;
}

pipe creates a pair of file descriptors for one‑way communication between related processes.

#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>
/*
 * Demonstrates pipe(): the child writes a short message into the pipe and
 * the parent reads and prints it.
 *
 * Returns 0 on success, 1 if pipe(), fork(), write() or read() fails.
 */
int main(void) {
    static const char msg[] = "Hello from child";
    int fd[2];
    char buf[100];

    if (pipe(fd) == -1) {
        perror("pipe");
        return 1;
    }

    pid_t pid = fork();
    if (pid < 0) {
        perror("fork");
        close(fd[0]);
        close(fd[1]);
        return 1;
    }

    if (pid == 0) {
        /* Child: writer only. */
        close(fd[0]);
        if (write(fd[1], msg, sizeof msg - 1) < 0) {
            perror("write");
            close(fd[1]);
            return 1;
        }
        close(fd[1]);
        return 0;
    }

    /* Parent: reader only. Closing the write end lets read() see EOF once
     * the child is done. */
    close(fd[1]);
    ssize_t n = read(fd[0], buf, sizeof buf - 1);
    if (n < 0) {
        perror("read");
        close(fd[0]);
        wait(NULL);
        return 1;
    }
    close(fd[0]);

    /* The message travels without its terminator; add one before printing. */
    buf[n] = '\0';
    printf("Parent received: %s\n", buf);
    wait(NULL);
    return 0;
}

semget / semop provide process synchronization. A typical usage creates a semaphore, performs a P (decrement) operation, enters a critical section, then a V (increment) operation.

#include <stdio.h>
#include <sys/sem.h>
/* Argument type handed to semctl(); only the member used with SETVAL is
 * declared here. */
union semun {
    int val;
};
/*
 * Demonstrates System V semaphores: creates a private one-element set,
 * initialises it to 1, performs P (decrement), prints from the critical
 * section, performs V (increment) and removes the set.
 *
 * Returns 0 on success, 1 if any semaphore call fails. Once the set exists
 * it is removed on every path so that it does not outlive the program.
 */
int main(void) {
    int semid = semget(IPC_PRIVATE, 1, 0666 | IPC_CREAT);
    if (semid == -1) {
        perror("semget");
        return 1;
    }

    int rc = 1;
    union semun su = {1};
    /* Designated initialisers: the member order of struct sembuf is not
     * something portable code should rely on. */
    struct sembuf p = {.sem_num = 0, .sem_op = -1, .sem_flg = SEM_UNDO};
    struct sembuf v = {.sem_num = 0, .sem_op = +1, .sem_flg = SEM_UNDO};

    if (semctl(semid, 0, SETVAL, su) == -1) {
        perror("semctl SETVAL");
        goto out;
    }
    if (semop(semid, &p, 1) == -1) {
        perror("semop P");
        goto out;
    }
    printf("Entered critical section\n");
    if (semop(semid, &v, 1) == -1) {
        perror("semop V");
        goto out;
    }
    rc = 0;

out:
    if (semctl(semid, 0, IPC_RMID) == -1) {
        perror("semctl IPC_RMID");
        rc = 1;
    }
    return rc;
}

shmget / shmat allow multiple processes to map the same physical memory region.

#include <stdio.h>
#include <sys/shm.h>
/*
 * Demonstrates System V shared memory: creates a private 1024-byte segment,
 * attaches it, stores and prints a string, then detaches and removes it.
 *
 * Returns 0 on success, 1 if any shared-memory call fails. Once the segment
 * exists it is removed on every path.
 */
int main(void) {
    enum { SEG_SIZE = 1024 };

    int shmid = shmget(IPC_PRIVATE, SEG_SIZE, 0666 | IPC_CREAT);
    if (shmid == -1) {
        perror("shmget");
        return 1;
    }

    int rc = 1;
    char *addr = shmat(shmid, NULL, 0);
    if (addr == (void *)-1) { /* shmat() reports failure as (void *)-1, not NULL */
        perror("shmat");
        goto out;
    }

    snprintf(addr, SEG_SIZE, "Hello shared memory!");
    printf("Shared memory content: %s\n", addr);

    if (shmdt(addr) == -1) {
        perror("shmdt");
        goto out;
    }
    rc = 0;

out:
    if (shmctl(shmid, IPC_RMID, NULL) == -1) {
        perror("shmctl IPC_RMID");
        rc = 1;
    }
    return rc;
}

2.2 File‑System Calls

open validates the pathname, checks permissions, and returns a file descriptor.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h> /* close() */

/*
 * Demonstrates open(): creates or truncates test.txt for reading and
 * writing, prints the descriptor number and closes it again.
 *
 * Returns 0 on success, 1 if open() or close() fails.
 */
int main(void) {
    int fd = open("test.txt", O_RDWR | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) {
        perror("open failed");
        return 1;
    }

    printf("fd = %d\n", fd);

    if (close(fd) == -1) {
        perror("close failed");
        return 1;
    }
    return 0;
}

read validates the descriptor, locates the file offset, copies data from the page cache (or triggers I/O), updates the file pointer, and returns the byte count.

#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
/*
 * Demonstrates read(): reads up to 1023 bytes from test.txt and prints the
 * byte count followed by the data.
 *
 * Returns 0 on success, 1 if open() or read() fails.
 */
int main(void) {
    char buf[1024];

    int fd = open("test.txt", O_RDONLY);
    if (fd < 0) {
        perror("open failed");
        return 1;
    }

    /* Leave room for the terminator: read() does not add one, and the data
     * is printed with %s below. */
    ssize_t n = read(fd, buf, sizeof buf - 1);
    if (n < 0) {
        perror("read failed");
        close(fd);
        return 1;
    }
    close(fd);

    buf[n] = '\0';
    printf("read %d bytes: %s\n", (int)n, buf); /* n is at most 1023 */
    return 0;
}

write copies user data into the page cache, marks pages dirty, and later flushes them to disk.

#include <fcntl.h>
#include <stdio.h>  /* perror(), fprintf() */
#include <string.h> /* strlen() */
#include <unistd.h>

/*
 * Demonstrates write(): opens (creating if needed) test.txt for writing and
 * writes one line of text at the start of the file.
 *
 * Returns 0 on success, 1 if open(), write() or close() fails or the write
 * is incomplete.
 */
int main(void) {
    int fd = open("test.txt", O_WRONLY | O_CREAT, 0644);
    if (fd < 0) {
        perror("open failed");
        return 1;
    }

    const char *str = "Hello Linux File System\n";
    size_t len = strlen(str);

    ssize_t written = write(fd, str, len);
    if (written < 0) {
        perror("write failed");
        close(fd);
        return 1;
    }
    if ((size_t)written != len) {
        /* write() may legitimately transfer fewer bytes than requested. */
        fprintf(stderr, "short write: %zd of %zu bytes\n", written, len);
        close(fd);
        return 1;
    }

    if (close(fd) == -1) {
        perror("close failed");
        return 1;
    }
    return 0;
}

creat is a shortcut for open with O_CREAT | O_WRONLY | O_TRUNC. unlink removes a directory entry and frees the inode when the link count reaches zero. chmod changes file permission bits.

/* Three separate excerpts, one per call described above.
 * NOTE(review): the call statements are not valid at file scope; they look
 * like lines meant to be placed inside a function body - confirm. */
/* creat(): create new_file.txt with mode 0644 and close the descriptor.
 * NOTE(review): close() is declared in <unistd.h>, which is only included
 * further down. */
#include <fcntl.h>
int fd = creat("new_file.txt", 0644);
close(fd);
/* unlink(): remove the directory entry test.txt. */
#include <unistd.h>
unlink("test.txt");
/* chmod(): set the permission bits of test.txt to 0600.
 * NOTE(review): if run right after the unlink() excerpt, test.txt no longer
 * exists - presumably the excerpts are independent; confirm. */
#include <sys/stat.h>
chmod("test.txt", 0600);

2.3 Memory‑Management Calls

brk / sbrk move the program break to grow or shrink the heap; they are used by malloc for small allocations.

#include <unistd.h>
#include <stdio.h>
/*
 * Demonstrates brk()/sbrk(): samples the program break, raises it by 1024
 * bytes, restores it, and prints the three break values.
 *
 * Returns 0 on success, 1 if sbrk() or brk() fails.
 */
int main(void) {
    enum { GROW_BY = 1024 };

    /* char * so that the pointer arithmetic below is standard C (arithmetic
     * on void * is a compiler extension). */
    char *base = sbrk(0);
    if (base == (char *)-1) {
        perror("sbrk");
        return 1;
    }

    /* All break manipulation happens before any printf(): stdio may obtain
     * its buffer from malloc(), which can move the break itself, and
     * restoring the old break afterwards would pull memory out from under
     * it. */
    if (brk(base + GROW_BY) != 0) {
        perror("brk (expand)");
        return 1;
    }
    void *grown = sbrk(0);

    if (brk(base) != 0) {
        perror("brk (shrink)");
        return 1;
    }
    void *restored = sbrk(0);

    printf("Current break: %p\n", (void *)base);
    printf("After expansion: %p\n", grown);
    printf("After shrink: %p\n", restored);
    return 0;
}

mmap creates a new virtual memory region, optionally mapping a file or allocating anonymous memory.

#include <sys/mman.h>
#include <stdio.h>
/*
 * Demonstrates an anonymous private mmap(): maps 4096 bytes, stores and
 * prints a string, then unmaps the region.
 *
 * Returns 0 on success, 1 if mmap() or munmap() fails.
 */
int main(void) {
    enum { MAP_LEN = 4096 };

    char *mem = mmap(NULL, MAP_LEN, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (mem == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    printf("mmap address: %p\n", (void *)mem);
    snprintf(mem, MAP_LEN, "hello mmap");
    printf("value: %s\n", mem);

    if (munmap(mem, MAP_LEN) == -1) {
        perror("munmap");
        return 1;
    }
    return 0;
}

File mapping with mmap lets a process access file contents directly via memory, avoiding extra copies.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h> /* fstat() */
#include <unistd.h>   /* close() */

/*
 * Demonstrates a shared file mapping: maps the first 4096 bytes of test.txt
 * and writes a string at the start of the mapping.
 *
 * Returns 0 on success, 1 if the file cannot be opened, is empty, or any of
 * fstat(), mmap(), munmap() fails.
 */
int main(void) {
    enum { MAP_LEN = 4096 };
    struct stat st;
    char *map;
    int rc = 1;

    int fd = open("test.txt", O_RDWR);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    if (fstat(fd, &st) == -1) {
        perror("fstat");
        goto out_close;
    }
    if (st.st_size == 0) {
        /* Touching a mapping of an empty file raises SIGBUS. */
        fprintf(stderr, "test.txt is empty; nothing to map\n");
        goto out_close;
    }

    map = mmap(NULL, MAP_LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (map == MAP_FAILED) {
        perror("mmap");
        goto out_close;
    }

    /* Bytes stored past the end of the file never reach it, so limit the
     * write to the part of the mapping that is backed by the file. */
    size_t room = st.st_size < (off_t)MAP_LEN ? (size_t)st.st_size : (size_t)MAP_LEN;
    snprintf(map, room, "hello mmap");

    if (munmap(map, MAP_LEN) == -1) {
        perror("munmap");
        goto out_close;
    }
    rc = 0;

out_close:
    close(fd);
    return rc;
}

3. System‑Call Mechanism

3.1 User‑Space and Kernel‑Space

Modern CPUs provide privilege rings. Ring 0 runs the kernel with full hardware access; Ring 3 runs user programs with restricted permissions. System calls act as a controlled gateway from Ring 3 to Ring 0.

3.2 Interrupt / Syscall Entry

Execute the entry instruction: historically the software interrupt int 0x80, today the dedicated syscall instruction on x86‑64.

CPU saves the user context (registers, PC).

Switch privilege level to Ring 0.

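Look up the handler: int 0x80 is dispatched through its interrupt vector in the IDT, whereas the syscall instruction jumps directly to the entry address stored in the IA32_LSTAR model‑specific register.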

Jump to the system‑call handler.

3.3 System‑Call Numbers

Each call has a unique identifier used to index sys_call_table in the kernel.

/*
 * Dispatch table: the system-call number is the array index and the entry
 * is the kernel handler for that call.
 * NOTE(review): illustrative excerpt - the sys_* handlers are declared
 * elsewhere, and the numbering shown looks like the 32-bit x86 one; confirm.
 */
void *sys_call_table[] = {
    [0] = sys_restart_syscall,
    [1] = sys_exit,
    [2] = sys_fork,
    [3] = sys_read,
    [4] = sys_write,
    // ...
};

3.4 Parameter Passing

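Arguments are passed in registers. With int 0x80 on 32‑bit x86, eax holds the system‑call number and ebx, ecx, edx, esi, edi, ebp hold up to six arguments; with syscall on x86‑64, rax holds the number and rdi, rsi, rdx, r10, r8, r9 hold the arguments. Larger structures are passed by pointer to a memory buffer. The kernel validates all arguments before use.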

4. System‑Call Execution Flow

4.1 Application Request

Library wrappers (e.g., fopen, open) prepare arguments and invoke the underlying system call.

#include <stdio.h>
/*
 * Demonstrates the stdio wrappers: opens example.txt and reads up to 1024
 * bytes from it.
 *
 * Returns 0 on success, 1 if the file cannot be opened or a read error
 * occurs.
 */
int main(void) {
    char buffer[1024];

    FILE *f = fopen("example.txt", "r");
    if (f == NULL) {
        /* Passing NULL on to fread()/fclose() would be undefined behaviour. */
        perror("fopen");
        return 1;
    }

    size_t n = fread(buffer, 1, sizeof buffer, f);
    /* A short count is normal at end of file; only ferror() marks a failure. */
    if (n < sizeof buffer && ferror(f)) {
        perror("fread");
        fclose(f);
        return 1;
    }

    fclose(f);
    return 0;
}

4.2 Register Setup and Interrupt

For a read call on x86, the wrapper places the syscall number and arguments in registers and executes int 0x80 (or syscall).

; read(fd, buffer, 1024) issued through the int 0x80 entry:
; eax carries the system-call number, ebx/ecx/edx the three arguments.
mov eax, 3        ; sys_read
mov ebx, fd       ; file descriptor
mov ecx, buffer   ; destination address
mov edx, 1024     ; byte count
int 0x80          ; trigger soft interrupt

4.3 Kernel Processing

/*
 * Simplified system-call entry point: saves the caller's context, calls the
 * sys_call_table entry selected by eax, then restores the context.
 * NOTE(review): pseudo-code - eax presumably stands for the saved register
 * holding the system-call number; no range check on the index and no
 * hand-back of the handler's return value are shown here.
 */
asmlinkage void system_call(void){
    save_context();
    sys_call_table[eax]();
    restore_context();
}

4.4 Return to User Space

After the kernel finishes, it restores the saved registers, switches back to Ring 3, and the wrapper interprets the return value, setting errno on error.

#include <stdio.h>
#include <errno.h>
/*
 * Demonstrates errno reporting: tries to open nonexist.txt and prints the
 * errno value when that fails; otherwise reads up to 1024 bytes.
 *
 * Returns 0 on success, 1 if the file cannot be opened or a read error
 * occurs.
 */
int main(void) {
    char buf[1024];

    FILE *f = fopen("nonexist.txt", "r");
    if (!f) {
        /* Capture errno first: later library calls are free to change it. */
        int saved_errno = errno;
        printf("Open failed, error: %d\n", saved_errno);
        return 1;
    }

    size_t n = fread(buf, 1, sizeof buf, f);
    if (n < sizeof buf && ferror(f)) {
        perror("fread");
        fclose(f);
        return 1;
    }

    fclose(f);
    return 0;
}

5. Representative Case Studies

Case 1 – Process Management

#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>
/*
 * Case 1: fork a child that replaces itself with "ls -l"; the parent waits
 * for it and prints its exit status.
 *
 * Returns 0 on success, 1 if fork() or wait() fails. The child exits with
 * status 1 if execl() fails.
 */
int main(void) {
    pid_t pid = fork();
    if (pid < 0) {
        perror("fork");
        return 1;
    }

    if (pid == 0) {
        printf("Child PID %d\n", (int)getpid());
        /* Buffered output does not survive the exec; flush it first. */
        fflush(stdout);
        /* The terminating null of a variadic exec call must be a char *. */
        execl("/bin/ls", "ls", "-l", (char *)NULL);
        perror("execl");
        _exit(1); /* declared in <unistd.h>; stdout was already flushed above */
    }

    printf("Parent PID %d\n", (int)getpid());

    int status;
    if (wait(&status) == -1) {
        perror("wait");
        return 1;
    }
    if (WIFEXITED(status)) {
        printf("Child exited with %d\n", WEXITSTATUS(status));
    }
    return 0;
}

Case 2 – File I/O

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
/*
 * Case 2: copy up to the first 1024 bytes of test.txt into output.txt.
 *
 * Returns 0 on success, 1 if any open(), read(), write() or close() fails.
 */
int main(void) {
    char buf[1024];

    int fd = open("test.txt", O_RDONLY);
    if (fd == -1) {
        perror("open test.txt");
        return 1;
    }

    ssize_t n = read(fd, buf, sizeof buf);
    if (n < 0) {
        /* Without this check, -1 would reach write() as a huge size_t. */
        perror("read");
        close(fd);
        return 1;
    }
    close(fd);

    /* O_TRUNC so that an older, longer output.txt leaves no stale tail. */
    fd = open("output.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd == -1) {
        perror("open output.txt");
        return 1;
    }

    int rc = 0;
    ssize_t done = 0;
    while (done < n) {
        /* write() may transfer fewer bytes than asked; continue with the rest. */
        ssize_t w = write(fd, buf + done, (size_t)(n - done));
        if (w < 0) {
            perror("write");
            rc = 1;
            break;
        }
        done += w;
    }

    if (close(fd) == -1) {
        perror("close");
        rc = 1;
    }
    return rc;
}

Case 3 – Memory Management

#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <string.h>
int main(){
    // brk via sbrk
    void *start = sbrk(0);
    sbrk(1024);
    printf("Heap after +1024: %p
", sbrk(0));
    strcpy(start, "brk test");
    printf("Heap content: %s
", (char*)start);
    sbrk(-1024);
    // mmap anonymous
    void *addr = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    if (addr == MAP_FAILED) return 1;
    strcpy(addr, "mmap test");
    printf("mmap content: %s
", (char*)addr);
    munmap(addr, 4096);
    return 0;
}
System call diagram
System call diagram
memory management, Kernel, Process Management, Linux, C programming, system calls
Deepin Linux
Written by

Deepin Linux

Research areas: Windows & Linux platforms, C/C++ backend development, embedded systems and Linux kernel, etc.

0 followers
Reader feedback

How this landed with the community

Sign in to like

Rate this article

Was this worth your time?

Sign in to rate
Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.