Understanding Linux System Calls: Core Logic, Mechanisms, and Practical Examples
This comprehensive guide explains Linux system calls—the sole interface between user‑space programs and the kernel—covering their purpose, core logic for process, file, and memory management, the underlying interrupt mechanisms, parameter handling, and real‑world C code examples.
1. Overview of Linux System Calls
System calls are the only legitimate interface for user‑space programs to request services from the kernel. They provide controlled access to hardware resources, process management, memory allocation, and file operations, ensuring security and stability.
2. Core System Calls
2.1 Process‑Management Calls
fork creates a new task_struct, copies most of the parent’s state, assigns a new PID, and uses copy‑on‑write (COW) for memory pages. The kernel implementation relies on do_fork (renamed kernel_clone in Linux 5.10) and copy_process.
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

/*
 * fork() demo: the child and the parent each report their own PID and the
 * PID of the other process.
 *
 * Returns 0 on success, 1 if fork() fails.
 */
int main(void)
{
    pid_t pid = fork();

    if (pid < 0) {
        perror("fork failed");
        return 1;
    }

    if (pid == 0) {
        printf("I am the child, PID = %d, parent PID = %d\n",
               (int)getpid(), (int)getppid());
        return 0;
    }

    printf("I am the parent, PID = %d, child PID = %d\n",
           (int)getpid(), (int)pid);

    /*
     * Wait for the child: this reaps it and guarantees the parent is still
     * alive when the child calls getppid(), so the printed parent PID is
     * the real one.
     */
    if (wait(NULL) < 0)
        perror("wait");
    return 0;
}
execve replaces the current program image with a new executable. The kernel validates the path, checks the ELF format, creates a new memory layout, clears the old user space, and sets the entry point to the new program.
#include <stdio.h>
#include <unistd.h>

/*
 * execve() demo: replace this process image with /bin/ls -l.
 *
 * Returns only if execve() fails, in which case the exit status is 1.
 */
int main(void)
{
    char *argv[] = {"/bin/ls", "-l", NULL};
    char *envp[] = {NULL};

    printf("Preparing to exec new program...\n");
    /*
     * stdio buffers live in the current image; flush them now or the
     * message is lost when stdout is a pipe or a file.
     */
    fflush(stdout);

    execve("/bin/ls", argv, envp);

    /* execve() returns only on failure. */
    perror("execve failed");
    return 1;
}
pipe creates a pair of file descriptors for one‑way communication between related processes.
#include <stdio.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

/*
 * pipe() demo: the child writes a message into the pipe, the parent reads
 * it and prints it.
 *
 * Returns 0 on success, 1 on any system-call failure.
 */
int main(void)
{
    int fd[2];
    char buf[100];
    pid_t pid;

    if (pipe(fd) < 0) {
        perror("pipe");
        return 1;
    }

    pid = fork();
    if (pid < 0) {
        perror("fork");
        return 1;
    }

    if (pid == 0) {
        const char *msg = "Hello from child";

        close(fd[0]);                       /* child only writes */
        if (write(fd[1], msg, strlen(msg)) < 0) {
            perror("write");
            return 1;
        }
        close(fd[1]);
        return 0;
    }

    close(fd[1]);                           /* parent only reads */

    /* Leave room for the terminator: read() does not add one. */
    ssize_t n = read(fd[0], buf, sizeof(buf) - 1);
    if (n < 0) {
        perror("read");
    } else {
        buf[n] = '\0';
        printf("Parent received: %s\n", buf);
    }
    close(fd[0]);
    wait(NULL);
    return n < 0 ? 1 : 0;
}
semget / semop provide process synchronization. A typical usage creates a semaphore, performs a P (decrement) operation, enters a critical section, then a V (increment) operation.
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/types.h>

/*
 * The caller must define union semun itself; semctl() reads its fourth
 * argument as this union, so declare the full layout rather than only
 * the member that is used.
 */
union semun {
    int val;
    struct semid_ds *buf;
    unsigned short *array;
};

/*
 * System V semaphore demo: create a one-element set initialised to 1,
 * perform P (decrement), enter the critical section, perform V (increment),
 * then remove the set.
 *
 * Returns 0 on success, 1 on any system-call failure.
 */
int main(void)
{
    struct sembuf p = {0, -1, SEM_UNDO};
    struct sembuf v = {0, +1, SEM_UNDO};
    union semun su = { .val = 1 };
    int rc = 1;

    int semid = semget(IPC_PRIVATE, 1, 0666 | IPC_CREAT);
    if (semid < 0) {
        perror("semget");
        return 1;
    }

    if (semctl(semid, 0, SETVAL, su) < 0) {
        perror("semctl SETVAL");
        goto out;
    }
    if (semop(semid, &p, 1) < 0) {
        perror("semop P");
        goto out;
    }
    printf("Entered critical section\n");
    if (semop(semid, &v, 1) < 0) {
        perror("semop V");
        goto out;
    }
    rc = 0;

out:
    /* Always remove the set: System V IPC objects outlive the process. */
    semctl(semid, 0, IPC_RMID);
    return rc;
}
shmget / shmat allow multiple processes to map the same physical memory region.
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

enum { SEGMENT_SIZE = 1024 };

/*
 * System V shared-memory demo: create a segment, attach it, write a string
 * into it, print it back, then detach and remove the segment.
 *
 * Returns 0 on success, 1 if the segment cannot be created or attached.
 */
int main(void)
{
    int shmid = shmget(IPC_PRIVATE, SEGMENT_SIZE, 0666 | IPC_CREAT);
    if (shmid < 0) {
        perror("shmget");
        return 1;
    }

    /* shmat() reports failure as (void *)-1, not NULL. */
    char *addr = shmat(shmid, NULL, 0);
    if (addr == (void *)-1) {
        perror("shmat");
        shmctl(shmid, IPC_RMID, NULL);
        return 1;
    }

    snprintf(addr, SEGMENT_SIZE, "Hello shared memory!");
    printf("Shared memory content: %s\n", addr);

    shmdt(addr);
    shmctl(shmid, IPC_RMID, NULL);
    return 0;
}
2.2 File‑System Calls
open validates the pathname, checks permissions, and returns a file descriptor.
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/*
 * open() demo: create (or truncate) test.txt for reading and writing and
 * print the descriptor the kernel handed back.
 *
 * Returns 0 on success, 1 if the file cannot be opened.
 */
int main(void)
{
    int fd = open("test.txt", O_RDWR | O_CREAT | O_TRUNC, 0644);

    if (fd < 0) {
        perror("open failed");
        return 1;
    }
    printf("fd = %d\n", fd);
    close(fd);                  /* declared in <unistd.h> */
    return 0;
}
read validates the descriptor, locates the file offset, copies data from the page cache (or triggers I/O), updates the file pointer, and returns the byte count.
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/*
 * read() demo: read up to one buffer's worth of test.txt and print it.
 *
 * Returns 0 on success, 1 if the file cannot be opened or read.
 */
int main(void)
{
    char buf[1024];

    int fd = open("test.txt", O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    /* Reserve one byte: read() never NUL-terminates what it returns. */
    ssize_t n = read(fd, buf, sizeof(buf) - 1);
    if (n < 0) {
        perror("read");
        close(fd);
        return 1;
    }
    buf[n] = '\0';

    printf("read %zd bytes: %s\n", n, buf);
    close(fd);
    return 0;
}
write copies user data into the page cache, marks pages dirty, and later flushes them to disk.
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/*
 * write() demo: write one line of text at the start of test.txt, creating
 * the file if it does not exist.
 *
 * Returns 0 on success, 1 if the file cannot be opened or fully written.
 */
int main(void)
{
    const char *str = "Hello Linux File System\n";
    size_t len = strlen(str);   /* strlen() needs <string.h> */
    int rc = 0;

    int fd = open("test.txt", O_WRONLY | O_CREAT, 0644);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    ssize_t written = write(fd, str, len);
    if (written < 0) {
        perror("write");
        rc = 1;
    } else if ((size_t)written != len) {
        /* write() may transfer fewer bytes than requested. */
        fprintf(stderr, "short write: %zd of %zu bytes\n", written, len);
        rc = 1;
    }

    close(fd);
    return rc;
}
creat is a shortcut for open with O_CREAT | O_WRONLY | O_TRUNC. unlink removes a directory entry and frees the inode when the link count reaches zero. chmod changes file permission bits.
#include <fcntl.h>
int fd = creat("new_file.txt", 0644);
close(fd); #include <unistd.h>
unlink("test.txt"); #include <sys/stat.h>
chmod("test.txt", 0600);2.3 Memory‑Management Calls
brk / sbrk move the program break to grow or shrink the heap; they are used by malloc for small allocations.
#include <unistd.h>
#include <stdio.h>
int main(){
void *curr = sbrk(0);
printf("Current break: %p
", curr);
brk(curr + 1024);
printf("After expansion: %p
", sbrk(0));
brk(curr);
printf("After shrink: %p
", sbrk(0));
return 0;
}mmap creates a new virtual memory region, optionally mapping a file or allocating anonymous memory.
#include <sys/mman.h>
#include <stdio.h>
int main(){
void *mem = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (mem == MAP_FAILED) {
perror("mmap");
return 1;
}
printf("mmap address: %p
", mem);
sprintf(mem, "hello mmap");
printf("value: %s
", (char*)mem);
munmap(mem, 4096);
return 0;
}File mapping with mmap lets a process access file contents directly via memory, avoiding extra copies.
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

/*
 * File-backed mmap() demo: map test.txt shared and overwrite its first
 * bytes with a message through the mapping.
 *
 * The message is clipped to the current file size, because stores past the
 * end of the file are not written back. The text is copied without a
 * trailing NUL so the file stays plain text.
 *
 * Returns 0 on success, 1 on failure (including an empty file).
 */
int main(void)
{
    static const char msg[] = "hello mmap";
    struct stat st;
    size_t len;
    size_t n;
    char *map;
    int rc = 1;

    int fd = open("test.txt", O_RDWR);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    if (fstat(fd, &st) < 0) {
        perror("fstat");
        goto out_close;
    }
    if (st.st_size == 0) {
        /*
         * Touching a page that lies wholly beyond end-of-file raises
         * SIGBUS, so an empty file cannot be written through a mapping.
         */
        fprintf(stderr, "test.txt is empty; nothing to map\n");
        goto out_close;
    }

    len = (size_t)st.st_size;
    map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (map == MAP_FAILED) {
        perror("mmap");
        goto out_close;
    }

    n = sizeof(msg) - 1;
    if (n > len)
        n = len;
    memcpy(map, msg, n);

    if (munmap(map, len) < 0)
        perror("munmap");
    else
        rc = 0;

out_close:
    close(fd);
    return rc;
}
3. System‑Call Mechanism
3.1 User‑Space and Kernel‑Space
Modern CPUs provide privilege rings. Ring 0 runs the kernel with full hardware access; Ring 3 runs user programs with restricted permissions. System calls act as a controlled gateway from Ring 3 to Ring 0.
3.2 Interrupt / Syscall Entry
Trigger kernel entry: historically the software interrupt int 0x80; on x86‑64, the dedicated syscall instruction.
CPU saves the user context (registers, PC).
Switch privilege level to Ring 0.
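For int 0x80, look up the interrupt vector in the IDT (the syscall instruction instead takes its entry address from a model‑specific register).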
Jump to the system‑call handler.
3.3 System‑Call Numbers
Each call has a unique identifier used to index sys_call_table in the kernel.
/*
 * Illustrative excerpt of the kernel's dispatch table: the array index is
 * the system-call number and each slot holds that call's handler.
 * Designated initializers tie each handler to its number explicitly.
 * NOTE(review): numbers 0-4 as listed look like the 32-bit x86 numbering;
 * other architectures may number the calls differently -- confirm before
 * reusing these values.
 */
void *sys_call_table[] = {
[0] = sys_restart_syscall,
[1] = sys_exit,
[2] = sys_fork,
[3] = sys_read,
[4] = sys_write,
// ... remaining entries omitted
};3.4 Parameter Passing
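Arguments are passed in registers: on 32‑bit x86 the call number goes in eax and the arguments in ebx, ecx, edx, esi, edi, and ebp; on x86‑64 the number goes in rax and the arguments in rdi, rsi, rdx, r10, r8, and r9. Larger structures are passed via memory buffers, by pointer. The kernel validates all arguments before use.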
4. System‑Call Execution Flow
4.1 Application Request
Library wrappers (e.g., fopen, open) prepare arguments and invoke the underlying system call.
#include <stdio.h>

/*
 * stdio demo: fopen()/fread()/fclose() are library calls that end up
 * issuing the open, read and close system calls.
 *
 * Returns 0 on success, 1 if example.txt cannot be opened.
 */
int main(void)
{
    char buffer[1024];

    FILE *f = fopen("example.txt", "r");
    if (f == NULL) {
        /* Passing a NULL stream to fread()/fclose() is undefined. */
        perror("fopen");
        return 1;
    }

    size_t n = fread(buffer, 1, sizeof(buffer), f);
    if (n == 0 && ferror(f))
        perror("fread");

    fclose(f);
    return 0;
}
4.2 Register Setup and Interrupt
For a read call on x86, the wrapper places the syscall number and arguments in registers and executes int 0x80 (or syscall).
; Issue read(fd, buffer, 1024) through the int 0x80 gate:
; eax carries the system-call number, ebx/ecx/edx the three arguments.
mov eax, 3 ; sys_read
mov ebx, fd ; file descriptor
mov ecx, buffer ; destination address
mov edx, 1024 ; byte count
int 0x80 ; trigger soft interrupt4.3 Kernel Processing
/*
 * Simplified sketch of the kernel-side entry point (pseudo-code, not
 * compilable as written): save the caller's context, dispatch through
 * sys_call_table using the call number, then restore the context.
 */
asmlinkage void system_call(void){
save_context();
/* eax stands for the register that carries the system-call number. */
/* NOTE(review): no range check on eax, and the handler's result is
 * dropped here -- presumably omitted for brevity; confirm. */
sys_call_table[eax]();
restore_context();
}4.4 Return to User Space
After the kernel finishes, it restores the saved registers, switches back to Ring 3, and the wrapper interprets the return value, setting errno on error.
#include <errno.h>
#include <stdio.h>

/*
 * errno demo: try to open a file that does not exist and report the error
 * number the library wrapper stored in errno.
 *
 * Returns 0 if the file could be opened and read, 1 otherwise.
 */
int main(void)
{
    char buf[1024];

    FILE *f = fopen("nonexist.txt", "r");
    if (!f) {
        /* Capture errno at once: later library calls may overwrite it. */
        int saved_errno = errno;

        printf("Open failed, error: %d\n", saved_errno);
        return 1;
    }

    size_t n = fread(buf, 1, sizeof(buf), f);
    if (n == 0 && ferror(f))
        perror("fread");

    fclose(f);
    return 0;
}
5. Representative Case Studies
Case 1 – Process Management
#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>
int main(){
pid_t pid = fork();
if (pid == 0) {
printf("Child PID %d
", getpid());
execl("/bin/ls", "ls", "-l", NULL);
perror("execl");
exit(1);
} else if (pid > 0) {
printf("Parent PID %d
", getpid());
int status;
wait(&status);
if (WIFEXITED(status))
printf("Child exited with %d
", WEXITSTATUS(status));
}
return 0;
}Case 2 – File I/O
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/*
 * File-I/O case study: copy up to the first 1024 bytes of test.txt into
 * output.txt.
 *
 * Returns 0 on success, 1 if any open, read or write fails.
 */
int main(void)
{
    char buf[1024];

    int fd = open("test.txt", O_RDONLY);
    if (fd == -1) {
        perror("open test.txt");
        return 1;
    }

    ssize_t n = read(fd, buf, sizeof(buf));
    close(fd);
    if (n < 0) {
        /* Never pass -1 on to write(): as a size_t it is a huge count. */
        perror("read");
        return 1;
    }

    /* O_TRUNC keeps stale bytes of an older, longer output.txt from surviving. */
    fd = open("output.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd == -1) {
        perror("open output.txt");
        return 1;
    }

    /* write() may transfer fewer bytes than requested; loop until done. */
    ssize_t done = 0;
    while (done < n) {
        ssize_t w = write(fd, buf + done, (size_t)(n - done));
        if (w < 0) {
            perror("write");
            close(fd);
            return 1;
        }
        done += w;
    }

    close(fd);
    return 0;
}
Case 3 – Memory Management
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <string.h>
int main(){
// brk via sbrk
void *start = sbrk(0);
sbrk(1024);
printf("Heap after +1024: %p
", sbrk(0));
strcpy(start, "brk test");
printf("Heap content: %s
", (char*)start);
sbrk(-1024);
// mmap anonymous
void *addr = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (addr == MAP_FAILED) return 1;
strcpy(addr, "mmap test");
printf("mmap content: %s
", (char*)addr);
munmap(addr, 4096);
return 0;
}Deepin Linux
Research areas: Windows & Linux platforms, C/C++ backend development, embedded systems and Linux kernel, etc.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
