How Linux Kernel Builds Slab Caches: A Deep Dive into kmem_cache Creation
This article walks through the Linux kernel's slab allocator (the default SLUB implementation), explaining how kmem_cache and kmem_cache_node structures are created, how slab objects are laid out in memory, how the required number of pages is calculated, and how the allocator is bootstrapped to avoid circular dependencies.
1. Starting the Code Walkthrough
In the previous article we described the overall architecture of slab caches and their allocation/release principles. Now we will examine the actual kernel source (Linux 5.4) that implements slab cache creation, starting from the kmem_cache_create interface.
2. kmem_cache_create Interface
The entry point is:
struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
        unsigned int align, slab_flags_t flags, void (*ctor)(void *))
{
    return kmem_cache_create_usercopy(name, size, align, flags, 0, 0, ctor);
}

The parameters specify the cache name, object size, alignment, flags, and an optional constructor.
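Before diving into the implementation, here is a minimal usage sketch from a driver's point of view; struct foo, foo_cache, and the module names are invented for illustration:

#include <linux/module.h>
#include <linux/slab.h>

/* Hypothetical object type managed by its own slab cache. */
struct foo {
    int id;
    char payload[48];
};

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
    struct foo *f;

    /* One cache, sized and aligned for struct foo, no constructor. */
    foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
                                  0, SLAB_HWCACHE_ALIGN, NULL);
    if (!foo_cache)
        return -ENOMEM;

    /* Objects are now carved out of foo_cache's slabs. */
    f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
    if (f)
        kmem_cache_free(foo_cache, f);
    return 0;
}

static void __exit foo_exit(void)
{
    kmem_cache_destroy(foo_cache);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");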
3. kmem_cache_create_usercopy
struct kmem_cache *kmem_cache_create_usercopy(const char *name,
        unsigned int size, unsigned int align, slab_flags_t flags,
        unsigned int useroffset, unsigned int usersize,
        void (*ctor)(void *))
{
    struct kmem_cache *s = NULL;
    const char *cache_name;
    int err;

    /* acquire global locks */
    get_online_cpus();
    get_online_mems();
    memcg_get_cache_ids();

    mutex_lock(&slab_mutex);

    /* sanity checks */
    err = kmem_cache_sanity_check(name, size);
    if (err)
        goto out_unlock;

    /* flag validation */
    if (flags & ~SLAB_FLAGS_PERMITTED) {
        err = -EINVAL;
        goto out_unlock;
    }
    flags &= CACHE_CREATE_MASK;

    /* useroffset/usersize validation */
    if (WARN_ON(!usersize && useroffset) ||
        WARN_ON(size < usersize || size - usersize < useroffset))
        usersize = useroffset = 0;

    /* reuse an existing cache if possible */
    if (!usersize)
        s = __kmem_cache_alias(name, size, align, flags, ctor);
    if (s)
        goto out_unlock;

    /* allocate the cache name string */
    cache_name = kstrdup_const(name, GFP_KERNEL);
    if (!cache_name) {
        err = -ENOMEM;
        goto out_unlock;
    }

    /* create the new cache */
    s = create_cache(cache_name, size,
                     calculate_alignment(flags, align, size),
                     flags, useroffset, usersize, ctor, NULL, NULL);
    if (IS_ERR(s)) {
        err = PTR_ERR(s);
        kfree_const(cache_name);
    }

out_unlock:
    mutex_unlock(&slab_mutex);

    memcg_put_cache_ids();
    put_online_mems();
    put_online_cpus();

    if (err) {
        if (flags & SLAB_PANIC)
            panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
                  name, err);
        else {
            pr_warn("kmem_cache_create(%s) failed with error %d\n",
                    name, err);
            dump_stack();
        }
        return NULL;
    }
    return s;
}

This function performs sanity checks, validates flags, tries to reuse an existing cache via __kmem_cache_alias, and finally calls create_cache to allocate a new slab cache.
4. create_cache – Building the Slab Skeleton
static struct kmem_cache *create_cache(const char *name,
        unsigned int object_size, unsigned int align, slab_flags_t flags,
        unsigned int useroffset, unsigned int usersize,
        void (*ctor)(void *), struct mem_cgroup *memcg,
        struct kmem_cache *root_cache)
{
    struct kmem_cache *s;
    int err;

    /* allocate the kmem_cache structure from its own cache */
    err = -ENOMEM;
    s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
    if (!s)
        goto out;

    /* initialise core fields */
    s->name = name;
    s->size = s->object_size = object_size;
    s->align = align;
    s->ctor = ctor;
    s->useroffset = useroffset;
    s->usersize = usersize;

    /* initialise the rest of the cache */
    err = __kmem_cache_create(s, flags);
    if (err)
        goto out_free_cache;

    s->refcount = 1;
    list_add(&s->list, &slab_caches);
out:
    if (err)
        return ERR_PTR(err);
    return s;

out_free_cache:
    kmem_cache_free(kmem_cache, s);
    goto out;
}

The cache structure itself is allocated from the special kmem_cache cache (which will be bootstrapped later, see section 12). After filling in the basic fields, the heavy lifting is done by __kmem_cache_create.
5. __kmem_cache_create – Core Initialisation
int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
{
    int err;

    err = kmem_cache_open(s, flags);
    if (err)
        return err;

    if (slab_state <= UP)
        return 0;

    err = sysfs_slab_add(s);
    if (err)
        __kmem_cache_release(s);

    return err;
}

This function creates the per-CPU and per-NUMA-node structures, calculates the object layout, and finally registers the cache in /sys/kernel/slab.
6. kmem_cache_open – Building the Cache Skeleton
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
    /* calculate the object layout */
    if (!calculate_sizes(s, -1))
        goto error;

    /* set min_partial */
    set_min_partial(s, ilog2(s->size) / 2);

    /* set cpu_partial */
    set_cpu_partial(s);

    /* allocate per-node structures */
    if (!init_kmem_cache_nodes(s))
        goto error;

    /* allocate per-cpu structures */
    if (alloc_kmem_cache_cpus(s))
        return 0;

error:
    return -ENOMEM;
}

The function calls several helpers that compute the final object size and alignment and then allocate the per-NUMA-node caches and the per-CPU caches.
6.1 calculate_sizes – Object Layout
static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
    slab_flags_t flags = s->flags;
    unsigned int size = s->object_size;
    unsigned int order;

    /* align to word size */
    size = ALIGN(size, sizeof(void *));

#ifdef CONFIG_SLUB_DEBUG
    if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) && !s->ctor)
        s->flags |= __OBJECT_POISON;
    else
        s->flags &= ~__OBJECT_POISON;

    if ((flags & SLAB_RED_ZONE) && size == s->object_size)
        size += sizeof(void *);
#endif

    s->inuse = size;

    if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor) {
        s->offset = size;
        size += sizeof(void *);
    }

#ifdef CONFIG_SLUB_DEBUG
    if (flags & SLAB_STORE_USER)
        size += 2 * sizeof(struct track);
#endif

#ifdef CONFIG_SLUB_DEBUG
    if (flags & SLAB_RED_ZONE) {
        size += sizeof(void *);

        s->red_left_pad = sizeof(void *);
        s->red_left_pad = ALIGN(s->red_left_pad, s->align);
        size += s->red_left_pad;
    }
#endif

    size = ALIGN(size, s->align);
    s->size = size;

    if (forced_order >= 0)
        order = forced_order;
    else
        order = calculate_order(size);
    if ((int)order < 0)
        return 0;

    s->allocflags = 0;
    if (order)
        s->allocflags |= __GFP_COMP;
    if (s->flags & SLAB_CACHE_DMA)
        s->allocflags |= GFP_DMA;
    if (s->flags & SLAB_CACHE_DMA32)
        s->allocflags |= GFP_DMA32;
    if (s->flags & SLAB_RECLAIM_ACCOUNT)
        s->allocflags |= __GFP_RECLAIMABLE;

    s->oo = oo_make(order, size);
    s->min = oo_make(get_order(size), size);
    if (oo_objects(s->oo) > oo_objects(s->max))
        s->max = s->oo;

    return !!oo_objects(s->oo);
}

This routine aligns the object size, adds optional red-zone or poisoning bytes, reserves space for a free pointer, and finally computes the number of pages (order) needed for a slab.
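To make the arithmetic concrete, here is a hypothetical walk through calculate_sizes for a small object; the numbers (a 24-byte object, 8-byte pointers, 8-byte alignment, a cache created with a constructor and no debug flags) are invented for illustration:

/*
 * Hypothetical example: object_size = 24, sizeof(void *) = 8,
 * s->align = 8, a ctor is supplied, no SLUB debug flags.
 *
 *   size   = ALIGN(24, 8) = 24       word-aligned payload
 *   inuse  = 24                      bytes usable by the caller
 *   offset = 24, size += 8  -> 32    because a ctor initialises the object,
 *                                    the free pointer cannot live inside it
 *                                    and is stored right after it instead
 *   size   = ALIGN(32, 8) = 32       final per-object footprint
 *
 *   0                       24        32
 *   +-----------------------+---------+
 *   | object payload        | freeptr |
 *   +-----------------------+---------+
 */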
6.2 calculate_order – Determining Page Count
static inline int calculate_order(unsigned int size)
{
    unsigned int order;
    unsigned int min_objects;
    unsigned int max_objects;

    /* derive the minimum object count from the number of CPUs */
    min_objects = slub_min_objects;
    if (!min_objects)
        min_objects = 4 * (fls(nr_cpu_ids) + 1);
    max_objects = order_objects(slub_max_order, size);
    min_objects = min(min_objects, max_objects);

    while (min_objects > 1) {
        unsigned int fraction;

        fraction = 16;
        while (fraction >= 4) {
            order = slab_order(size, min_objects, slub_max_order, fraction);
            if (order <= slub_max_order)
                return order;
            fraction /= 2;
        }
        min_objects--;
    }

    /* could not place multiple objects: try a single object per slab */
    order = slab_order(size, 1, slub_max_order, 1);
    if (order <= slub_max_order)
        return order;

    /* last resort: allow orders beyond slub_max_order */
    order = slab_order(size, 1, MAX_ORDER, 1);
    if (order < MAX_ORDER)
        return order;

    return -ENOSYS;
}

The function tries to minimise fragmentation while respecting a minimum number of objects per slab. If necessary it relaxes the fragmentation fraction and finally falls back to a single-object slab.
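As a quick sanity check on the heuristic, assume a machine with 16 possible CPUs and slub_min_objects left at its default of 0; these numbers are illustrative only, not taken from the article:

/*
 * Illustrative numbers:
 *   nr_cpu_ids = 16  ->  fls(16) = 5  ->  min_objects = 4 * (5 + 1) = 24
 *
 * calculate_order() then asks slab_order() for an order that fits at
 * least 24 objects while wasting no more than 1/16 of the slab; if no
 * order up to slub_max_order qualifies, the waste limit is relaxed to
 * 1/8 and then 1/4, after which min_objects is lowered one step at a time.
 */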
6.3 slab_order – Fragmentation Check
#define MAX_OBJS_PER_PAGE 32767

static unsigned int slab_order(unsigned int size, unsigned int min_objects,
        unsigned int max_order, unsigned int fract_leftover)
{
    unsigned int min_order = slub_min_order;
    unsigned int order;

    for (order = max(min_order, (unsigned int)get_order(min_objects * size));
            order <= max_order; order++) {

        unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
        unsigned int rem = slab_size % size;

        if (rem <= slab_size / fract_leftover)
            break;
    }

    return order;
}

It returns the smallest order at which the leftover memory after fitting whole objects does not exceed slab_size / fract_leftover.
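The sketch below mimics this waste check in ordinary user-space C purely as a worked example; PAGE_SIZE, the order limits, and the 700-byte object size are assumptions for the illustration, not values from the kernel source above.

#include <stdio.h>

/* Assumed constants for the example (not taken from the kernel). */
#define PAGE_SIZE      4096u
#define SLUB_MIN_ORDER 0u
#define SLUB_MAX_ORDER 3u

/* Stand-alone mimic of the waste check performed by slab_order(). */
static unsigned int mimic_slab_order(unsigned int size,
                                     unsigned int min_objects,
                                     unsigned int fract_leftover)
{
    unsigned int order;

    for (order = SLUB_MIN_ORDER; order <= SLUB_MAX_ORDER; order++) {
        unsigned int slab_size = PAGE_SIZE << order;

        if (slab_size < min_objects * size)
            continue;   /* too small to hold min_objects objects */

        /* accept if the unusable tail is at most 1/fract_leftover */
        if (slab_size % size <= slab_size / fract_leftover)
            break;
    }
    return order;
}

int main(void)
{
    /*
     * 700-byte objects, waste limit 1/16:
     *   order 0: 4096 % 700 = 596  > 4096/16 = 256  -> rejected
     *   order 1: 8192 % 700 = 492 <= 8192/16 = 512  -> accepted
     */
    printf("order = %u\n", mimic_slab_order(700, 1, 16));
    return 0;
}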
7. set_min_partial – Node Cache Limits
#define MIN_PARTIAL 5
#define MAX_PARTIAL 10

static void set_min_partial(struct kmem_cache *s, unsigned long min)
{
    if (min < MIN_PARTIAL)
        min = MIN_PARTIAL;
    else if (min > MAX_PARTIAL)
        min = MAX_PARTIAL;
    s->min_partial = min;
}

This sets the minimum number of partial slabs kept on a NUMA node's partial list; empty slabs are only returned to the page allocator once the list holds more than this many slabs.
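Plugging in a couple of illustrative sizes (not from the article), using the call set_min_partial(s, ilog2(s->size) / 2) made by kmem_cache_open():

/*
 *   s->size =  256 : ilog2(256)  / 2 = 4  -> clamped up to MIN_PARTIAL = 5
 *   s->size = 4096 : ilog2(4096) / 2 = 6  -> used as-is
 */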
8. set_cpu_partial – Per‑CPU Limits
#ifdef CONFIG_SLUB_CPU_PARTIAL
static void set_cpu_partial(struct kmem_cache *s)
{
    if (!kmem_cache_has_cpu_partial(s))
        s->cpu_partial = 0;
    else if (s->size >= PAGE_SIZE)
        s->cpu_partial = 2;
    else if (s->size >= 1024)
        s->cpu_partial = 6;
    else if (s->size >= 256)
        s->cpu_partial = 13;
    else
        s->cpu_partial = 30;
}
#endif

This determines how many free objects a per-CPU partial list may hold before it is flushed to the node cache.
9. init_kmem_cache_nodes – Allocating NUMA Node Caches
static int init_kmem_cache_nodes(struct kmem_cache *s)
{
    int node;

    for_each_node_state(node, N_NORMAL_MEMORY) {
        struct kmem_cache_node *n;

        if (slab_state == DOWN) {
            early_kmem_cache_node_alloc(node);
            continue;
        }
        n = kmem_cache_alloc_node(kmem_cache_node, GFP_KERNEL, node);
        if (!n) {
            free_kmem_cache_nodes(s);
            return 0;
        }
        init_kmem_cache_node(n);
        s->node[node] = n;
    }
    return 1;
}

During early boot (slab_state == DOWN) the helper early_kmem_cache_node_alloc creates a single slab for each node; later the normal allocator is used.
9.1 early_kmem_cache_node_alloc
static void early_kmem_cache_node_alloc(int node)
{
    struct page *page;
    struct kmem_cache_node *n;

    page = new_slab(kmem_cache_node, GFP_NOWAIT, node);

    n = page->freelist;
#ifdef CONFIG_SLUB_DEBUG
    init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
#endif
    page->freelist = get_freepointer(kmem_cache_node, n);
    page->inuse = 1;
    page->frozen = 0;
    kmem_cache_node->node[node] = n;
    init_kmem_cache_node(n);
    inc_slabs_node(kmem_cache_node, node, page->objects);

    __add_partial(n, page, DEACTIVATE_TO_HEAD);
}

This allocates a fresh slab for kmem_cache_node, takes the first object from it as that node's kmem_cache_node structure, and places the slab page on the node's partial list.
10. alloc_kmem_cache_cpus – Per‑CPU Cache Allocation
static int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
    /* aligned to 2 * sizeof(void *) for the cmpxchg-double fast path */
    s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
                                 2 * sizeof(void *));

    init_kmem_cache_cpus(s);
    return 1;
}

static void init_kmem_cache_cpus(struct kmem_cache *s)
{
    int cpu;

    for_each_possible_cpu(cpu)
        per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
}

Each CPU gets its own kmem_cache_cpu structure, which holds the fast-path freelist and the slab that CPU is currently allocating from.
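As a reminder of what the fast path owns per CPU, the structure being allocated here looks roughly like this in Linux 5.4 (trimmed; the stats array is omitted):

struct kmem_cache_cpu {
    void **freelist;        /* pointer to the next available object */
    unsigned long tid;      /* globally unique transaction id */
    struct page *page;      /* slab the CPU is currently allocating from */
#ifdef CONFIG_SLUB_CPU_PARTIAL
    struct page *partial;   /* partially allocated, frozen slabs */
#endif
};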
11. The Final Slab Cache Structure
After all the initialisation steps the cache is complete: the kmem_cache structure itself, one kmem_cache_cpu per CPU serving the allocation fast path, and one kmem_cache_node per NUMA node holding that node's partial slab list.
12. Bootstrapping the First Slab Caches
The kernel needs a slab cache to allocate kmem_cache and kmem_cache_node objects, but those caches themselves do not exist yet. The solution is to define two static objects:
static __initdata struct kmem_cache boot_kmem_cache, boot_kmem_cache_node;

These are initialised with create_boot_cache before the allocator is fully operational.
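In mm/slub.c this happens inside kmem_cache_init(); the trimmed sketch below (Linux 5.4, error handling and unrelated setup omitted) shows the ordering, with create_boot_cache and bootstrap discussed next:

void __init kmem_cache_init(void)
{
    static __initdata struct kmem_cache boot_kmem_cache, boot_kmem_cache_node;

    /* point the globals at the static boot caches for now */
    kmem_cache_node = &boot_kmem_cache_node;
    kmem_cache = &boot_kmem_cache;

    /* 1) cache for kmem_cache_node objects (uses early_kmem_cache_node_alloc) */
    create_boot_cache(kmem_cache_node, "kmem_cache_node",
                      sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);

    /* per-node structures can now be allocated the normal way */
    slab_state = PARTIAL;

    /* 2) cache for struct kmem_cache itself */
    create_boot_cache(kmem_cache, "kmem_cache",
                      offsetof(struct kmem_cache, node) +
                          nr_node_ids * sizeof(struct kmem_cache_node *),
                      SLAB_HWCACHE_ALIGN, 0, 0);

    /* 3) finally both static caches are handed to bootstrap(), shown below */
}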
static void __init create_boot_cache(struct kmem_cache *s, const char *name,
        unsigned int size, slab_flags_t flags,
        unsigned int useroffset, unsigned int usersize)
{
    int err;
    unsigned int align = ARCH_KMALLOC_MINALIGN;

    s->name = name;
    s->size = s->object_size = size;

    /* power-of-two sizes get natural alignment */
    if (is_power_of_2(size))
        align = max(align, size);
    s->align = calculate_alignment(flags, align, size);

    s->useroffset = useroffset;
    s->usersize = usersize;

    err = __kmem_cache_create(s, flags);

    s->refcount = -1;   /* exempt from merging */
}

First boot_kmem_cache_node is created (its per-node structures come from early_kmem_cache_node_alloc, since no cache exists yet), then boot_kmem_cache. After both are ready, the kernel copies them into the real global caches:
kmem_cache = bootstrap(&boot_kmem_cache);
kmem_cache_node = bootstrap(&boot_kmem_cache_node);

The bootstrap function copies the static structures, fixes up internal pointers, and adds the new caches to the global slab_caches list.
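For reference, a simplified sketch of what bootstrap() does in mm/slub.c (Linux 5.4); memcg bookkeeping and debug-only paths are left out:

static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
{
    int node;
    struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
    struct kmem_cache_node *n;

    /* copy the statically allocated boot cache into the dynamic one */
    memcpy(s, static_cache, kmem_cache->object_size);

    /* only the boot CPU is running, so flushing its slab is enough */
    __flush_cpu_slab(s, smp_processor_id());

    /* repoint every partial slab page at the new kmem_cache */
    for_each_kmem_cache_node(s, node, n) {
        struct page *p;

        list_for_each_entry(p, &n->partial, slab_list)
            p->slab_cache = s;
    }

    list_add(&s->list, &slab_caches);
    return s;
}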
13. Summary
The Linux slab allocator builds a cache by:
Performing sanity checks and flag validation.
Calculating the final object size, alignment, and optional debugging padding.
Choosing an order that balances fragmentation and minimum object count.
Creating per‑CPU and per‑NUMA node structures.
Registering the cache in /sys/kernel/slab.
Bootstrapping uses two static caches to break the circular dependency between kmem_cache and kmem_cache_node, after which the allocator can dynamically create any further caches.