slub: support concurrent local and remote frees and allocs on a slab
Avoid atomic overhead in slab_alloc and slab_free
SLUB needs to use the slab_lock for the per cpu slabs to synchronize with
potential kfree operations. This patch avoids that need by moving all free
objects onto a lockless_freelist. The regular freelist continues to exist
and will be used to free objects. So while we consume the
lockless_freelist the regular freelist may build up objects.
If we are out of objects on the lockless_freelist then we may check the
regular freelist. If it has objects then we move those over to the
lockless_freelist and do this again. There is a significant savings in
terms of atomic operations that have to be performed.
We can even free directly to the lockless_freelist if we know that we are
running on the same processor. So this speeds up short lived objects.
They may be allocated and freed without taking the slab_lock. This is
particular good for netperf.
In order to maximize the effect of the new faster hotpath we extract the
hottest performance pieces into inlined functions. These are then inlined
into kmem_cache_alloc and kmem_cache_free. So hotpath allocation and
freeing no longer requires a subroutine call within SLUB.
[I am not sure that it is worth doing this because it changes the easy to
read structure of slub just to reduce atomic ops. However, there is
someone out there with a benchmark on 4 way and 8 way processor systems
that seems to show a 5% regression vs. Slab. Seems that the regression is
due to increased atomic operations use vs. SLAB in SLUB). I wonder if
this is applicable or discernable at all in a real workload?]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/mm/slub.c b/mm/slub.c
index bd2efae..b07a1ca 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -81,10 +81,14 @@
* PageActive The slab is used as a cpu cache. Allocations
* may be performed from the slab. The slab is not
* on any slab list and cannot be moved onto one.
+ * The cpu slab may be equipped with an additioanl
+ * lockless_freelist that allows lockless access to
+ * free objects in addition to the regular freelist
+ * that requires the slab lock.
*
* PageError Slab requires special handling due to debug
* options set. This moves slab handling out of
- * the fast path.
+ * the fast path and disables lockless freelists.
*/
static inline int SlabDebug(struct page *page)
@@ -1014,6 +1018,7 @@
set_freepointer(s, last, NULL);
page->freelist = start;
+ page->lockless_freelist = NULL;
page->inuse = 0;
out:
if (flags & __GFP_WAIT)
@@ -1276,6 +1281,23 @@
*/
static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
{
+ /*
+ * Merge cpu freelist into freelist. Typically we get here
+ * because both freelists are empty. So this is unlikely
+ * to occur.
+ */
+ while (unlikely(page->lockless_freelist)) {
+ void **object;
+
+ /* Retrieve object from cpu_freelist */
+ object = page->lockless_freelist;
+ page->lockless_freelist = page->lockless_freelist[page->offset];
+
+ /* And put onto the regular freelist */
+ object[page->offset] = page->freelist;
+ page->freelist = object;
+ page->inuse--;
+ }
s->cpu_slab[cpu] = NULL;
ClearPageActive(page);
@@ -1322,47 +1344,46 @@
}
/*
- * slab_alloc is optimized to only modify two cachelines on the fast path
- * (aside from the stack):
+ * Slow path. The lockless freelist is empty or we need to perform
+ * debugging duties.
*
- * 1. The page struct
- * 2. The first cacheline of the object to be allocated.
+ * Interrupts are disabled.
*
- * The only other cache lines that are read (apart from code) is the
- * per cpu array in the kmem_cache struct.
+ * Processing is still very fast if new objects have been freed to the
+ * regular freelist. In that case we simply take over the regular freelist
+ * as the lockless freelist and zap the regular freelist.
*
- * Fastpath is not possible if we need to get a new slab or have
- * debugging enabled (which means all slabs are marked with SlabDebug)
+ * If that is not working then we fall back to the partial lists. We take the
+ * first element of the freelist as the object to allocate now and move the
+ * rest of the freelist to the lockless freelist.
+ *
+ * And if we were unable to get a new slab from the partial slab lists then
+ * we need to allocate a new slab. This is slowest path since we may sleep.
*/
-static void *slab_alloc(struct kmem_cache *s,
- gfp_t gfpflags, int node, void *addr)
+static void *__slab_alloc(struct kmem_cache *s,
+ gfp_t gfpflags, int node, void *addr, struct page *page)
{
- struct page *page;
void **object;
- unsigned long flags;
- int cpu;
+ int cpu = smp_processor_id();
- local_irq_save(flags);
- cpu = smp_processor_id();
- page = s->cpu_slab[cpu];
if (!page)
goto new_slab;
slab_lock(page);
if (unlikely(node != -1 && page_to_nid(page) != node))
goto another_slab;
-redo:
+load_freelist:
object = page->freelist;
if (unlikely(!object))
goto another_slab;
if (unlikely(SlabDebug(page)))
goto debug;
-have_object:
- page->inuse++;
- page->freelist = object[page->offset];
+ object = page->freelist;
+ page->lockless_freelist = object[page->offset];
+ page->inuse = s->objects;
+ page->freelist = NULL;
slab_unlock(page);
- local_irq_restore(flags);
return object;
another_slab:
@@ -1370,11 +1391,11 @@
new_slab:
page = get_partial(s, gfpflags, node);
- if (likely(page)) {
+ if (page) {
have_slab:
s->cpu_slab[cpu] = page;
SetPageActive(page);
- goto redo;
+ goto load_freelist;
}
page = new_slab(s, gfpflags, node);
@@ -1397,7 +1418,7 @@
discard_slab(s, page);
page = s->cpu_slab[cpu];
slab_lock(page);
- goto redo;
+ goto load_freelist;
}
/* New slab does not fit our expectations */
flush_slab(s, s->cpu_slab[cpu], cpu);
@@ -1405,16 +1426,52 @@
slab_lock(page);
goto have_slab;
}
- local_irq_restore(flags);
return NULL;
debug:
+ object = page->freelist;
if (!alloc_object_checks(s, page, object))
goto another_slab;
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_ALLOC, addr);
trace(s, page, object, 1);
init_object(s, object, 1);
- goto have_object;
+
+ page->inuse++;
+ page->freelist = object[page->offset];
+ slab_unlock(page);
+ return object;
+}
+
+/*
+ * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
+ * have the fastpath folded into their functions. So no function call
+ * overhead for requests that can be satisfied on the fastpath.
+ *
+ * The fastpath works by first checking if the lockless freelist can be used.
+ * If not then __slab_alloc is called for slow processing.
+ *
+ * Otherwise we can simply pick the next object from the lockless free list.
+ */
+static void __always_inline *slab_alloc(struct kmem_cache *s,
+ gfp_t gfpflags, int node, void *addr)
+{
+ struct page *page;
+ void **object;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ page = s->cpu_slab[smp_processor_id()];
+ if (unlikely(!page || !page->lockless_freelist ||
+ (node != -1 && page_to_nid(page) != node)))
+
+ object = __slab_alloc(s, gfpflags, node, addr, page);
+
+ else {
+ object = page->lockless_freelist;
+ page->lockless_freelist = object[page->offset];
+ }
+ local_irq_restore(flags);
+ return object;
}
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
@@ -1432,20 +1489,19 @@
#endif
/*
- * The fastpath only writes the cacheline of the page struct and the first
- * cacheline of the object.
+ * Slow patch handling. This may still be called frequently since objects
+ * have a longer lifetime than the cpu slabs in most processing loads.
*
- * We read the cpu_slab cacheline to check if the slab is the per cpu
- * slab for this processor.
+ * So we still attempt to reduce cache line usage. Just take the slab
+ * lock and free the item. If there is no additional partial page
+ * handling required then we can return immediately.
*/
-static void slab_free(struct kmem_cache *s, struct page *page,
+static void __slab_free(struct kmem_cache *s, struct page *page,
void *x, void *addr)
{
void *prior;
void **object = (void *)x;
- unsigned long flags;
- local_irq_save(flags);
slab_lock(page);
if (unlikely(SlabDebug(page)))
@@ -1475,7 +1531,6 @@
out_unlock:
slab_unlock(page);
- local_irq_restore(flags);
return;
slab_empty:
@@ -1487,7 +1542,6 @@
slab_unlock(page);
discard_slab(s, page);
- local_irq_restore(flags);
return;
debug:
@@ -1502,6 +1556,34 @@
goto checks_ok;
}
+/*
+ * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
+ * can perform fastpath freeing without additional function calls.
+ *
+ * The fastpath is only possible if we are freeing to the current cpu slab
+ * of this processor. This typically the case if we have just allocated
+ * the item before.
+ *
+ * If fastpath is not possible then fall back to __slab_free where we deal
+ * with all sorts of special processing.
+ */
+static void __always_inline slab_free(struct kmem_cache *s,
+ struct page *page, void *x, void *addr)
+{
+ void **object = (void *)x;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ if (likely(page == s->cpu_slab[smp_processor_id()] &&
+ !SlabDebug(page))) {
+ object[page->offset] = page->lockless_freelist;
+ page->lockless_freelist = object;
+ } else
+ __slab_free(s, page, x, addr);
+
+ local_irq_restore(flags);
+}
+
void kmem_cache_free(struct kmem_cache *s, void *x)
{
struct page *page;