[PATCH] NUMA slab locking fixes: fix cpu down and up locking
This fixes locking and bugs in cpu_down and cpu_up paths of the NUMA slab
allocator. Sonny Rao <sonny@burdell.org> reported problems sometime back on
POWER5 boxes, when the last cpu on the nodes were being offlined. We could
not reproduce the same on x86_64 because the cpumask (node_to_cpumask) was not
being updated on cpu down. Since that issue is now fixed, we can reproduce
Sonny's problems on x86_64 NUMA, and here is the fix.
The problem earlier was on CPU_DOWN, if it was the last cpu on the node to go
down, the array_caches (shared, alien) and the kmem_list3 of the node were
being freed (kfree) with the kmem_list3 lock held. If the l3 or the
array_caches were to come from the same cache being cleared, we hit on
badness.
This patch cleans up the locking in cpu_up and cpu_down path. We cannot
really free l3 on cpu down because, there is no node offlining yet and even
though a cpu is not yet up, node local memory can be allocated for it. So l3s
are usually allocated at keme_cache_create and destroyed at
kmem_cache_destroy. Hence, we don't need cachep->spinlock protection to get
to the cachep->nodelist[nodeid] either.
Patch survived onlining and offlining on a 4 core 2 node Tyan box with a 4
dbench process running all the time.
Signed-off-by: Alok N Kataria <alokk@calsoftinc.com>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/mm/slab.c b/mm/slab.c
index d3f6854..9cc049a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -884,14 +884,14 @@
}
}
-static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
+static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
{
int i = 0;
struct array_cache *ac;
unsigned long flags;
for_each_online_node(i) {
- ac = l3->alien[i];
+ ac = alien[i];
if (ac) {
spin_lock_irqsave(&ac->lock, flags);
__drain_alien_cache(cachep, ac, i);
@@ -901,8 +901,11 @@
}
#else
#define alloc_alien_cache(node, limit) do { } while (0)
-#define free_alien_cache(ac_ptr) do { } while (0)
-#define drain_alien_cache(cachep, l3) do { } while (0)
+#define drain_alien_cache(cachep, alien) do { } while (0)
+
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+}
#endif
static int __devinit cpuup_callback(struct notifier_block *nfb,
@@ -936,6 +939,11 @@
l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ /*
+ * The l3s don't come and go as CPUs come and
+ * go. cache_chain_mutex is sufficient
+ * protection here.
+ */
cachep->nodelists[node] = l3;
}
@@ -950,26 +958,47 @@
& array cache's */
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
+ struct array_cache *shared;
+ struct array_cache **alien;
nc = alloc_arraycache(node, cachep->limit,
- cachep->batchcount);
+ cachep->batchcount);
if (!nc)
goto bad;
+ shared = alloc_arraycache(node,
+ cachep->shared * cachep->batchcount,
+ 0xbaadf00d);
+ if (!shared)
+ goto bad;
+#ifdef CONFIG_NUMA
+ alien = alloc_alien_cache(node, cachep->limit);
+ if (!alien)
+ goto bad;
+#endif
cachep->array[cpu] = nc;
l3 = cachep->nodelists[node];
BUG_ON(!l3);
- if (!l3->shared) {
- if (!(nc = alloc_arraycache(node,
- cachep->shared *
- cachep->batchcount,
- 0xbaadf00d)))
- goto bad;
- /* we are serialised from CPU_DEAD or
- CPU_UP_CANCELLED by the cpucontrol lock */
- l3->shared = nc;
+ spin_lock_irq(&l3->list_lock);
+ if (!l3->shared) {
+ /*
+ * We are serialised from CPU_DEAD or
+ * CPU_UP_CANCELLED by the cpucontrol lock
+ */
+ l3->shared = shared;
+ shared = NULL;
}
+#ifdef CONFIG_NUMA
+ if (!l3->alien) {
+ l3->alien = alien;
+ alien = NULL;
+ }
+#endif
+ spin_unlock_irq(&l3->list_lock);
+
+ kfree(shared);
+ free_alien_cache(alien);
}
mutex_unlock(&cache_chain_mutex);
break;
@@ -978,23 +1007,32 @@
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
+ /*
+ * Even if all the cpus of a node are down, we don't free the
+ * kmem_list3 of any cache. This to avoid a race between
+ * cpu_down, and a kmalloc allocation from another cpu for
+ * memory from the node of the cpu going down. The list3
+ * structure is usually allocated from kmem_cache_create() and
+ * gets destroyed at kmem_cache_destroy().
+ */
/* fall thru */
case CPU_UP_CANCELED:
mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
+ struct array_cache *shared;
+ struct array_cache **alien;
cpumask_t mask;
mask = node_to_cpumask(node);
- spin_lock(&cachep->spinlock);
/* cpu is dead; no one can alloc from it. */
nc = cachep->array[cpu];
cachep->array[cpu] = NULL;
l3 = cachep->nodelists[node];
if (!l3)
- goto unlock_cache;
+ goto free_array_cache;
spin_lock_irq(&l3->list_lock);
@@ -1005,33 +1043,43 @@
if (!cpus_empty(mask)) {
spin_unlock_irq(&l3->list_lock);
- goto unlock_cache;
+ goto free_array_cache;
}
- if (l3->shared) {
+ shared = l3->shared;
+ if (shared) {
free_block(cachep, l3->shared->entry,
l3->shared->avail, node);
- kfree(l3->shared);
l3->shared = NULL;
}
- if (l3->alien) {
- drain_alien_cache(cachep, l3);
- free_alien_cache(l3->alien);
- l3->alien = NULL;
- }
- /* free slabs belonging to this node */
- if (__node_shrink(cachep, node)) {
- cachep->nodelists[node] = NULL;
- spin_unlock_irq(&l3->list_lock);
- kfree(l3);
- } else {
- spin_unlock_irq(&l3->list_lock);
+ alien = l3->alien;
+ l3->alien = NULL;
+
+ spin_unlock_irq(&l3->list_lock);
+
+ kfree(shared);
+ if (alien) {
+ drain_alien_cache(cachep, alien);
+ free_alien_cache(alien);
}
- unlock_cache:
- spin_unlock(&cachep->spinlock);
+free_array_cache:
kfree(nc);
}
+ /*
+ * In the previous loop, all the objects were freed to
+ * the respective cache's slabs, now we can go ahead and
+ * shrink each nodelist to its limit.
+ */
+ list_for_each_entry(cachep, &cache_chain, next) {
+ l3 = cachep->nodelists[node];
+ if (!l3)
+ continue;
+ spin_lock_irq(&l3->list_lock);
+ /* free slabs belonging to this node */
+ __node_shrink(cachep, node);
+ spin_unlock_irq(&l3->list_lock);
+ }
mutex_unlock(&cache_chain_mutex);
break;
#endif
@@ -2011,7 +2059,6 @@
smp_call_function_all_cpus(do_drain, cachep);
check_irq_on();
- spin_lock(&cachep->spinlock);
for_each_online_node(node) {
l3 = cachep->nodelists[node];
if (l3) {
@@ -2019,10 +2066,9 @@
drain_array_locked(cachep, l3->shared, 1, node);
spin_unlock_irq(&l3->list_lock);
if (l3->alien)
- drain_alien_cache(cachep, l3);
+ drain_alien_cache(cachep, l3->alien);
}
}
- spin_unlock(&cachep->spinlock);
}
static int __node_shrink(struct kmem_cache *cachep, int node)
@@ -3440,7 +3486,7 @@
l3 = searchp->nodelists[numa_node_id()];
if (l3->alien)
- drain_alien_cache(searchp, l3);
+ drain_alien_cache(searchp, l3->alien);
spin_lock_irq(&l3->list_lock);
drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3598,7 +3644,8 @@
num_slabs++;
}
free_objects += l3->free_objects;
- shared_avail += l3->shared->avail;
+ if (l3->shared)
+ shared_avail += l3->shared->avail;
spin_unlock_irq(&l3->list_lock);
}