memcg: fix possible use-after-free in memcg_kmem_get_cache()
Suppose task @t that belongs to a memory cgroup @memcg is going to
allocate an object from a kmem cache @c. The copy of @c corresponding to
@memcg, @mc, is empty. Then if kmem_cache_alloc races with the memory
cgroup destruction we can access the memory cgroup's copy of the cache
after it was destroyed:
CPU0 CPU1
---- ----
[ current=@t
@mc->memcg_params->nr_pages=0 ]
kmem_cache_alloc(@c):
call memcg_kmem_get_cache(@c);
proceed to allocation from @mc:
alloc a page for @mc:
...
move @t from @memcg
destroy @memcg:
mem_cgroup_css_offline(@memcg):
memcg_unregister_all_caches(@memcg):
kmem_cache_destroy(@mc)
add page to @mc
We could fix this issue by taking a reference to a per-memcg cache, but
that would require adding a per-cpu reference counter to per-memcg caches,
which would look cumbersome.
Instead, let's take a reference to a memory cgroup, which already has a
per-cpu reference counter, in the beginning of kmem_cache_alloc to be
dropped in the end, and move per memcg caches destruction from css offline
to css free. As a side effect, per-memcg caches will be destroyed not one
by one, but all at once when the last page accounted to the memory cgroup
is freed. This doesn't sound as a high price for code readability though.
Note, this patch does add some overhead to the kmem_cache_alloc hot path,
but it is pretty negligible - it's just a function call plus a per cpu
counter decrement, which is comparable to what we already have in
memcg_kmem_get_cache. Besides, it's only relevant if there are memory
cgroups with kmem accounting enabled. I don't think we can find a way to
handle this race w/o it, because alloc_page called from kmem_cache_alloc
may sleep so we can't flush all pending kmallocs w/o reference counting.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dac81b9..05e1584 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2635,7 +2635,6 @@
if (!cachep)
return;
- css_get(&memcg->css);
list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
/*
@@ -2669,9 +2668,6 @@
list_del(&cachep->memcg_params->list);
kmem_cache_destroy(cachep);
-
- /* drop the reference taken in memcg_register_cache */
- css_put(&memcg->css);
}
int __memcg_cleanup_cache_params(struct kmem_cache *s)
@@ -2705,9 +2701,7 @@
mutex_lock(&memcg_slab_mutex);
list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
cachep = memcg_params_to_cache(params);
- kmem_cache_shrink(cachep);
- if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
- memcg_unregister_cache(cachep);
+ memcg_unregister_cache(cachep);
}
mutex_unlock(&memcg_slab_mutex);
}
@@ -2742,10 +2736,10 @@
struct memcg_register_cache_work *cw;
cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
- if (cw == NULL) {
- css_put(&memcg->css);
+ if (!cw)
return;
- }
+
+ css_get(&memcg->css);
cw->memcg = memcg;
cw->cachep = cachep;
@@ -2776,12 +2770,8 @@
int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
{
unsigned int nr_pages = 1 << order;
- int res;
- res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
- if (!res)
- atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
- return res;
+ return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
}
void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
@@ -2789,7 +2779,6 @@
unsigned int nr_pages = 1 << order;
memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
- atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
}
/*
@@ -2816,22 +2805,13 @@
if (current->memcg_kmem_skip_account)
return cachep;
- rcu_read_lock();
- memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
-
+ memcg = get_mem_cgroup_from_mm(current->mm);
if (!memcg_kmem_is_active(memcg))
goto out;
memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
- if (likely(memcg_cachep)) {
- cachep = memcg_cachep;
- goto out;
- }
-
- /* The corresponding put will be done in the workqueue. */
- if (!css_tryget_online(&memcg->css))
- goto out;
- rcu_read_unlock();
+ if (likely(memcg_cachep))
+ return memcg_cachep;
/*
* If we are in a safe context (can wait, and not in interrupt
@@ -2846,12 +2826,17 @@
* defer everything.
*/
memcg_schedule_register_cache(memcg, cachep);
- return cachep;
out:
- rcu_read_unlock();
+ css_put(&memcg->css);
return cachep;
}
+void __memcg_kmem_put_cache(struct kmem_cache *cachep)
+{
+ if (!is_root_cache(cachep))
+ css_put(&cachep->memcg_params->memcg->css);
+}
+
/*
* We need to verify if the allocation against current->mm->owner's memcg is
* possible for the given order. But the page is not allocated yet, so we'll
@@ -2914,10 +2899,6 @@
memcg_uncharge_kmem(memcg, 1 << order);
page->mem_cgroup = NULL;
}
-#else
-static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
-{
-}
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -4188,6 +4169,7 @@
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
+ memcg_unregister_all_caches(memcg);
mem_cgroup_sockets_destroy(memcg);
}
#else
@@ -4797,7 +4779,6 @@
}
spin_unlock(&memcg->event_list_lock);
- memcg_unregister_all_caches(memcg);
vmpressure_cleanup(&memcg->vmpressure);
}