Blame - kernel/cgroup_pids.c - SHIFTPHONES/mainline/linux

blob: d75488824ae222f5aa55d13c963333bc7aa4e4f8 [file] [log] [blame]

Aleksa Sarai	49b786e	2015-06-09 21:32:10 +1000	[diff] [blame^]	1	/*
				2	* Process number limiting controller for cgroups.
				3	*
				4	* Used to allow a cgroup hierarchy to stop any new processes from fork()ing
				5	* after a certain limit is reached.
				6	*
				7	* Since it is trivial to hit the task limit without hitting any kmemcg limits
				8	* in place, PIDs are a fundamental resource. As such, PID exhaustion must be
				9	* preventable in the scope of a cgroup hierarchy by allowing resource limiting
				10	* of the number of tasks in a cgroup.
				11	*
				12	* In order to use the `pids` controller, set the maximum number of tasks in
				13	* pids.max (this is not available in the root cgroup for obvious reasons). The
				14	* number of processes currently in the cgroup is given by pids.current.
				15	* Organisational operations are not blocked by cgroup policies, so it is
				16	* possible to have pids.current > pids.max. However, it is not possible to
				17	* violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
				18	* would cause a cgroup policy to be violated.
				19	*
				20	* To set a cgroup to have no limit, set pids.max to "max". This is the default
				21	* for all new cgroups (N.B. that PID limits are hierarchical, so the most
				22	* stringent limit in the hierarchy is followed).
				23	*
				24	* pids.current tracks all child cgroup hierarchies, so parent/pids.current is
				25	* a superset of parent/child/pids.current.
				26	*
				27	* Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
				28	*
				29	* This file is subject to the terms and conditions of version 2 of the GNU
				30	* General Public License. See the file COPYING in the main directory of the
				31	* Linux distribution for more details.
				32	*/
				33
				34	#include <linux/kernel.h>
				35	#include <linux/threads.h>
				36	#include <linux/atomic.h>
				37	#include <linux/cgroup.h>
				38	#include <linux/slab.h>
				39
				40	#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
				41	#define PIDS_MAX_STR "max"
				42
				43	struct pids_cgroup {
				44	struct cgroup_subsys_state css;
				45
				46	/*
				47	* Use 64-bit types so that we can safely represent "max" as
				48	* %PIDS_MAX = (%PID_MAX_LIMIT + 1).
				49	*/
				50	atomic64_t counter;
				51	int64_t limit;
				52	};
				53
				54	static struct pids_cgroup css_pids(struct cgroup_subsys_state css)
				55	{
				56	return container_of(css, struct pids_cgroup, css);
				57	}
				58
				59	static struct pids_cgroup parent_pids(struct pids_cgroup pids)
				60	{
				61	return css_pids(pids->css.parent);
				62	}
				63
				64	static struct cgroup_subsys_state *
				65	pids_css_alloc(struct cgroup_subsys_state *parent)
				66	{
				67	struct pids_cgroup *pids;
				68
				69	pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
				70	if (!pids)
				71	return ERR_PTR(-ENOMEM);
				72
				73	pids->limit = PIDS_MAX;
				74	atomic64_set(&pids->counter, 0);
				75	return &pids->css;
				76	}
				77
				/**
				 * pids_css_free - free the pids controller state that embeds @css
				 * @css: css embedded in the struct pids_cgroup to release
				 */
				static void pids_css_free(struct cgroup_subsys_state *css)
				{
					struct pids_cgroup *doomed = css_pids(css);

					kfree(doomed);
				}
				82
				83	/**
				84	* pids_cancel - uncharge the local pid count
				85	* @pids: the pid cgroup state
				86	* @num: the number of pids to cancel
				87	*
				88	* This function will WARN if the pid count goes under 0, because such a case is
				89	* a bug in the pids controller proper.
				90	*/
				91	static void pids_cancel(struct pids_cgroup *pids, int num)
				92	{
				93	/*
				94	* A negative count (or overflow for that matter) is invalid,
				95	* and indicates a bug in the `pids` controller proper.
				96	*/
				97	WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
				98	}
				99
				/**
				 * pids_uncharge - uncharge pids from a cgroup and all of its ancestors
				 * @pids: the pid cgroup state to start from
				 * @num: the number of pids to uncharge at every level
				 */
				static void pids_uncharge(struct pids_cgroup *pids, int num)
				{
					struct pids_cgroup *level = pids;

					/* Walk towards the root until parent_pids() yields NULL. */
					while (level) {
						pids_cancel(level, num);
						level = parent_pids(level);
					}
				}
				112
				113	/**
				114	* pids_charge - hierarchically charge the pid count
				115	* @pids: the pid cgroup state
				116	* @num: the number of pids to charge
				117	*
				118	* This function does not follow the pid limit set. It cannot fail and the new
				119	* pid count may exceed the limit. This is only used for reverting failed
				120	* attaches, where there is no other way out than violating the limit.
				121	*/
				122	static void pids_charge(struct pids_cgroup *pids, int num)
				123	{
				124	struct pids_cgroup *p;
				125
				126	for (p = pids; p; p = parent_pids(p))
				127	atomic64_add(num, &p->counter);
				128	}
				129
				130	/**
				131	* pids_try_charge - hierarchically try to charge the pid count
				132	* @pids: the pid cgroup state
				133	* @num: the number of pids to charge
				134	*
				135	* This function follows the set limit. It will fail if the charge would cause
				136	* the new value to exceed the hierarchical limit. Returns 0 if the charge
				137	* succeded, otherwise -EAGAIN.
				138	*/
				139	static int pids_try_charge(struct pids_cgroup *pids, int num)
				140	{
				141	struct pids_cgroup p, q;
				142
				143	for (p = pids; p; p = parent_pids(p)) {
				144	int64_t new = atomic64_add_return(num, &p->counter);
				145
				146	/*
				147	* Since new is capped to the maximum number of pid_t, if
				148	* p->limit is %PIDS_MAX then we know that this test will never
				149	* fail.
				150	*/
				151	if (new > p->limit)
				152	goto revert;
				153	}
				154
				155	return 0;
				156
				157	revert:
				158	for (q = pids; q != p; q = parent_pids(q))
				159	pids_cancel(q, num);
				160	pids_cancel(p, num);
				161
				162	return -EAGAIN;
				163	}
				164
				165	static int pids_can_attach(struct cgroup_subsys_state *css,
				166	struct cgroup_taskset *tset)
				167	{
				168	struct pids_cgroup *pids = css_pids(css);
				169	struct task_struct *task;
				170
				171	cgroup_taskset_for_each(task, tset) {
				172	struct cgroup_subsys_state *old_css;
				173	struct pids_cgroup *old_pids;
				174
				175	/*
				176	* Grab a ref to each task's css. We don't drop the ref until
				177	* we either fail and hit ->cancel_attach() or succeed and hit
				178	* ->attach().
				179	*/
				180	old_css = task_get_css(task, pids_cgrp_id);
				181	old_pids = css_pids(old_css);
				182
				183	pids_charge(pids, 1);
				184	pids_uncharge(old_pids, 1);
				185	}
				186
				187	return 0;
				188	}
				189
				190	static void pids_cancel_attach(struct cgroup_subsys_state *css,
				191	struct cgroup_taskset *tset)
				192	{
				193	struct pids_cgroup *pids = css_pids(css);
				194	struct task_struct *task;
				195
				196	cgroup_taskset_for_each(task, tset) {
				197	struct cgroup_subsys_state *old_css;
				198	struct pids_cgroup *old_pids;
				199
				200	old_css = task_css(task, pids_cgrp_id);
				201	old_pids = css_pids(old_css);
				202
				203	pids_charge(old_pids, 1);
				204	pids_uncharge(pids, 1);
				205	css_put(old_css);
				206	}
				207	}
				208
				209	static void pids_attach(struct cgroup_subsys_state *css,
				210	struct cgroup_taskset *tset)
				211	{
				212	struct task_struct *task;
				213
				214	cgroup_taskset_for_each(task, tset)
				215	css_put(task_css(task, pids_cgrp_id));
				216	}
				217
				218	static int pids_can_fork(struct task_struct task, void *priv_p)
				219	{
				220	struct cgroup_subsys_state *css;
				221	struct pids_cgroup *pids;
				222	int err;
				223
				224	/*
				225	* Use the "current" task_css for the pids subsystem as the tentative
				226	* css. It is possible we will charge the wrong hierarchy, in which
				227	* case we will forcefully revert/reapply the charge on the right
				228	* hierarchy after it is committed to the task proper.
				229	*/
				230	css = task_get_css(current, pids_cgrp_id);
				231	pids = css_pids(css);
				232
				233	err = pids_try_charge(pids, 1);
				234	if (err)
				235	goto err_css_put;
				236
				237	*priv_p = css;
				238	return 0;
				239
				240	err_css_put:
				241	css_put(css);
				242	return err;
				243	}
				244
				/**
				 * pids_cancel_fork - undo the charge of an aborted fork
				 * @task: the task whose creation was aborted (not used by this function)
				 * @priv: the css to uncharge; expected to be the one pids_can_fork()
				 *        stored through its @priv_p argument
				 *
				 * Uncharges one pid from the hierarchy of @priv and drops one reference
				 * on that css.
				 */
				static void pids_cancel_fork(struct task_struct *task, void *priv)
				{
					struct cgroup_subsys_state *css = priv;
					struct pids_cgroup *pids = css_pids(css);

					pids_uncharge(pids, 1);
					css_put(css);
				}
				253
				254	static void pids_fork(struct task_struct task, void priv)
				255	{
				256	struct cgroup_subsys_state *css;
				257	struct cgroup_subsys_state *old_css = priv;
				258	struct pids_cgroup *pids;
				259	struct pids_cgroup *old_pids = css_pids(old_css);
				260
				261	css = task_get_css(task, pids_cgrp_id);
				262	pids = css_pids(css);
				263
				264	/*
				265	* If the association has changed, we have to revert and reapply the
				266	* charge/uncharge on the wrong hierarchy to the current one. Since
				267	* the association can only change due to an organisation event, its
				268	* okay for us to ignore the limit in this case.
				269	*/
				270	if (pids != old_pids) {
				271	pids_uncharge(old_pids, 1);
				272	pids_charge(pids, 1);
				273	}
				274
				275	css_put(css);
				276	css_put(old_css);
				277	}
				278
				/**
				 * pids_exit - release the pid charge of an exiting task
				 * @css: not used by this function
				 * @old_css: css whose hierarchy is uncharged by one pid
				 * @task: the exiting task (not used by this function)
				 */
				static void pids_exit(struct cgroup_subsys_state *css,
						      struct cgroup_subsys_state *old_css,
						      struct task_struct *task)
				{
					pids_uncharge(css_pids(old_css), 1);
				}
				287
				288	static ssize_t pids_max_write(struct kernfs_open_file of, char buf,
				289	size_t nbytes, loff_t off)
				290	{
				291	struct cgroup_subsys_state *css = of_css(of);
				292	struct pids_cgroup *pids = css_pids(css);
				293	int64_t limit;
				294	int err;
				295
				296	buf = strstrip(buf);
				297	if (!strcmp(buf, PIDS_MAX_STR)) {
				298	limit = PIDS_MAX;
				299	goto set_limit;
				300	}
				301
				302	err = kstrtoll(buf, 0, &limit);
				303	if (err)
				304	return err;
				305
				306	if (limit < 0 \|\| limit >= PIDS_MAX)
				307	return -EINVAL;
				308
				309	set_limit:
				310	/*
				311	* Limit updates don't need to be mutex'd, since it isn't
				312	* critical that any racing fork()s follow the new limit.
				313	*/
				314	pids->limit = limit;
				315	return nbytes;
				316	}
				317
				318	static int pids_max_show(struct seq_file sf, void v)
				319	{
				320	struct cgroup_subsys_state *css = seq_css(sf);
				321	struct pids_cgroup *pids = css_pids(css);
				322	int64_t limit = pids->limit;
				323
				324	if (limit >= PIDS_MAX)
				325	seq_printf(sf, "%s\n", PIDS_MAX_STR);
				326	else
				327	seq_printf(sf, "%lld\n", limit);
				328
				329	return 0;
				330	}
				331
				332	static s64 pids_current_read(struct cgroup_subsys_state *css,
				333	struct cftype *cft)
				334	{
				335	struct pids_cgroup *pids = css_pids(css);
				336
				337	return atomic64_read(&pids->counter);
				338	}
				339
				340	static struct cftype pids_files[] = {
				341	{
				342	.name = "max",
				343	.write = pids_max_write,
				344	.seq_show = pids_max_show,
				345	.flags = CFTYPE_NOT_ON_ROOT,
				346	},
				347	{
				348	.name = "current",
				349	.read_s64 = pids_current_read,
				350	},
				351	{ } /* terminate */
				352	};
				353
				354	struct cgroup_subsys pids_cgrp_subsys = {
				355	.css_alloc = pids_css_alloc,
				356	.css_free = pids_css_free,
				357	.attach = pids_attach,
				358	.can_attach = pids_can_attach,
				359	.cancel_attach = pids_cancel_attach,
				360	.can_fork = pids_can_fork,
				361	.cancel_fork = pids_cancel_fork,
				362	.fork = pids_fork,
				363	.exit = pids_exit,
				364	.legacy_cftypes = pids_files,
				365	.dfl_cftypes = pids_files,
				366	};