Blame - kernel/cgroup/cgroup-v1.c - SHIFTPHONES/mainline/linux

blob: c126b34fd4ff583af524f52bf973b9734acf9b9e [file] [log] [blame]

Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	1	#include "cgroup-internal.h"
				2
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	3	#include <linux/ctype.h>
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	4	#include <linux/kmod.h>
				5	#include <linux/sort.h>
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	6	#include <linux/delay.h>
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	7	#include <linux/mm.h>
Ingo Molnar	c3edc40	2017-02-02 08:35:14 +0100	[diff] [blame]	8	#include <linux/sched/signal.h>
Ingo Molnar	56cd697	2017-02-06 10:57:33 +0100	[diff] [blame]	9	#include <linux/sched/task.h>
Ingo Molnar	50ff9d1	2017-02-05 16:03:58 +0100	[diff] [blame]	10	#include <linux/magic.h>
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	11	#include <linux/slab.h>
				12	#include <linux/vmalloc.h>
				13	#include <linux/delayacct.h>
				14	#include <linux/pid_namespace.h>
				15	#include <linux/cgroupstats.h>
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	16	#include <linux/fs_parser.h>
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	17
				18	#include <trace/events/cgroup.h>
				19
David Howells	06a2ae5	2018-11-01 23:07:26 +0000	[diff] [blame]	20	#define cg_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__)
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	21
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	22	/*
				23	* pidlists linger the following amount before being destroyed. The goal
				24	* is avoiding frequent destruction in the middle of consecutive read calls.
				25	* Expiring in the middle is a performance problem not a correctness one.
				26	* 1 sec should be enough.
				27	*/
				28	#define CGROUP_PIDLIST_DESTROY_DELAY HZ
				29
				30	/* Controllers blocked by the commandline in v1 */
				31	static u16 cgroup_no_v1_mask;
				32
Tejun Heo	3fc9c12	2018-12-28 10:31:07 -0800	[diff] [blame]	33	/* disable named v1 mounts */
				34	static bool cgroup_no_v1_named;
				35
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	36	/*
				37	* pidlist destructions need to be flushed on cgroup destruction. Use a
				38	* separate workqueue as flush domain.
				39	*/
				40	static struct workqueue_struct *cgroup_pidlist_destroy_wq;
				41
				42	/*
				43	* Protects cgroup_subsys->release_agent_path. Modifying it also requires
				44	* cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
				45	*/
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	46	static DEFINE_SPINLOCK(release_agent_path_lock);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	47
Tejun Heo	d62beb7	2016-12-27 14:49:08 -0500	[diff] [blame]	48	bool cgroup1_ssid_disabled(int ssid)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	49	{
				50	return cgroup_no_v1_mask & (1 << ssid);
				51	}
				52
				53	/**
				54	* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
				55	* @from: attach to all cgroups of a given task
				56	* @tsk: the task to be attached
				57	*/
				58	int cgroup_attach_task_all(struct task_struct from, struct task_struct tsk)
				59	{
				60	struct cgroup_root *root;
				61	int retval = 0;
				62
				63	mutex_lock(&cgroup_mutex);
				64	percpu_down_write(&cgroup_threadgroup_rwsem);
				65	for_each_root(root) {
				66	struct cgroup *from_cgrp;
				67
				68	if (root == &cgrp_dfl_root)
				69	continue;
				70
				71	spin_lock_irq(&css_set_lock);
				72	from_cgrp = task_cgroup_from_root(from, root);
				73	spin_unlock_irq(&css_set_lock);
				74
				75	retval = cgroup_attach_task(from_cgrp, tsk, false);
				76	if (retval)
				77	break;
				78	}
				79	percpu_up_write(&cgroup_threadgroup_rwsem);
				80	mutex_unlock(&cgroup_mutex);
				81
				82	return retval;
				83	}
				84	EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
				85
				86	/**
				87	* cgroup_trasnsfer_tasks - move tasks from one cgroup to another
				88	* @to: cgroup to which the tasks will be moved
				89	* @from: cgroup in which the tasks currently reside
				90	*
				91	* Locking rules between cgroup_post_fork() and the migration path
				92	* guarantee that, if a task is forking while being migrated, the new child
				93	* is guaranteed to be either visible in the source cgroup after the
				94	* parent's migration is complete or put into the target cgroup. No task
				95	* can slip out of migration through forking.
				96	*/
				97	int cgroup_transfer_tasks(struct cgroup to, struct cgroup from)
				98	{
Tejun Heo	e595cd7	2017-01-15 19:03:41 -0500	[diff] [blame]	99	DEFINE_CGROUP_MGCTX(mgctx);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	100	struct cgrp_cset_link *link;
				101	struct css_task_iter it;
				102	struct task_struct *task;
				103	int ret;
				104
				105	if (cgroup_on_dfl(to))
				106	return -EINVAL;
				107
Tejun Heo	8cfd814	2017-07-21 11:14:51 -0400	[diff] [blame]	108	ret = cgroup_migrate_vet_dst(to);
				109	if (ret)
				110	return ret;
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	111
				112	mutex_lock(&cgroup_mutex);
				113
				114	percpu_down_write(&cgroup_threadgroup_rwsem);
				115
				116	/* all tasks in @from are being moved, all csets are source */
				117	spin_lock_irq(&css_set_lock);
				118	list_for_each_entry(link, &from->cset_links, cset_link)
Tejun Heo	e595cd7	2017-01-15 19:03:41 -0500	[diff] [blame]	119	cgroup_migrate_add_src(link->cset, to, &mgctx);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	120	spin_unlock_irq(&css_set_lock);
				121
Tejun Heo	e595cd7	2017-01-15 19:03:41 -0500	[diff] [blame]	122	ret = cgroup_migrate_prepare_dst(&mgctx);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	123	if (ret)
				124	goto out_err;
				125
				126	/*
				127	* Migrate tasks one-by-one until @from is empty. This fails iff
				128	* ->can_attach() fails.
				129	*/
				130	do {
Tejun Heo	bc2fb7e	2017-05-15 09:34:01 -0400	[diff] [blame]	131	css_task_iter_start(&from->self, 0, &it);
Prateek Sood	116d2f7	2017-12-19 12:56:57 +0530	[diff] [blame]	132
				133	do {
				134	task = css_task_iter_next(&it);
				135	} while (task && (task->flags & PF_EXITING));
				136
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	137	if (task)
				138	get_task_struct(task);
				139	css_task_iter_end(&it);
				140
				141	if (task) {
Tejun Heo	bfc2cf6	2017-01-15 19:03:41 -0500	[diff] [blame]	142	ret = cgroup_migrate(task, false, &mgctx);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	143	if (!ret)
Steven Rostedt (VMware)	e4f8d81	2018-07-09 17:48:54 -0400	[diff] [blame]	144	TRACE_CGROUP_PATH(transfer_tasks, to, task, false);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	145	put_task_struct(task);
				146	}
				147	} while (task && !ret);
				148	out_err:
Tejun Heo	e595cd7	2017-01-15 19:03:41 -0500	[diff] [blame]	149	cgroup_migrate_finish(&mgctx);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	150	percpu_up_write(&cgroup_threadgroup_rwsem);
				151	mutex_unlock(&cgroup_mutex);
				152	return ret;
				153	}
				154
				155	/*
				156	* Stuff for reading the 'tasks'/'procs' files.
				157	*
				158	* Reading this file can return large amounts of data if a cgroup has
				159	* lots of attached tasks. So it may need several calls to read(),
				160	* but we cannot guarantee that the information we produce is correct
				161	* unless we produce it entirely atomically.
				162	*
				163	*/
				164
				165	/* which pidlist file are we talking about? */
				166	enum cgroup_filetype {
				167	CGROUP_FILE_PROCS,
				168	CGROUP_FILE_TASKS,
				169	};
				170
				171	/*
				172	* A pidlist is a list of pids that virtually represents the contents of one
				173	* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
				174	* a pair (one each for procs, tasks) for each pid namespace that's relevant
				175	* to the cgroup.
				176	*/
				177	struct cgroup_pidlist {
				178	/*
				179	* used to find which pidlist is wanted. doesn't change as long as
				180	* this particular list stays in the list.
				181	*/
				182	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
				183	/* array of xids */
				184	pid_t *list;
				185	/* how many elements the above list has */
				186	int length;
				187	/* each of these stored in a list by its cgroup */
				188	struct list_head links;
				189	/* pointer to the cgroup we belong to, for list removal purposes */
				190	struct cgroup *owner;
				191	/* for delayed destruction */
				192	struct delayed_work destroy_dwork;
				193	};
				194
				195	/*
				196	* The following two functions "fix" the issue where there are more pids
				197	* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
				198	* TODO: replace with a kernel-wide solution to this problem
				199	*/
				200	#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
				201	static void *pidlist_allocate(int count)
				202	{
				203	if (PIDLIST_TOO_LARGE(count))
Kees Cook	42bc47b	2018-06-12 14:27:11 -0700	[diff] [blame]	204	return vmalloc(array_size(count, sizeof(pid_t)));
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	205	else
Kees Cook	6da2ec5	2018-06-12 13:55:00 -0700	[diff] [blame]	206	return kmalloc_array(count, sizeof(pid_t), GFP_KERNEL);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	207	}
				208
				209	static void pidlist_free(void *p)
				210	{
				211	kvfree(p);
				212	}
				213
				214	/*
				215	* Used to destroy all pidlists lingering waiting for destroy timer. None
				216	* should be left afterwards.
				217	*/
Tejun Heo	d62beb7	2016-12-27 14:49:08 -0500	[diff] [blame]	218	void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	219	{
				220	struct cgroup_pidlist l, tmp_l;
				221
				222	mutex_lock(&cgrp->pidlist_mutex);
				223	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
				224	mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
				225	mutex_unlock(&cgrp->pidlist_mutex);
				226
				227	flush_workqueue(cgroup_pidlist_destroy_wq);
				228	BUG_ON(!list_empty(&cgrp->pidlists));
				229	}
				230
				231	static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
				232	{
				233	struct delayed_work *dwork = to_delayed_work(work);
				234	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
				235	destroy_dwork);
				236	struct cgroup_pidlist *tofree = NULL;
				237
				238	mutex_lock(&l->owner->pidlist_mutex);
				239
				240	/*
				241	* Destroy iff we didn't get queued again. The state won't change
				242	* as destroy_dwork can only be queued while locked.
				243	*/
				244	if (!delayed_work_pending(dwork)) {
				245	list_del(&l->links);
				246	pidlist_free(l->list);
				247	put_pid_ns(l->key.ns);
				248	tofree = l;
				249	}
				250
				251	mutex_unlock(&l->owner->pidlist_mutex);
				252	kfree(tofree);
				253	}
				254
				255	/*
				256	* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
				257	* Returns the number of unique elements.
				258	*/
				259	static int pidlist_uniq(pid_t *list, int length)
				260	{
				261	int src, dest = 1;
				262
				263	/*
				264	* we presume the 0th element is unique, so i starts at 1. trivial
				265	* edge cases first; no work needs to be done for either
				266	*/
				267	if (length == 0 \|\| length == 1)
				268	return length;
				269	/* src and dest walk down the list; dest counts unique elements */
				270	for (src = 1; src < length; src++) {
				271	/* find next unique element */
				272	while (list[src] == list[src-1]) {
				273	src++;
				274	if (src == length)
				275	goto after;
				276	}
				277	/* dest always points to where the next unique element goes */
				278	list[dest] = list[src];
				279	dest++;
				280	}
				281	after:
				282	return dest;
				283	}
				284
				285	/*
				286	* The two pid files - task and cgroup.procs - guaranteed that the result
				287	* is sorted, which forced this whole pidlist fiasco. As pid order is
				288	* different per namespace, each namespace needs differently sorted list,
				289	* making it impossible to use, for example, single rbtree of member tasks
				290	* sorted by task pointer. As pidlists can be fairly large, allocating one
				291	* per open file is dangerous, so cgroup had to implement shared pool of
				292	* pidlists keyed by cgroup and namespace.
				293	*/
				294	static int cmppid(const void a, const void b)
				295	{
				296	return (pid_t )a - (pid_t )b;
				297	}
				298
				299	static struct cgroup_pidlist cgroup_pidlist_find(struct cgroup cgrp,
				300	enum cgroup_filetype type)
				301	{
				302	struct cgroup_pidlist *l;
				303	/* don't need task_nsproxy() if we're looking at ourself */
				304	struct pid_namespace *ns = task_active_pid_ns(current);
				305
				306	lockdep_assert_held(&cgrp->pidlist_mutex);
				307
				308	list_for_each_entry(l, &cgrp->pidlists, links)
				309	if (l->key.type == type && l->key.ns == ns)
				310	return l;
				311	return NULL;
				312	}
				313
				314	/*
				315	* find the appropriate pidlist for our purpose (given procs vs tasks)
				316	* returns with the lock on that pidlist already held, and takes care
				317	* of the use count, or returns NULL with no locks held if we're out of
				318	* memory.
				319	*/
				320	static struct cgroup_pidlist cgroup_pidlist_find_create(struct cgroup cgrp,
				321	enum cgroup_filetype type)
				322	{
				323	struct cgroup_pidlist *l;
				324
				325	lockdep_assert_held(&cgrp->pidlist_mutex);
				326
				327	l = cgroup_pidlist_find(cgrp, type);
				328	if (l)
				329	return l;
				330
				331	/* entry not found; create a new one */
				332	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
				333	if (!l)
				334	return l;
				335
				336	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
				337	l->key.type = type;
				338	/* don't need task_nsproxy() if we're looking at ourself */
				339	l->key.ns = get_pid_ns(task_active_pid_ns(current));
				340	l->owner = cgrp;
				341	list_add(&l->links, &cgrp->pidlists);
				342	return l;
				343	}
				344
				345	/**
				346	* cgroup_task_count - count the number of tasks in a cgroup.
				347	* @cgrp: the cgroup in question
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	348	*/
Waiman Long	a28f8f5	2017-06-13 17:18:02 -0400	[diff] [blame]	349	int cgroup_task_count(const struct cgroup *cgrp)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	350	{
				351	int count = 0;
				352	struct cgrp_cset_link *link;
				353
				354	spin_lock_irq(&css_set_lock);
				355	list_for_each_entry(link, &cgrp->cset_links, cset_link)
Waiman Long	73a7242	2017-06-13 17:18:01 -0400	[diff] [blame]	356	count += link->cset->nr_tasks;
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	357	spin_unlock_irq(&css_set_lock);
				358	return count;
				359	}
				360
				361	/*
				362	* Load a cgroup's pidarray with either procs' tgids or tasks' pids
				363	*/
				364	static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
				365	struct cgroup_pidlist **lp)
				366	{
				367	pid_t *array;
				368	int length;
				369	int pid, n = 0; /* used for populating the array */
				370	struct css_task_iter it;
				371	struct task_struct *tsk;
				372	struct cgroup_pidlist *l;
				373
				374	lockdep_assert_held(&cgrp->pidlist_mutex);
				375
				376	/*
				377	* If cgroup gets more users after we read count, we won't have
				378	* enough space - tough. This race is indistinguishable to the
				379	* caller from the case that the additional cgroup users didn't
				380	* show up until sometime later on.
				381	*/
				382	length = cgroup_task_count(cgrp);
				383	array = pidlist_allocate(length);
				384	if (!array)
				385	return -ENOMEM;
				386	/* now, populate the array */
Tejun Heo	bc2fb7e	2017-05-15 09:34:01 -0400	[diff] [blame]	387	css_task_iter_start(&cgrp->self, 0, &it);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	388	while ((tsk = css_task_iter_next(&it))) {
				389	if (unlikely(n == length))
				390	break;
				391	/* get tgid or pid for procs or tasks file respectively */
				392	if (type == CGROUP_FILE_PROCS)
				393	pid = task_tgid_vnr(tsk);
				394	else
				395	pid = task_pid_vnr(tsk);
				396	if (pid > 0) /* make sure to only use valid results */
				397	array[n++] = pid;
				398	}
				399	css_task_iter_end(&it);
				400	length = n;
				401	/* now sort & (if procs) strip out duplicates */
				402	sort(array, length, sizeof(pid_t), cmppid, NULL);
				403	if (type == CGROUP_FILE_PROCS)
				404	length = pidlist_uniq(array, length);
				405
				406	l = cgroup_pidlist_find_create(cgrp, type);
				407	if (!l) {
				408	pidlist_free(array);
				409	return -ENOMEM;
				410	}
				411
				412	/* store array, freeing old if necessary */
				413	pidlist_free(l->list);
				414	l->list = array;
				415	l->length = length;
				416	*lp = l;
				417	return 0;
				418	}
				419
				420	/*
				421	* seq_file methods for the tasks/procs files. The seq_file position is the
				422	* next pid to display; the seq_file iterator is a pointer to the pid
				423	* in the cgroup->l->list array.
				424	*/
				425
				426	static void cgroup_pidlist_start(struct seq_file s, loff_t *pos)
				427	{
				428	/*
				429	* Initially we receive a position value that corresponds to
				430	* one more than the last pid shown (or 0 on the first call or
				431	* after a seek to the start). Use a binary-search to find the
				432	* next pid to display, if any
				433	*/
				434	struct kernfs_open_file *of = s->private;
				435	struct cgroup *cgrp = seq_css(s)->cgroup;
				436	struct cgroup_pidlist *l;
				437	enum cgroup_filetype type = seq_cft(s)->private;
				438	int index = 0, pid = *pos;
				439	int *iter, ret;
				440
				441	mutex_lock(&cgrp->pidlist_mutex);
				442
				443	/*
				444	* !NULL @of->priv indicates that this isn't the first start()
				445	* after open. If the matching pidlist is around, we can use that.
				446	* Look for it. Note that @of->priv can't be used directly. It
				447	* could already have been destroyed.
				448	*/
				449	if (of->priv)
				450	of->priv = cgroup_pidlist_find(cgrp, type);
				451
				452	/*
				453	* Either this is the first start() after open or the matching
				454	* pidlist has been destroyed inbetween. Create a new one.
				455	*/
				456	if (!of->priv) {
				457	ret = pidlist_array_load(cgrp, type,
				458	(struct cgroup_pidlist **)&of->priv);
				459	if (ret)
				460	return ERR_PTR(ret);
				461	}
				462	l = of->priv;
				463
				464	if (pid) {
				465	int end = l->length;
				466
				467	while (index < end) {
				468	int mid = (index + end) / 2;
				469	if (l->list[mid] == pid) {
				470	index = mid;
				471	break;
				472	} else if (l->list[mid] <= pid)
				473	index = mid + 1;
				474	else
				475	end = mid;
				476	}
				477	}
				478	/* If we're off the end of the array, we're done */
				479	if (index >= l->length)
				480	return NULL;
				481	/* Update the abstract position to be the actual pid that we found */
				482	iter = l->list + index;
				483	pos = iter;
				484	return iter;
				485	}
				486
				487	static void cgroup_pidlist_stop(struct seq_file s, void v)
				488	{
				489	struct kernfs_open_file *of = s->private;
				490	struct cgroup_pidlist *l = of->priv;
				491
				492	if (l)
				493	mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
				494	CGROUP_PIDLIST_DESTROY_DELAY);
				495	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
				496	}
				497
				498	static void cgroup_pidlist_next(struct seq_file s, void v, loff_t pos)
				499	{
				500	struct kernfs_open_file *of = s->private;
				501	struct cgroup_pidlist *l = of->priv;
				502	pid_t *p = v;
				503	pid_t *end = l->list + l->length;
				504	/*
				505	* Advance to the next pid in the array. If this goes off the
				506	* end, we're done
				507	*/
				508	p++;
				509	if (p >= end) {
				510	return NULL;
				511	} else {
				512	pos = p;
				513	return p;
				514	}
				515	}
				516
				517	static int cgroup_pidlist_show(struct seq_file s, void v)
				518	{
				519	seq_printf(s, "%d\n", (int )v);
				520
				521	return 0;
				522	}
				523
Tejun Heo	715c809	2017-05-15 09:34:00 -0400	[diff] [blame]	524	static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
				525	char *buf, size_t nbytes, loff_t off,
				526	bool threadgroup)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	527	{
Tejun Heo	715c809	2017-05-15 09:34:00 -0400	[diff] [blame]	528	struct cgroup *cgrp;
				529	struct task_struct *task;
				530	const struct cred cred, tcred;
				531	ssize_t ret;
				532
				533	cgrp = cgroup_kn_lock_live(of->kn, false);
				534	if (!cgrp)
				535	return -ENODEV;
				536
				537	task = cgroup_procs_write_start(buf, threadgroup);
				538	ret = PTR_ERR_OR_ZERO(task);
				539	if (ret)
				540	goto out_unlock;
				541
				542	/*
				543	* Even if we're attaching all tasks in the thread group, we only
				544	* need to check permissions on one of them.
				545	*/
				546	cred = current_cred();
				547	tcred = get_task_cred(task);
				548	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
				549	!uid_eq(cred->euid, tcred->uid) &&
				550	!uid_eq(cred->euid, tcred->suid))
				551	ret = -EACCES;
				552	put_cred(tcred);
				553	if (ret)
				554	goto out_finish;
				555
				556	ret = cgroup_attach_task(cgrp, task, threadgroup);
				557
				558	out_finish:
				559	cgroup_procs_write_finish(task);
				560	out_unlock:
				561	cgroup_kn_unlock(of->kn);
				562
				563	return ret ?: nbytes;
				564	}
				565
				566	static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
				567	char *buf, size_t nbytes, loff_t off)
				568	{
				569	return __cgroup1_procs_write(of, buf, nbytes, off, true);
				570	}
				571
				572	static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
				573	char *buf, size_t nbytes, loff_t off)
				574	{
				575	return __cgroup1_procs_write(of, buf, nbytes, off, false);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	576	}
				577
				578	static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
				579	char *buf, size_t nbytes, loff_t off)
				580	{
				581	struct cgroup *cgrp;
				582
				583	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
				584
				585	cgrp = cgroup_kn_lock_live(of->kn, false);
				586	if (!cgrp)
				587	return -ENODEV;
				588	spin_lock(&release_agent_path_lock);
				589	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
				590	sizeof(cgrp->root->release_agent_path));
				591	spin_unlock(&release_agent_path_lock);
				592	cgroup_kn_unlock(of->kn);
				593	return nbytes;
				594	}
				595
				596	static int cgroup_release_agent_show(struct seq_file seq, void v)
				597	{
				598	struct cgroup *cgrp = seq_css(seq)->cgroup;
				599
				600	spin_lock(&release_agent_path_lock);
				601	seq_puts(seq, cgrp->root->release_agent_path);
				602	spin_unlock(&release_agent_path_lock);
				603	seq_putc(seq, '\n');
				604	return 0;
				605	}
				606
				607	static int cgroup_sane_behavior_show(struct seq_file seq, void v)
				608	{
				609	seq_puts(seq, "0\n");
				610	return 0;
				611	}
				612
				613	static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
				614	struct cftype *cft)
				615	{
				616	return notify_on_release(css->cgroup);
				617	}
				618
				619	static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
				620	struct cftype *cft, u64 val)
				621	{
				622	if (val)
				623	set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
				624	else
				625	clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
				626	return 0;
				627	}
				628
				629	static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
				630	struct cftype *cft)
				631	{
				632	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
				633	}
				634
				635	static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
				636	struct cftype *cft, u64 val)
				637	{
				638	if (val)
				639	set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
				640	else
				641	clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
				642	return 0;
				643	}
				644
				645	/* cgroup core interface files for the legacy hierarchies */
Tejun Heo	d62beb7	2016-12-27 14:49:08 -0500	[diff] [blame]	646	struct cftype cgroup1_base_files[] = {
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	647	{
				648	.name = "cgroup.procs",
				649	.seq_start = cgroup_pidlist_start,
				650	.seq_next = cgroup_pidlist_next,
				651	.seq_stop = cgroup_pidlist_stop,
				652	.seq_show = cgroup_pidlist_show,
				653	.private = CGROUP_FILE_PROCS,
Tejun Heo	715c809	2017-05-15 09:34:00 -0400	[diff] [blame]	654	.write = cgroup1_procs_write,
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	655	},
				656	{
				657	.name = "cgroup.clone_children",
				658	.read_u64 = cgroup_clone_children_read,
				659	.write_u64 = cgroup_clone_children_write,
				660	},
				661	{
				662	.name = "cgroup.sane_behavior",
				663	.flags = CFTYPE_ONLY_ON_ROOT,
				664	.seq_show = cgroup_sane_behavior_show,
				665	},
				666	{
				667	.name = "tasks",
				668	.seq_start = cgroup_pidlist_start,
				669	.seq_next = cgroup_pidlist_next,
				670	.seq_stop = cgroup_pidlist_stop,
				671	.seq_show = cgroup_pidlist_show,
				672	.private = CGROUP_FILE_TASKS,
Tejun Heo	715c809	2017-05-15 09:34:00 -0400	[diff] [blame]	673	.write = cgroup1_tasks_write,
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	674	},
				675	{
				676	.name = "notify_on_release",
				677	.read_u64 = cgroup_read_notify_on_release,
				678	.write_u64 = cgroup_write_notify_on_release,
				679	},
				680	{
				681	.name = "release_agent",
				682	.flags = CFTYPE_ONLY_ON_ROOT,
				683	.seq_show = cgroup_release_agent_show,
				684	.write = cgroup_release_agent_write,
				685	.max_write_len = PATH_MAX - 1,
				686	},
				687	{ } /* terminate */
				688	};
				689
				690	/* Display information about each subsystem and each hierarchy */
Christoph Hellwig	3f3942a	2018-05-15 15:57:23 +0200	[diff] [blame]	691	int proc_cgroupstats_show(struct seq_file m, void v)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	692	{
				693	struct cgroup_subsys *ss;
				694	int i;
				695
				696	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
				697	/*
				698	* ideally we don't want subsystems moving around while we do this.
				699	* cgroup_mutex is also necessary to guarantee an atomic snapshot of
				700	* subsys/hierarchy state.
				701	*/
				702	mutex_lock(&cgroup_mutex);
				703
				704	for_each_subsys(ss, i)
				705	seq_printf(m, "%s\t%d\t%d\t%d\n",
				706	ss->legacy_name, ss->root->hierarchy_id,
				707	atomic_read(&ss->root->nr_cgrps),
				708	cgroup_ssid_enabled(i));
				709
				710	mutex_unlock(&cgroup_mutex);
				711	return 0;
				712	}
				713
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	714	/**
				715	* cgroupstats_build - build and fill cgroupstats
				716	* @stats: cgroupstats to fill information into
				717	* @dentry: A dentry entry belonging to the cgroup for which stats have
				718	* been requested.
				719	*
				720	* Build and fill cgroupstats so that taskstats can export it to user
				721	* space.
				722	*/
				723	int cgroupstats_build(struct cgroupstats stats, struct dentry dentry)
				724	{
				725	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
				726	struct cgroup *cgrp;
				727	struct css_task_iter it;
				728	struct task_struct *tsk;
				729
				730	/* it should be kernfs_node belonging to cgroupfs and is a directory */
				731	if (dentry->d_sb->s_type != &cgroup_fs_type \|\| !kn \|\|
				732	kernfs_type(kn) != KERNFS_DIR)
				733	return -EINVAL;
				734
				735	mutex_lock(&cgroup_mutex);
				736
				737	/*
				738	* We aren't being called from kernfs and there's no guarantee on
				739	* @kn->priv's validity. For this and css_tryget_online_from_dir(),
				740	* @kn->priv is RCU safe. Let's do the RCU dancing.
				741	*/
				742	rcu_read_lock();
Tejun Heo	e0aed7c	2016-12-27 14:49:09 -0500	[diff] [blame]	743	cgrp = rcu_dereference((void __rcu __force *)&kn->priv);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	744	if (!cgrp \|\| cgroup_is_dead(cgrp)) {
				745	rcu_read_unlock();
				746	mutex_unlock(&cgroup_mutex);
				747	return -ENOENT;
				748	}
				749	rcu_read_unlock();
				750
Tejun Heo	bc2fb7e	2017-05-15 09:34:01 -0400	[diff] [blame]	751	css_task_iter_start(&cgrp->self, 0, &it);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	752	while ((tsk = css_task_iter_next(&it))) {
				753	switch (tsk->state) {
				754	case TASK_RUNNING:
				755	stats->nr_running++;
				756	break;
				757	case TASK_INTERRUPTIBLE:
				758	stats->nr_sleeping++;
				759	break;
				760	case TASK_UNINTERRUPTIBLE:
				761	stats->nr_uninterruptible++;
				762	break;
				763	case TASK_STOPPED:
				764	stats->nr_stopped++;
				765	break;
				766	default:
				767	if (delayacct_is_task_waiting_on_io(tsk))
				768	stats->nr_io_wait++;
				769	break;
				770	}
				771	}
				772	css_task_iter_end(&it);
				773
				774	mutex_unlock(&cgroup_mutex);
				775	return 0;
				776	}
				777
Tejun Heo	d62beb7	2016-12-27 14:49:08 -0500	[diff] [blame]	778	void cgroup1_check_for_release(struct cgroup *cgrp)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	779	{
				780	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
				781	!css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
				782	schedule_work(&cgrp->release_agent_work);
				783	}
				784
				785	/*
				786	* Notify userspace when a cgroup is released, by running the
				787	* configured release agent with the name of the cgroup (path
				788	* relative to the root of cgroup file system) as the argument.
				789	*
				790	* Most likely, this user command will try to rmdir this cgroup.
				791	*
				792	* This races with the possibility that some other task will be
				793	* attached to this cgroup before it is removed, or that some other
				794	* user task will 'mkdir' a child cgroup of this cgroup. That's ok.
				795	* The presumed 'rmdir' will fail quietly if this cgroup is no longer
				796	* unused, and this cgroup will be reprieved from its death sentence,
				797	* to continue to serve a useful existence. Next time it's released,
				798	* we will get notified again, if it still has 'notify_on_release' set.
				799	*
				800	* The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
				801	* means only wait until the task is successfully execve()'d. The
				802	* separate release agent task is forked by call_usermodehelper(),
				803	* then control in this thread returns here, without waiting for the
				804	* release agent task. We don't bother to wait because the caller of
				805	* this routine has no use for the exit status of the release agent
				806	* task, so no sense holding our caller up for that.
				807	*/
Tejun Heo	d62beb7	2016-12-27 14:49:08 -0500	[diff] [blame]	808	void cgroup1_release_agent(struct work_struct *work)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	809	{
				810	struct cgroup *cgrp =
				811	container_of(work, struct cgroup, release_agent_work);
				812	char pathbuf = NULL, agentbuf = NULL;
				813	char argv[3], envp[3];
				814	int ret;
				815
				816	mutex_lock(&cgroup_mutex);
				817
				818	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
				819	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
				820	if (!pathbuf \|\| !agentbuf)
				821	goto out;
				822
				823	spin_lock_irq(&css_set_lock);
				824	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
				825	spin_unlock_irq(&css_set_lock);
				826	if (ret < 0 \|\| ret >= PATH_MAX)
				827	goto out;
				828
				829	argv[0] = agentbuf;
				830	argv[1] = pathbuf;
				831	argv[2] = NULL;
				832
				833	/* minimal command environment */
				834	envp[0] = "HOME=/";
				835	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
				836	envp[2] = NULL;
				837
				838	mutex_unlock(&cgroup_mutex);
				839	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
				840	goto out_free;
				841	out:
				842	mutex_unlock(&cgroup_mutex);
				843	out_free:
				844	kfree(agentbuf);
				845	kfree(pathbuf);
				846	}
				847
				848	/*
				849	* cgroup_rename - Only allow simple rename of directories in place.
				850	*/
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	851	static int cgroup1_rename(struct kernfs_node kn, struct kernfs_node new_parent,
				852	const char *new_name_str)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	853	{
				854	struct cgroup *cgrp = kn->priv;
				855	int ret;
				856
				857	if (kernfs_type(kn) != KERNFS_DIR)
				858	return -ENOTDIR;
				859	if (kn->parent != new_parent)
				860	return -EIO;
				861
				862	/*
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	863	* We're gonna grab cgroup_mutex which nests outside kernfs
				864	* active_ref. kernfs_rename() doesn't require active_ref
				865	* protection. Break them before grabbing cgroup_mutex.
				866	*/
				867	kernfs_break_active_protection(new_parent);
				868	kernfs_break_active_protection(kn);
				869
				870	mutex_lock(&cgroup_mutex);
				871
				872	ret = kernfs_rename(kn, new_parent, new_name_str);
				873	if (!ret)
Steven Rostedt (VMware)	e4f8d81	2018-07-09 17:48:54 -0400	[diff] [blame]	874	TRACE_CGROUP_PATH(rename, cgrp);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	875
				876	mutex_unlock(&cgroup_mutex);
				877
				878	kernfs_unbreak_active_protection(kn);
				879	kernfs_unbreak_active_protection(new_parent);
				880	return ret;
				881	}
				882
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	883	static int cgroup1_show_options(struct seq_file seq, struct kernfs_root kf_root)
				884	{
				885	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
				886	struct cgroup_subsys *ss;
				887	int ssid;
				888
				889	for_each_subsys(ss, ssid)
				890	if (root->subsys_mask & (1 << ssid))
				891	seq_show_option(seq, ss->legacy_name, NULL);
				892	if (root->flags & CGRP_ROOT_NOPREFIX)
				893	seq_puts(seq, ",noprefix");
				894	if (root->flags & CGRP_ROOT_XATTR)
				895	seq_puts(seq, ",xattr");
Waiman Long	e1cba4b	2017-08-17 15:33:09 -0400	[diff] [blame]	896	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
				897	seq_puts(seq, ",cpuset_v2_mode");
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	898
				899	spin_lock(&release_agent_path_lock);
				900	if (strlen(root->release_agent_path))
				901	seq_show_option(seq, "release_agent",
				902	root->release_agent_path);
				903	spin_unlock(&release_agent_path_lock);
				904
				905	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
				906	seq_puts(seq, ",clone_children");
				907	if (strlen(root->name))
				908	seq_show_option(seq, "name", root->name);
				909	return 0;
				910	}
				911
/* Identifiers for the cgroup v1 mount options accepted by fs_parse(). */
enum cgroup1_param {
	Opt_all,		/* "all" */
	Opt_clone_children,	/* "clone_children" */
	Opt_cpuset_v2_mode,	/* "cpuset_v2_mode" */
	Opt_name,		/* "name=<string>" */
	Opt_none,		/* "none" */
	Opt_noprefix,		/* "noprefix" */
	Opt_release_agent,	/* "release_agent=<string>" */
	Opt_xattr,		/* "xattr" */
};
				922
				923	static const struct fs_parameter_spec cgroup1_param_specs[] = {
				924	fsparam_flag ("all", Opt_all),
				925	fsparam_flag ("clone_children", Opt_clone_children),
				926	fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
				927	fsparam_string("name", Opt_name),
				928	fsparam_flag ("none", Opt_none),
				929	fsparam_flag ("noprefix", Opt_noprefix),
				930	fsparam_string("release_agent", Opt_release_agent),
				931	fsparam_flag ("xattr", Opt_xattr),
				932	{}
				933	};
				934
				935	const struct fs_parameter_description cgroup1_fs_parameters = {
				936	.name = "cgroup1",
				937	.specs = cgroup1_param_specs,
				938	};
				939
				940	int cgroup1_parse_param(struct fs_context fc, struct fs_parameter param)
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	941	{
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	942	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	943	struct cgroup_subsys *ss;
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	944	struct fs_parse_result result;
				945	int opt, i;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	946
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	947	opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result);
				948	if (opt == -ENOPARAM) {
				949	if (strcmp(param->key, "source") == 0) {
				950	fc->source = param->string;
				951	param->string = NULL;
				952	return 0;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	953	}
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	954	for_each_subsys(ss, i) {
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	955	if (strcmp(param->key, ss->legacy_name))
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	956	continue;
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	957	ctx->subsys_mask \|= (1 << i);
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	958	return 0;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	959	}
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	960	return cg_invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key);
				961	}
				962	if (opt < 0)
				963	return opt;
				964
				965	switch (opt) {
				966	case Opt_none:
				967	/* Explicitly have no subsystems */
				968	ctx->none = true;
				969	break;
				970	case Opt_all:
				971	ctx->all_ss = true;
				972	break;
				973	case Opt_noprefix:
				974	ctx->flags \|= CGRP_ROOT_NOPREFIX;
				975	break;
				976	case Opt_clone_children:
				977	ctx->cpuset_clone_children = true;
				978	break;
				979	case Opt_cpuset_v2_mode:
				980	ctx->flags \|= CGRP_ROOT_CPUSET_V2_MODE;
				981	break;
				982	case Opt_xattr:
				983	ctx->flags \|= CGRP_ROOT_XATTR;
				984	break;
				985	case Opt_release_agent:
				986	/* Specifying two release agents is forbidden */
				987	if (ctx->release_agent)
				988	return cg_invalf(fc, "cgroup1: release_agent respecified");
				989	ctx->release_agent = param->string;
				990	param->string = NULL;
				991	break;
				992	case Opt_name:
				993	/* blocked by boot param? */
				994	if (cgroup_no_v1_named)
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	995	return -ENOENT;
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	996	/* Can't specify an empty name */
				997	if (!param->size)
				998	return cg_invalf(fc, "cgroup1: Empty name");
				999	if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
				1000	return cg_invalf(fc, "cgroup1: Name too long");
				1001	/* Must match [\w.-]+ */
				1002	for (i = 0; i < param->size; i++) {
				1003	char c = param->string[i];
				1004	if (isalnum(c))
				1005	continue;
				1006	if ((c == '.') \|\| (c == '-') \|\| (c == '_'))
				1007	continue;
				1008	return cg_invalf(fc, "cgroup1: Invalid name");
				1009	}
				1010	/* Specifying two names is forbidden */
				1011	if (ctx->name)
				1012	return cg_invalf(fc, "cgroup1: name respecified");
				1013	ctx->name = param->string;
				1014	param->string = NULL;
				1015	break;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1016	}
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1017	return 0;
				1018	}
				1019
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	1020	static int check_cgroupfs_options(struct fs_context *fc)
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1021	{
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	1022	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1023	u16 mask = U16_MAX;
				1024	u16 enabled = 0;
				1025	struct cgroup_subsys *ss;
				1026	int i;
				1027
				1028	#ifdef CONFIG_CPUSETS
				1029	mask = ~((u16)1 << cpuset_cgrp_id);
				1030	#endif
				1031	for_each_subsys(ss, i)
				1032	if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
				1033	enabled \|= 1 << i;
				1034
				1035	ctx->subsys_mask &= enabled;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1036
				1037	/*
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1038	* In absense of 'none', 'name=' or subsystem name options,
				1039	* let's default to 'all'.
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1040	*/
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1041	if (!ctx->subsys_mask && !ctx->none && !ctx->name)
				1042	ctx->all_ss = true;
				1043
				1044	if (ctx->all_ss) {
				1045	/* Mutually exclusive option 'all' + subsystem name */
				1046	if (ctx->subsys_mask)
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	1047	return cg_invalf(fc, "cgroup1: subsys name conflicts with all");
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1048	/* 'all' => select all the subsystems */
				1049	ctx->subsys_mask = enabled;
				1050	}
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1051
				1052	/*
				1053	* We either have to specify by name or by subsystems. (So all
				1054	* empty hierarchies must have a name).
				1055	*/
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1056	if (!ctx->subsys_mask && !ctx->name)
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	1057	return cg_invalf(fc, "cgroup1: Need name or subsystem set");
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1058
				1059	/*
				1060	* Option noprefix was introduced just for backward compatibility
				1061	* with the old cpuset, so we allow noprefix only if mounting just
				1062	* the cpuset subsystem.
				1063	*/
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1064	if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	1065	return cg_invalf(fc, "cgroup1: noprefix used incorrectly");
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1066
				1067	/* Can't specify "none" and some subsystems */
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1068	if (ctx->subsys_mask && ctx->none)
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	1069	return cg_invalf(fc, "cgroup1: none used incorrectly");
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1070
				1071	return 0;
				1072	}
				1073
Al Viro	9012962	2019-01-05 00:38:03 -0500	[diff] [blame]	1074	int cgroup1_reconfigure(struct fs_context *fc)
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1075	{
Al Viro	9012962	2019-01-05 00:38:03 -0500	[diff] [blame]	1076	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
				1077	struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1078	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
Al Viro	9012962	2019-01-05 00:38:03 -0500	[diff] [blame]	1079	int ret = 0;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1080	u16 added_mask, removed_mask;
				1081
				1082	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
				1083
				1084	/* See what subsystems are wanted */
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	1085	ret = check_cgroupfs_options(fc);
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1086	if (ret)
				1087	goto out_unlock;
				1088
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1089	if (ctx->subsys_mask != root->subsys_mask \|\| ctx->release_agent)
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1090	pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
				1091	task_tgid_nr(current), current->comm);
				1092
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1093	added_mask = ctx->subsys_mask & ~root->subsys_mask;
				1094	removed_mask = root->subsys_mask & ~ctx->subsys_mask;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1095
				1096	/* Don't allow flags or name to change at remount */
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1097	if ((ctx->flags ^ root->flags) \|\|
				1098	(ctx->name && strcmp(ctx->name, root->name))) {
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	1099	cg_invalf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1100	ctx->flags, ctx->name ?: "", root->flags, root->name);
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1101	ret = -EINVAL;
				1102	goto out_unlock;
				1103	}
				1104
				1105	/* remounting is not allowed for populated hierarchies */
				1106	if (!list_empty(&root->cgrp.self.children)) {
				1107	ret = -EBUSY;
				1108	goto out_unlock;
				1109	}
				1110
				1111	ret = rebind_subsystems(root, added_mask);
				1112	if (ret)
				1113	goto out_unlock;
				1114
				1115	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
				1116
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1117	if (ctx->release_agent) {
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1118	spin_lock(&release_agent_path_lock);
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1119	strcpy(root->release_agent_path, ctx->release_agent);
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1120	spin_unlock(&release_agent_path_lock);
				1121	}
				1122
				1123	trace_cgroup_remount(root);
				1124
				1125	out_unlock:
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1126	mutex_unlock(&cgroup_mutex);
				1127	return ret;
				1128	}
				1129
				1130	struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
				1131	.rename = cgroup1_rename,
				1132	.show_options = cgroup1_show_options,
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1133	.mkdir = cgroup_mkdir,
				1134	.rmdir = cgroup_rmdir,
				1135	.show_path = cgroup_show_path,
				1136	};
				1137
Al Viro	6678889	2019-01-17 09:42:30 -0500	[diff] [blame]	1138	/*
				1139	* The guts of cgroup1 mount - find or create cgroup_root to use.
				1140	* Called with cgroup_mutex held; returns 0 on success, -E... on
				1141	* error and positive - in case when the candidate is busy dying.
				1142	* On success it stashes a reference to cgroup_root into given
				1143	* cgroup_fs_context; that reference is NOT counting towards the
				1144	* cgroup_root refcount.
				1145	*/
				1146	static int cgroup1_root_to_use(struct fs_context *fc)
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1147	{
Al Viro	7feeef5	2019-01-16 21:23:02 -0500	[diff] [blame]	1148	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1149	struct cgroup_root *root;
				1150	struct cgroup_subsys *ss;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1151	int i, ret;
				1152
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1153	/* First find the desired set of subsystems */
Al Viro	8d2451f	2019-01-17 00:15:11 -0500	[diff] [blame]	1154	ret = check_cgroupfs_options(fc);
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1155	if (ret)
Al Viro	6678889	2019-01-17 09:42:30 -0500	[diff] [blame]	1156	return ret;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1157
				1158	/*
				1159	* Destruction of cgroup root is asynchronous, so subsystems may
				1160	* still be dying after the previous unmount. Let's drain the
				1161	* dying subsystems. We just need to ensure that the ones
				1162	* unmounted previously finish dying and don't care about new ones
				1163	* starting. Testing ref liveliness is good enough.
				1164	*/
				1165	for_each_subsys(ss, i) {
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1166	if (!(ctx->subsys_mask & (1 << i)) \|\|
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1167	ss->root == &cgrp_dfl_root)
				1168	continue;
				1169
Al Viro	6678889	2019-01-17 09:42:30 -0500	[diff] [blame]	1170	if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
				1171	return 1; /* restart */
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1172	cgroup_put(&ss->root->cgrp);
				1173	}
				1174
				1175	for_each_root(root) {
				1176	bool name_match = false;
				1177
				1178	if (root == &cgrp_dfl_root)
				1179	continue;
				1180
				1181	/*
				1182	* If we asked for a name then it must match. Also, if
				1183	* name matches but sybsys_mask doesn't, we should fail.
				1184	* Remember whether name matched.
				1185	*/
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1186	if (ctx->name) {
				1187	if (strcmp(ctx->name, root->name))
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1188	continue;
				1189	name_match = true;
				1190	}
				1191
				1192	/*
				1193	* If we asked for subsystems (or explicitly for no
				1194	* subsystems) then they must match.
				1195	*/
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1196	if ((ctx->subsys_mask \|\| ctx->none) &&
				1197	(ctx->subsys_mask != root->subsys_mask)) {
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1198	if (!name_match)
				1199	continue;
Al Viro	6678889	2019-01-17 09:42:30 -0500	[diff] [blame]	1200	return -EBUSY;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1201	}
				1202
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1203	if (root->flags ^ ctx->flags)
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1204	pr_warn("new mount options do not match the existing superblock, will be ignored\n");
				1205
Al Viro	cf6299b1	2019-01-17 02:25:51 -0500	[diff] [blame]	1206	ctx->root = root;
Al Viro	6678889	2019-01-17 09:42:30 -0500	[diff] [blame]	1207	return 0;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1208	}
				1209
				1210	/*
				1211	* No such thing, create a new one. name= matching without subsys
				1212	* specification is allowed for already existing hierarchies but we
				1213	* can't create new one without subsys specification.
				1214	*/
Al Viro	6678889	2019-01-17 09:42:30 -0500	[diff] [blame]	1215	if (!ctx->subsys_mask && !ctx->none)
				1216	return cg_invalf(fc, "cgroup1: No subsys list or none specified");
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1217
				1218	/* Hierarchies may only be created in the initial cgroup namespace. */
Al Viro	cca8f32	2019-01-17 10:14:26 -0500	[diff] [blame]	1219	if (ctx->ns != &init_cgroup_ns)
Al Viro	6678889	2019-01-17 09:42:30 -0500	[diff] [blame]	1220	return -EPERM;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1221
				1222	root = kzalloc(sizeof(*root), GFP_KERNEL);
Al Viro	6678889	2019-01-17 09:42:30 -0500	[diff] [blame]	1223	if (!root)
				1224	return -ENOMEM;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1225
Al Viro	cf6299b1	2019-01-17 02:25:51 -0500	[diff] [blame]	1226	ctx->root = root;
				1227	init_cgroup_root(ctx);
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1228
Al Viro	f5dfb53	2019-01-16 23:42:38 -0500	[diff] [blame]	1229	ret = cgroup_setup_root(root, ctx->subsys_mask);
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1230	if (ret)
				1231	cgroup_free_root(root);
Al Viro	6678889	2019-01-17 09:42:30 -0500	[diff] [blame]	1232	return ret;
				1233	}
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1234
Al Viro	6678889	2019-01-17 09:42:30 -0500	[diff] [blame]	1235	int cgroup1_get_tree(struct fs_context *fc)
				1236	{
Al Viro	6678889	2019-01-17 09:42:30 -0500	[diff] [blame]	1237	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
				1238	int ret;
				1239
				1240	/* Check if the caller has permission to mount. */
Al Viro	cca8f32	2019-01-17 10:14:26 -0500	[diff] [blame]	1241	if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
Al Viro	6678889	2019-01-17 09:42:30 -0500	[diff] [blame]	1242	return -EPERM;
				1243
				1244	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
				1245
				1246	ret = cgroup1_root_to_use(fc);
				1247	if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
				1248	ret = 1; /* restart */
				1249
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1250	mutex_unlock(&cgroup_mutex);
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1251
Al Viro	6678889	2019-01-17 09:42:30 -0500	[diff] [blame]	1252	if (!ret)
Al Viro	cca8f32	2019-01-17 10:14:26 -0500	[diff] [blame]	1253	ret = cgroup_do_get_tree(fc);
Al Viro	6678889	2019-01-17 09:42:30 -0500	[diff] [blame]	1254
				1255	if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
Al Viro	71d883c	2019-01-17 02:44:07 -0500	[diff] [blame]	1256	struct super_block *sb = fc->root->d_sb;
				1257	dput(fc->root);
Al Viro	35ac118	2019-01-12 00:20:54 -0500	[diff] [blame]	1258	deactivate_locked_super(sb);
Al Viro	6678889	2019-01-17 09:42:30 -0500	[diff] [blame]	1259	ret = 1;
				1260	}
				1261
				1262	if (unlikely(ret > 0)) {
Al Viro	35ac118	2019-01-12 00:20:54 -0500	[diff] [blame]	1263	msleep(10);
Al Viro	7feeef5	2019-01-16 21:23:02 -0500	[diff] [blame]	1264	return restart_syscall();
Zefan Li	9732adc	2017-04-19 10:15:59 +0800	[diff] [blame]	1265	}
Al Viro	71d883c	2019-01-17 02:44:07 -0500	[diff] [blame]	1266	return ret;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1267	}
				1268
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	1269	static int __init cgroup1_wq_init(void)
				1270	{
				1271	/*
				1272	* Used to destroy pidlists and separate to serve as flush domain.
				1273	* Cap @max_active to 1 too.
				1274	*/
				1275	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
				1276	0, 1);
				1277	BUG_ON(!cgroup_pidlist_destroy_wq);
				1278	return 0;
				1279	}
				1280	core_initcall(cgroup1_wq_init);
				1281
				1282	static int __init cgroup_no_v1(char *str)
				1283	{
				1284	struct cgroup_subsys *ss;
				1285	char *token;
				1286	int i;
				1287
				1288	while ((token = strsep(&str, ",")) != NULL) {
				1289	if (!*token)
				1290	continue;
				1291
				1292	if (!strcmp(token, "all")) {
				1293	cgroup_no_v1_mask = U16_MAX;
Tejun Heo	3fc9c12	2018-12-28 10:31:07 -0800	[diff] [blame]	1294	continue;
				1295	}
				1296
				1297	if (!strcmp(token, "named")) {
				1298	cgroup_no_v1_named = true;
				1299	continue;
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	1300	}
				1301
				1302	for_each_subsys(ss, i) {
				1303	if (strcmp(token, ss->name) &&
				1304	strcmp(token, ss->legacy_name))
				1305	continue;
				1306
				1307	cgroup_no_v1_mask \|= 1 << i;
				1308	}
				1309	}
				1310	return 1;
				1311	}
				1312	__setup("cgroup_no_v1=", cgroup_no_v1);