Blame - kernel/cgroup/cgroup-v1.c - SHIFTPHONES/mainline/linux

blob: 51063e7a93c28a7c0ad5f8cca34072dffb85e650 [file] [log] [blame]

Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	1	#include "cgroup-internal.h"
				2
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	3	#include <linux/ctype.h>
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	4	#include <linux/kmod.h>
				5	#include <linux/sort.h>
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	6	#include <linux/delay.h>
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	7	#include <linux/mm.h>
Ingo Molnar	c3edc40	2017-02-02 08:35:14 +0100	[diff] [blame]	8	#include <linux/sched/signal.h>
Ingo Molnar	56cd697	2017-02-06 10:57:33 +0100	[diff] [blame]	9	#include <linux/sched/task.h>
Ingo Molnar	50ff9d1	2017-02-05 16:03:58 +0100	[diff] [blame]	10	#include <linux/magic.h>
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	11	#include <linux/slab.h>
				12	#include <linux/vmalloc.h>
				13	#include <linux/delayacct.h>
				14	#include <linux/pid_namespace.h>
				15	#include <linux/cgroupstats.h>
				16
				17	#include <trace/events/cgroup.h>
				18
				19	/*
				20	* pidlists linger the following amount before being destroyed. The goal
				21	* is avoiding frequent destruction in the middle of consecutive read calls
				22	* Expiring in the middle is a performance problem not a correctness one.
				23	* 1 sec should be enough.
				24	*/
				25	#define CGROUP_PIDLIST_DESTROY_DELAY HZ
				26
				27	/* Controllers blocked by the commandline in v1 */
				28	static u16 cgroup_no_v1_mask;
				29
				30	/*
				31	* pidlist destructions need to be flushed on cgroup destruction. Use a
				32	* separate workqueue as flush domain.
				33	*/
				34	static struct workqueue_struct *cgroup_pidlist_destroy_wq;
				35
				36	/*
				37	* Protects cgroup_subsys->release_agent_path. Modifying it also requires
				38	* cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
				39	*/
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	40	static DEFINE_SPINLOCK(release_agent_path_lock);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	41
Tejun Heo	d62beb7	2016-12-27 14:49:08 -0500	[diff] [blame]	42	bool cgroup1_ssid_disabled(int ssid)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	43	{
				44	return cgroup_no_v1_mask & (1 << ssid);
				45	}
				46
				47	/**
				48	* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
				49	* @from: attach to all cgroups of a given task
				50	* @tsk: the task to be attached
				51	*/
				52	int cgroup_attach_task_all(struct task_struct from, struct task_struct tsk)
				53	{
				54	struct cgroup_root *root;
				55	int retval = 0;
				56
				57	mutex_lock(&cgroup_mutex);
				58	percpu_down_write(&cgroup_threadgroup_rwsem);
				59	for_each_root(root) {
				60	struct cgroup *from_cgrp;
				61
				62	if (root == &cgrp_dfl_root)
				63	continue;
				64
				65	spin_lock_irq(&css_set_lock);
				66	from_cgrp = task_cgroup_from_root(from, root);
				67	spin_unlock_irq(&css_set_lock);
				68
				69	retval = cgroup_attach_task(from_cgrp, tsk, false);
				70	if (retval)
				71	break;
				72	}
				73	percpu_up_write(&cgroup_threadgroup_rwsem);
				74	mutex_unlock(&cgroup_mutex);
				75
				76	return retval;
				77	}
				78	EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
				79
				80	/**
				81	* cgroup_trasnsfer_tasks - move tasks from one cgroup to another
				82	* @to: cgroup to which the tasks will be moved
				83	* @from: cgroup in which the tasks currently reside
				84	*
				85	* Locking rules between cgroup_post_fork() and the migration path
				86	* guarantee that, if a task is forking while being migrated, the new child
				87	* is guaranteed to be either visible in the source cgroup after the
				88	* parent's migration is complete or put into the target cgroup. No task
				89	* can slip out of migration through forking.
				90	*/
				91	int cgroup_transfer_tasks(struct cgroup to, struct cgroup from)
				92	{
Tejun Heo	e595cd7	2017-01-15 19:03:41 -0500	[diff] [blame]	93	DEFINE_CGROUP_MGCTX(mgctx);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	94	struct cgrp_cset_link *link;
				95	struct css_task_iter it;
				96	struct task_struct *task;
				97	int ret;
				98
				99	if (cgroup_on_dfl(to))
				100	return -EINVAL;
				101
Tejun Heo	8cfd814	2017-07-21 11:14:51 -0400	[diff] [blame]	102	ret = cgroup_migrate_vet_dst(to);
				103	if (ret)
				104	return ret;
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	105
				106	mutex_lock(&cgroup_mutex);
				107
				108	percpu_down_write(&cgroup_threadgroup_rwsem);
				109
				110	/* all tasks in @from are being moved, all csets are source */
				111	spin_lock_irq(&css_set_lock);
				112	list_for_each_entry(link, &from->cset_links, cset_link)
Tejun Heo	e595cd7	2017-01-15 19:03:41 -0500	[diff] [blame]	113	cgroup_migrate_add_src(link->cset, to, &mgctx);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	114	spin_unlock_irq(&css_set_lock);
				115
Tejun Heo	e595cd7	2017-01-15 19:03:41 -0500	[diff] [blame]	116	ret = cgroup_migrate_prepare_dst(&mgctx);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	117	if (ret)
				118	goto out_err;
				119
				120	/*
				121	* Migrate tasks one-by-one until @from is empty. This fails iff
				122	* ->can_attach() fails.
				123	*/
				124	do {
Tejun Heo	bc2fb7e	2017-05-15 09:34:01 -0400	[diff] [blame]	125	css_task_iter_start(&from->self, 0, &it);
Prateek Sood	116d2f7	2017-12-19 12:56:57 +0530	[diff] [blame]	126
				127	do {
				128	task = css_task_iter_next(&it);
				129	} while (task && (task->flags & PF_EXITING));
				130
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	131	if (task)
				132	get_task_struct(task);
				133	css_task_iter_end(&it);
				134
				135	if (task) {
Tejun Heo	bfc2cf6	2017-01-15 19:03:41 -0500	[diff] [blame]	136	ret = cgroup_migrate(task, false, &mgctx);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	137	if (!ret)
Steven Rostedt (VMware)	e4f8d81	2018-07-09 17:48:54 -0400	[diff] [blame]	138	TRACE_CGROUP_PATH(transfer_tasks, to, task, false);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	139	put_task_struct(task);
				140	}
				141	} while (task && !ret);
				142	out_err:
Tejun Heo	e595cd7	2017-01-15 19:03:41 -0500	[diff] [blame]	143	cgroup_migrate_finish(&mgctx);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	144	percpu_up_write(&cgroup_threadgroup_rwsem);
				145	mutex_unlock(&cgroup_mutex);
				146	return ret;
				147	}
				148
				149	/*
				150	* Stuff for reading the 'tasks'/'procs' files.
				151	*
				152	* Reading this file can return large amounts of data if a cgroup has
				153	* lots of attached tasks. So it may need several calls to read(),
				154	* but we cannot guarantee that the information we produce is correct
				155	* unless we produce it entirely atomically.
				156	*
				157	*/
				158
				159	/* which pidlist file are we talking about? */
				160	enum cgroup_filetype {
				161	CGROUP_FILE_PROCS,
				162	CGROUP_FILE_TASKS,
				163	};
				164
				165	/*
				166	* A pidlist is a list of pids that virtually represents the contents of one
				167	* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
				168	* a pair (one each for procs, tasks) for each pid namespace that's relevant
				169	* to the cgroup.
				170	*/
				171	struct cgroup_pidlist {
				172	/*
				173	* used to find which pidlist is wanted. doesn't change as long as
				174	* this particular list stays in the list.
				175	*/
				176	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
				177	/* array of xids */
				178	pid_t *list;
				179	/* how many elements the above list has */
				180	int length;
				181	/* each of these stored in a list by its cgroup */
				182	struct list_head links;
				183	/* pointer to the cgroup we belong to, for list removal purposes */
				184	struct cgroup *owner;
				185	/* for delayed destruction */
				186	struct delayed_work destroy_dwork;
				187	};
				188
				189	/*
				190	* The following two functions "fix" the issue where there are more pids
				191	* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
				192	* TODO: replace with a kernel-wide solution to this problem
				193	*/
				194	#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
				195	static void *pidlist_allocate(int count)
				196	{
				197	if (PIDLIST_TOO_LARGE(count))
Kees Cook	42bc47b	2018-06-12 14:27:11 -0700	[diff] [blame]	198	return vmalloc(array_size(count, sizeof(pid_t)));
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	199	else
Kees Cook	6da2ec5	2018-06-12 13:55:00 -0700	[diff] [blame]	200	return kmalloc_array(count, sizeof(pid_t), GFP_KERNEL);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	201	}
				202
				203	static void pidlist_free(void *p)
				204	{
				205	kvfree(p);
				206	}
				207
				208	/*
				209	* Used to destroy all pidlists lingering waiting for destroy timer. None
				210	* should be left afterwards.
				211	*/
Tejun Heo	d62beb7	2016-12-27 14:49:08 -0500	[diff] [blame]	212	void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	213	{
				214	struct cgroup_pidlist l, tmp_l;
				215
				216	mutex_lock(&cgrp->pidlist_mutex);
				217	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
				218	mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
				219	mutex_unlock(&cgrp->pidlist_mutex);
				220
				221	flush_workqueue(cgroup_pidlist_destroy_wq);
				222	BUG_ON(!list_empty(&cgrp->pidlists));
				223	}
				224
				225	static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
				226	{
				227	struct delayed_work *dwork = to_delayed_work(work);
				228	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
				229	destroy_dwork);
				230	struct cgroup_pidlist *tofree = NULL;
				231
				232	mutex_lock(&l->owner->pidlist_mutex);
				233
				234	/*
				235	* Destroy iff we didn't get queued again. The state won't change
				236	* as destroy_dwork can only be queued while locked.
				237	*/
				238	if (!delayed_work_pending(dwork)) {
				239	list_del(&l->links);
				240	pidlist_free(l->list);
				241	put_pid_ns(l->key.ns);
				242	tofree = l;
				243	}
				244
				245	mutex_unlock(&l->owner->pidlist_mutex);
				246	kfree(tofree);
				247	}
				248
				249	/*
				250	* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
				251	* Returns the number of unique elements.
				252	*/
				253	static int pidlist_uniq(pid_t *list, int length)
				254	{
				255	int src, dest = 1;
				256
				257	/*
				258	* we presume the 0th element is unique, so i starts at 1. trivial
				259	* edge cases first; no work needs to be done for either
				260	*/
				261	if (length == 0 \|\| length == 1)
				262	return length;
				263	/* src and dest walk down the list; dest counts unique elements */
				264	for (src = 1; src < length; src++) {
				265	/* find next unique element */
				266	while (list[src] == list[src-1]) {
				267	src++;
				268	if (src == length)
				269	goto after;
				270	}
				271	/* dest always points to where the next unique element goes */
				272	list[dest] = list[src];
				273	dest++;
				274	}
				275	after:
				276	return dest;
				277	}
				278
				279	/*
				280	* The two pid files - task and cgroup.procs - guaranteed that the result
				281	* is sorted, which forced this whole pidlist fiasco. As pid order is
				282	* different per namespace, each namespace needs differently sorted list,
				283	* making it impossible to use, for example, single rbtree of member tasks
				284	* sorted by task pointer. As pidlists can be fairly large, allocating one
				285	* per open file is dangerous, so cgroup had to implement shared pool of
				286	* pidlists keyed by cgroup and namespace.
				287	*/
				288	static int cmppid(const void a, const void b)
				289	{
				290	return (pid_t )a - (pid_t )b;
				291	}
				292
				293	static struct cgroup_pidlist cgroup_pidlist_find(struct cgroup cgrp,
				294	enum cgroup_filetype type)
				295	{
				296	struct cgroup_pidlist *l;
				297	/* don't need task_nsproxy() if we're looking at ourself */
				298	struct pid_namespace *ns = task_active_pid_ns(current);
				299
				300	lockdep_assert_held(&cgrp->pidlist_mutex);
				301
				302	list_for_each_entry(l, &cgrp->pidlists, links)
				303	if (l->key.type == type && l->key.ns == ns)
				304	return l;
				305	return NULL;
				306	}
				307
				308	/*
				309	* find the appropriate pidlist for our purpose (given procs vs tasks)
				310	* returns with the lock on that pidlist already held, and takes care
				311	* of the use count, or returns NULL with no locks held if we're out of
				312	* memory.
				313	*/
				314	static struct cgroup_pidlist cgroup_pidlist_find_create(struct cgroup cgrp,
				315	enum cgroup_filetype type)
				316	{
				317	struct cgroup_pidlist *l;
				318
				319	lockdep_assert_held(&cgrp->pidlist_mutex);
				320
				321	l = cgroup_pidlist_find(cgrp, type);
				322	if (l)
				323	return l;
				324
				325	/* entry not found; create a new one */
				326	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
				327	if (!l)
				328	return l;
				329
				330	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
				331	l->key.type = type;
				332	/* don't need task_nsproxy() if we're looking at ourself */
				333	l->key.ns = get_pid_ns(task_active_pid_ns(current));
				334	l->owner = cgrp;
				335	list_add(&l->links, &cgrp->pidlists);
				336	return l;
				337	}
				338
				339	/**
				340	* cgroup_task_count - count the number of tasks in a cgroup.
				341	* @cgrp: the cgroup in question
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	342	*/
Waiman Long	a28f8f5	2017-06-13 17:18:02 -0400	[diff] [blame]	343	int cgroup_task_count(const struct cgroup *cgrp)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	344	{
				345	int count = 0;
				346	struct cgrp_cset_link *link;
				347
				348	spin_lock_irq(&css_set_lock);
				349	list_for_each_entry(link, &cgrp->cset_links, cset_link)
Waiman Long	73a7242	2017-06-13 17:18:01 -0400	[diff] [blame]	350	count += link->cset->nr_tasks;
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	351	spin_unlock_irq(&css_set_lock);
				352	return count;
				353	}
				354
				355	/*
				356	* Load a cgroup's pidarray with either procs' tgids or tasks' pids
				357	*/
				358	static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
				359	struct cgroup_pidlist **lp)
				360	{
				361	pid_t *array;
				362	int length;
				363	int pid, n = 0; /* used for populating the array */
				364	struct css_task_iter it;
				365	struct task_struct *tsk;
				366	struct cgroup_pidlist *l;
				367
				368	lockdep_assert_held(&cgrp->pidlist_mutex);
				369
				370	/*
				371	* If cgroup gets more users after we read count, we won't have
				372	* enough space - tough. This race is indistinguishable to the
				373	* caller from the case that the additional cgroup users didn't
				374	* show up until sometime later on.
				375	*/
				376	length = cgroup_task_count(cgrp);
				377	array = pidlist_allocate(length);
				378	if (!array)
				379	return -ENOMEM;
				380	/* now, populate the array */
Tejun Heo	bc2fb7e	2017-05-15 09:34:01 -0400	[diff] [blame]	381	css_task_iter_start(&cgrp->self, 0, &it);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	382	while ((tsk = css_task_iter_next(&it))) {
				383	if (unlikely(n == length))
				384	break;
				385	/* get tgid or pid for procs or tasks file respectively */
				386	if (type == CGROUP_FILE_PROCS)
				387	pid = task_tgid_vnr(tsk);
				388	else
				389	pid = task_pid_vnr(tsk);
				390	if (pid > 0) /* make sure to only use valid results */
				391	array[n++] = pid;
				392	}
				393	css_task_iter_end(&it);
				394	length = n;
				395	/* now sort & (if procs) strip out duplicates */
				396	sort(array, length, sizeof(pid_t), cmppid, NULL);
				397	if (type == CGROUP_FILE_PROCS)
				398	length = pidlist_uniq(array, length);
				399
				400	l = cgroup_pidlist_find_create(cgrp, type);
				401	if (!l) {
				402	pidlist_free(array);
				403	return -ENOMEM;
				404	}
				405
				406	/* store array, freeing old if necessary */
				407	pidlist_free(l->list);
				408	l->list = array;
				409	l->length = length;
				410	*lp = l;
				411	return 0;
				412	}
				413
				414	/*
				415	* seq_file methods for the tasks/procs files. The seq_file position is the
				416	* next pid to display; the seq_file iterator is a pointer to the pid
				417	* in the cgroup->l->list array.
				418	*/
				419
				420	static void cgroup_pidlist_start(struct seq_file s, loff_t *pos)
				421	{
				422	/*
				423	* Initially we receive a position value that corresponds to
				424	* one more than the last pid shown (or 0 on the first call or
				425	* after a seek to the start). Use a binary-search to find the
				426	* next pid to display, if any
				427	*/
				428	struct kernfs_open_file *of = s->private;
				429	struct cgroup *cgrp = seq_css(s)->cgroup;
				430	struct cgroup_pidlist *l;
				431	enum cgroup_filetype type = seq_cft(s)->private;
				432	int index = 0, pid = *pos;
				433	int *iter, ret;
				434
				435	mutex_lock(&cgrp->pidlist_mutex);
				436
				437	/*
				438	* !NULL @of->priv indicates that this isn't the first start()
				439	* after open. If the matching pidlist is around, we can use that.
				440	* Look for it. Note that @of->priv can't be used directly. It
				441	* could already have been destroyed.
				442	*/
				443	if (of->priv)
				444	of->priv = cgroup_pidlist_find(cgrp, type);
				445
				446	/*
				447	* Either this is the first start() after open or the matching
				448	* pidlist has been destroyed inbetween. Create a new one.
				449	*/
				450	if (!of->priv) {
				451	ret = pidlist_array_load(cgrp, type,
				452	(struct cgroup_pidlist **)&of->priv);
				453	if (ret)
				454	return ERR_PTR(ret);
				455	}
				456	l = of->priv;
				457
				458	if (pid) {
				459	int end = l->length;
				460
				461	while (index < end) {
				462	int mid = (index + end) / 2;
				463	if (l->list[mid] == pid) {
				464	index = mid;
				465	break;
				466	} else if (l->list[mid] <= pid)
				467	index = mid + 1;
				468	else
				469	end = mid;
				470	}
				471	}
				472	/* If we're off the end of the array, we're done */
				473	if (index >= l->length)
				474	return NULL;
				475	/* Update the abstract position to be the actual pid that we found */
				476	iter = l->list + index;
				477	pos = iter;
				478	return iter;
				479	}
				480
				481	static void cgroup_pidlist_stop(struct seq_file s, void v)
				482	{
				483	struct kernfs_open_file *of = s->private;
				484	struct cgroup_pidlist *l = of->priv;
				485
				486	if (l)
				487	mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
				488	CGROUP_PIDLIST_DESTROY_DELAY);
				489	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
				490	}
				491
				492	static void cgroup_pidlist_next(struct seq_file s, void v, loff_t pos)
				493	{
				494	struct kernfs_open_file *of = s->private;
				495	struct cgroup_pidlist *l = of->priv;
				496	pid_t *p = v;
				497	pid_t *end = l->list + l->length;
				498	/*
				499	* Advance to the next pid in the array. If this goes off the
				500	* end, we're done
				501	*/
				502	p++;
				503	if (p >= end) {
				504	return NULL;
				505	} else {
				506	pos = p;
				507	return p;
				508	}
				509	}
				510
				511	static int cgroup_pidlist_show(struct seq_file s, void v)
				512	{
				513	seq_printf(s, "%d\n", (int )v);
				514
				515	return 0;
				516	}
				517
Tejun Heo	715c809	2017-05-15 09:34:00 -0400	[diff] [blame]	518	static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
				519	char *buf, size_t nbytes, loff_t off,
				520	bool threadgroup)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	521	{
Tejun Heo	715c809	2017-05-15 09:34:00 -0400	[diff] [blame]	522	struct cgroup *cgrp;
				523	struct task_struct *task;
				524	const struct cred cred, tcred;
				525	ssize_t ret;
				526
				527	cgrp = cgroup_kn_lock_live(of->kn, false);
				528	if (!cgrp)
				529	return -ENODEV;
				530
				531	task = cgroup_procs_write_start(buf, threadgroup);
				532	ret = PTR_ERR_OR_ZERO(task);
				533	if (ret)
				534	goto out_unlock;
				535
				536	/*
				537	* Even if we're attaching all tasks in the thread group, we only
				538	* need to check permissions on one of them.
				539	*/
				540	cred = current_cred();
				541	tcred = get_task_cred(task);
				542	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
				543	!uid_eq(cred->euid, tcred->uid) &&
				544	!uid_eq(cred->euid, tcred->suid))
				545	ret = -EACCES;
				546	put_cred(tcred);
				547	if (ret)
				548	goto out_finish;
				549
				550	ret = cgroup_attach_task(cgrp, task, threadgroup);
				551
				552	out_finish:
				553	cgroup_procs_write_finish(task);
				554	out_unlock:
				555	cgroup_kn_unlock(of->kn);
				556
				557	return ret ?: nbytes;
				558	}
				559
				560	static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
				561	char *buf, size_t nbytes, loff_t off)
				562	{
				563	return __cgroup1_procs_write(of, buf, nbytes, off, true);
				564	}
				565
				566	static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
				567	char *buf, size_t nbytes, loff_t off)
				568	{
				569	return __cgroup1_procs_write(of, buf, nbytes, off, false);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	570	}
				571
				572	static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
				573	char *buf, size_t nbytes, loff_t off)
				574	{
				575	struct cgroup *cgrp;
				576
				577	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
				578
				579	cgrp = cgroup_kn_lock_live(of->kn, false);
				580	if (!cgrp)
				581	return -ENODEV;
				582	spin_lock(&release_agent_path_lock);
				583	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
				584	sizeof(cgrp->root->release_agent_path));
				585	spin_unlock(&release_agent_path_lock);
				586	cgroup_kn_unlock(of->kn);
				587	return nbytes;
				588	}
				589
				590	static int cgroup_release_agent_show(struct seq_file seq, void v)
				591	{
				592	struct cgroup *cgrp = seq_css(seq)->cgroup;
				593
				594	spin_lock(&release_agent_path_lock);
				595	seq_puts(seq, cgrp->root->release_agent_path);
				596	spin_unlock(&release_agent_path_lock);
				597	seq_putc(seq, '\n');
				598	return 0;
				599	}
				600
				601	static int cgroup_sane_behavior_show(struct seq_file seq, void v)
				602	{
				603	seq_puts(seq, "0\n");
				604	return 0;
				605	}
				606
				607	static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
				608	struct cftype *cft)
				609	{
				610	return notify_on_release(css->cgroup);
				611	}
				612
				613	static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
				614	struct cftype *cft, u64 val)
				615	{
				616	if (val)
				617	set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
				618	else
				619	clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
				620	return 0;
				621	}
				622
				623	static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
				624	struct cftype *cft)
				625	{
				626	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
				627	}
				628
				629	static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
				630	struct cftype *cft, u64 val)
				631	{
				632	if (val)
				633	set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
				634	else
				635	clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
				636	return 0;
				637	}
				638
				639	/* cgroup core interface files for the legacy hierarchies */
Tejun Heo	d62beb7	2016-12-27 14:49:08 -0500	[diff] [blame]	640	struct cftype cgroup1_base_files[] = {
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	641	{
				642	.name = "cgroup.procs",
				643	.seq_start = cgroup_pidlist_start,
				644	.seq_next = cgroup_pidlist_next,
				645	.seq_stop = cgroup_pidlist_stop,
				646	.seq_show = cgroup_pidlist_show,
				647	.private = CGROUP_FILE_PROCS,
Tejun Heo	715c809	2017-05-15 09:34:00 -0400	[diff] [blame]	648	.write = cgroup1_procs_write,
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	649	},
				650	{
				651	.name = "cgroup.clone_children",
				652	.read_u64 = cgroup_clone_children_read,
				653	.write_u64 = cgroup_clone_children_write,
				654	},
				655	{
				656	.name = "cgroup.sane_behavior",
				657	.flags = CFTYPE_ONLY_ON_ROOT,
				658	.seq_show = cgroup_sane_behavior_show,
				659	},
				660	{
				661	.name = "tasks",
				662	.seq_start = cgroup_pidlist_start,
				663	.seq_next = cgroup_pidlist_next,
				664	.seq_stop = cgroup_pidlist_stop,
				665	.seq_show = cgroup_pidlist_show,
				666	.private = CGROUP_FILE_TASKS,
Tejun Heo	715c809	2017-05-15 09:34:00 -0400	[diff] [blame]	667	.write = cgroup1_tasks_write,
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	668	},
				669	{
				670	.name = "notify_on_release",
				671	.read_u64 = cgroup_read_notify_on_release,
				672	.write_u64 = cgroup_write_notify_on_release,
				673	},
				674	{
				675	.name = "release_agent",
				676	.flags = CFTYPE_ONLY_ON_ROOT,
				677	.seq_show = cgroup_release_agent_show,
				678	.write = cgroup_release_agent_write,
				679	.max_write_len = PATH_MAX - 1,
				680	},
				681	{ } /* terminate */
				682	};
				683
				684	/* Display information about each subsystem and each hierarchy */
Christoph Hellwig	3f3942a	2018-05-15 15:57:23 +0200	[diff] [blame]	685	int proc_cgroupstats_show(struct seq_file m, void v)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	686	{
				687	struct cgroup_subsys *ss;
				688	int i;
				689
				690	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
				691	/*
				692	* ideally we don't want subsystems moving around while we do this.
				693	* cgroup_mutex is also necessary to guarantee an atomic snapshot of
				694	* subsys/hierarchy state.
				695	*/
				696	mutex_lock(&cgroup_mutex);
				697
				698	for_each_subsys(ss, i)
				699	seq_printf(m, "%s\t%d\t%d\t%d\n",
				700	ss->legacy_name, ss->root->hierarchy_id,
				701	atomic_read(&ss->root->nr_cgrps),
				702	cgroup_ssid_enabled(i));
				703
				704	mutex_unlock(&cgroup_mutex);
				705	return 0;
				706	}
				707
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	708	/**
				709	* cgroupstats_build - build and fill cgroupstats
				710	* @stats: cgroupstats to fill information into
				711	* @dentry: A dentry entry belonging to the cgroup for which stats have
				712	* been requested.
				713	*
				714	* Build and fill cgroupstats so that taskstats can export it to user
				715	* space.
				716	*/
				717	int cgroupstats_build(struct cgroupstats stats, struct dentry dentry)
				718	{
				719	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
				720	struct cgroup *cgrp;
				721	struct css_task_iter it;
				722	struct task_struct *tsk;
				723
				724	/* it should be kernfs_node belonging to cgroupfs and is a directory */
				725	if (dentry->d_sb->s_type != &cgroup_fs_type \|\| !kn \|\|
				726	kernfs_type(kn) != KERNFS_DIR)
				727	return -EINVAL;
				728
				729	mutex_lock(&cgroup_mutex);
				730
				731	/*
				732	* We aren't being called from kernfs and there's no guarantee on
				733	* @kn->priv's validity. For this and css_tryget_online_from_dir(),
				734	* @kn->priv is RCU safe. Let's do the RCU dancing.
				735	*/
				736	rcu_read_lock();
Tejun Heo	e0aed7c	2016-12-27 14:49:09 -0500	[diff] [blame]	737	cgrp = rcu_dereference((void __rcu __force *)&kn->priv);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	738	if (!cgrp \|\| cgroup_is_dead(cgrp)) {
				739	rcu_read_unlock();
				740	mutex_unlock(&cgroup_mutex);
				741	return -ENOENT;
				742	}
				743	rcu_read_unlock();
				744
Tejun Heo	bc2fb7e	2017-05-15 09:34:01 -0400	[diff] [blame]	745	css_task_iter_start(&cgrp->self, 0, &it);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	746	while ((tsk = css_task_iter_next(&it))) {
				747	switch (tsk->state) {
				748	case TASK_RUNNING:
				749	stats->nr_running++;
				750	break;
				751	case TASK_INTERRUPTIBLE:
				752	stats->nr_sleeping++;
				753	break;
				754	case TASK_UNINTERRUPTIBLE:
				755	stats->nr_uninterruptible++;
				756	break;
				757	case TASK_STOPPED:
				758	stats->nr_stopped++;
				759	break;
				760	default:
				761	if (delayacct_is_task_waiting_on_io(tsk))
				762	stats->nr_io_wait++;
				763	break;
				764	}
				765	}
				766	css_task_iter_end(&it);
				767
				768	mutex_unlock(&cgroup_mutex);
				769	return 0;
				770	}
				771
Tejun Heo	d62beb7	2016-12-27 14:49:08 -0500	[diff] [blame]	772	void cgroup1_check_for_release(struct cgroup *cgrp)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	773	{
				774	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
				775	!css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
				776	schedule_work(&cgrp->release_agent_work);
				777	}
				778
				779	/*
				780	* Notify userspace when a cgroup is released, by running the
				781	* configured release agent with the name of the cgroup (path
				782	* relative to the root of cgroup file system) as the argument.
				783	*
				784	* Most likely, this user command will try to rmdir this cgroup.
				785	*
				786	* This races with the possibility that some other task will be
				787	* attached to this cgroup before it is removed, or that some other
				788	* user task will 'mkdir' a child cgroup of this cgroup. That's ok.
				789	* The presumed 'rmdir' will fail quietly if this cgroup is no longer
				790	* unused, and this cgroup will be reprieved from its death sentence,
				791	* to continue to serve a useful existence. Next time it's released,
				792	* we will get notified again, if it still has 'notify_on_release' set.
				793	*
				794	* The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
				795	* means only wait until the task is successfully execve()'d. The
				796	* separate release agent task is forked by call_usermodehelper(),
				797	* then control in this thread returns here, without waiting for the
				798	* release agent task. We don't bother to wait because the caller of
				799	* this routine has no use for the exit status of the release agent
				800	* task, so no sense holding our caller up for that.
				801	*/
Tejun Heo	d62beb7	2016-12-27 14:49:08 -0500	[diff] [blame]	802	void cgroup1_release_agent(struct work_struct *work)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	803	{
				804	struct cgroup *cgrp =
				805	container_of(work, struct cgroup, release_agent_work);
				806	char pathbuf = NULL, agentbuf = NULL;
				807	char argv[3], envp[3];
				808	int ret;
				809
				810	mutex_lock(&cgroup_mutex);
				811
				812	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
				813	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
				814	if (!pathbuf \|\| !agentbuf)
				815	goto out;
				816
				817	spin_lock_irq(&css_set_lock);
				818	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
				819	spin_unlock_irq(&css_set_lock);
				820	if (ret < 0 \|\| ret >= PATH_MAX)
				821	goto out;
				822
				823	argv[0] = agentbuf;
				824	argv[1] = pathbuf;
				825	argv[2] = NULL;
				826
				827	/* minimal command environment */
				828	envp[0] = "HOME=/";
				829	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
				830	envp[2] = NULL;
				831
				832	mutex_unlock(&cgroup_mutex);
				833	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
				834	goto out_free;
				835	out:
				836	mutex_unlock(&cgroup_mutex);
				837	out_free:
				838	kfree(agentbuf);
				839	kfree(pathbuf);
				840	}
				841
				842	/*
				843	* cgroup_rename - Only allow simple rename of directories in place.
				844	*/
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	845	static int cgroup1_rename(struct kernfs_node kn, struct kernfs_node new_parent,
				846	const char *new_name_str)
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	847	{
				848	struct cgroup *cgrp = kn->priv;
				849	int ret;
				850
				851	if (kernfs_type(kn) != KERNFS_DIR)
				852	return -ENOTDIR;
				853	if (kn->parent != new_parent)
				854	return -EIO;
				855
				856	/*
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	857	* We're gonna grab cgroup_mutex which nests outside kernfs
				858	* active_ref. kernfs_rename() doesn't require active_ref
				859	* protection. Break them before grabbing cgroup_mutex.
				860	*/
				861	kernfs_break_active_protection(new_parent);
				862	kernfs_break_active_protection(kn);
				863
				864	mutex_lock(&cgroup_mutex);
				865
				866	ret = kernfs_rename(kn, new_parent, new_name_str);
				867	if (!ret)
Steven Rostedt (VMware)	e4f8d81	2018-07-09 17:48:54 -0400	[diff] [blame]	868	TRACE_CGROUP_PATH(rename, cgrp);
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	869
				870	mutex_unlock(&cgroup_mutex);
				871
				872	kernfs_unbreak_active_protection(kn);
				873	kernfs_unbreak_active_protection(new_parent);
				874	return ret;
				875	}
				876
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	877	static int cgroup1_show_options(struct seq_file seq, struct kernfs_root kf_root)
				878	{
				879	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
				880	struct cgroup_subsys *ss;
				881	int ssid;
				882
				883	for_each_subsys(ss, ssid)
				884	if (root->subsys_mask & (1 << ssid))
				885	seq_show_option(seq, ss->legacy_name, NULL);
				886	if (root->flags & CGRP_ROOT_NOPREFIX)
				887	seq_puts(seq, ",noprefix");
				888	if (root->flags & CGRP_ROOT_XATTR)
				889	seq_puts(seq, ",xattr");
Waiman Long	e1cba4b	2017-08-17 15:33:09 -0400	[diff] [blame]	890	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
				891	seq_puts(seq, ",cpuset_v2_mode");
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	892
				893	spin_lock(&release_agent_path_lock);
				894	if (strlen(root->release_agent_path))
				895	seq_show_option(seq, "release_agent",
				896	root->release_agent_path);
				897	spin_unlock(&release_agent_path_lock);
				898
				899	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
				900	seq_puts(seq, ",clone_children");
				901	if (strlen(root->name))
				902	seq_show_option(seq, "name", root->name);
				903	return 0;
				904	}
				905
				906	static int parse_cgroupfs_options(char data, struct cgroup_sb_opts opts)
				907	{
				908	char token, o = data;
				909	bool all_ss = false, one_ss = false;
				910	u16 mask = U16_MAX;
				911	struct cgroup_subsys *ss;
				912	int nr_opts = 0;
				913	int i;
				914
				915	#ifdef CONFIG_CPUSETS
				916	mask = ~((u16)1 << cpuset_cgrp_id);
				917	#endif
				918
				919	memset(opts, 0, sizeof(*opts));
				920
				921	while ((token = strsep(&o, ",")) != NULL) {
				922	nr_opts++;
				923
				924	if (!*token)
				925	return -EINVAL;
				926	if (!strcmp(token, "none")) {
				927	/* Explicitly have no subsystems */
				928	opts->none = true;
				929	continue;
				930	}
				931	if (!strcmp(token, "all")) {
				932	/* Mutually exclusive option 'all' + subsystem name */
				933	if (one_ss)
				934	return -EINVAL;
				935	all_ss = true;
				936	continue;
				937	}
				938	if (!strcmp(token, "noprefix")) {
				939	opts->flags \|= CGRP_ROOT_NOPREFIX;
				940	continue;
				941	}
				942	if (!strcmp(token, "clone_children")) {
				943	opts->cpuset_clone_children = true;
				944	continue;
				945	}
Waiman Long	e1cba4b	2017-08-17 15:33:09 -0400	[diff] [blame]	946	if (!strcmp(token, "cpuset_v2_mode")) {
				947	opts->flags \|= CGRP_ROOT_CPUSET_V2_MODE;
				948	continue;
				949	}
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	950	if (!strcmp(token, "xattr")) {
				951	opts->flags \|= CGRP_ROOT_XATTR;
				952	continue;
				953	}
				954	if (!strncmp(token, "release_agent=", 14)) {
				955	/* Specifying two release agents is forbidden */
				956	if (opts->release_agent)
				957	return -EINVAL;
				958	opts->release_agent =
				959	kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
				960	if (!opts->release_agent)
				961	return -ENOMEM;
				962	continue;
				963	}
				964	if (!strncmp(token, "name=", 5)) {
				965	const char *name = token + 5;
				966	/* Can't specify an empty name */
				967	if (!strlen(name))
				968	return -EINVAL;
				969	/* Must match [\w.-]+ */
				970	for (i = 0; i < strlen(name); i++) {
				971	char c = name[i];
				972	if (isalnum(c))
				973	continue;
				974	if ((c == '.') \|\| (c == '-') \|\| (c == '_'))
				975	continue;
				976	return -EINVAL;
				977	}
				978	/* Specifying two names is forbidden */
				979	if (opts->name)
				980	return -EINVAL;
				981	opts->name = kstrndup(name,
				982	MAX_CGROUP_ROOT_NAMELEN - 1,
				983	GFP_KERNEL);
				984	if (!opts->name)
				985	return -ENOMEM;
				986
				987	continue;
				988	}
				989
				990	for_each_subsys(ss, i) {
				991	if (strcmp(token, ss->legacy_name))
				992	continue;
				993	if (!cgroup_ssid_enabled(i))
				994	continue;
Tejun Heo	d62beb7	2016-12-27 14:49:08 -0500	[diff] [blame]	995	if (cgroup1_ssid_disabled(i))
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	996	continue;
				997
				998	/* Mutually exclusive option 'all' + subsystem name */
				999	if (all_ss)
				1000	return -EINVAL;
				1001	opts->subsys_mask \|= (1 << i);
				1002	one_ss = true;
				1003
				1004	break;
				1005	}
				1006	if (i == CGROUP_SUBSYS_COUNT)
				1007	return -ENOENT;
				1008	}
				1009
				1010	/*
				1011	* If the 'all' option was specified select all the subsystems,
				1012	* otherwise if 'none', 'name=' and a subsystem name options were
				1013	* not specified, let's default to 'all'
				1014	*/
				1015	if (all_ss \|\| (!one_ss && !opts->none && !opts->name))
				1016	for_each_subsys(ss, i)
Tejun Heo	d62beb7	2016-12-27 14:49:08 -0500	[diff] [blame]	1017	if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1018	opts->subsys_mask \|= (1 << i);
				1019
				1020	/*
				1021	* We either have to specify by name or by subsystems. (So all
				1022	* empty hierarchies must have a name).
				1023	*/
				1024	if (!opts->subsys_mask && !opts->name)
				1025	return -EINVAL;
				1026
				1027	/*
				1028	* Option noprefix was introduced just for backward compatibility
				1029	* with the old cpuset, so we allow noprefix only if mounting just
				1030	* the cpuset subsystem.
				1031	*/
				1032	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
				1033	return -EINVAL;
				1034
				1035	/* Can't specify "none" and some subsystems */
				1036	if (opts->subsys_mask && opts->none)
				1037	return -EINVAL;
				1038
				1039	return 0;
				1040	}
				1041
				1042	static int cgroup1_remount(struct kernfs_root kf_root, int flags, char *data)
				1043	{
				1044	int ret = 0;
				1045	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
				1046	struct cgroup_sb_opts opts;
				1047	u16 added_mask, removed_mask;
				1048
				1049	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
				1050
				1051	/* See what subsystems are wanted */
				1052	ret = parse_cgroupfs_options(data, &opts);
				1053	if (ret)
				1054	goto out_unlock;
				1055
				1056	if (opts.subsys_mask != root->subsys_mask \|\| opts.release_agent)
				1057	pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
				1058	task_tgid_nr(current), current->comm);
				1059
				1060	added_mask = opts.subsys_mask & ~root->subsys_mask;
				1061	removed_mask = root->subsys_mask & ~opts.subsys_mask;
				1062
				1063	/* Don't allow flags or name to change at remount */
				1064	if ((opts.flags ^ root->flags) \|\|
				1065	(opts.name && strcmp(opts.name, root->name))) {
				1066	pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
				1067	opts.flags, opts.name ?: "", root->flags, root->name);
				1068	ret = -EINVAL;
				1069	goto out_unlock;
				1070	}
				1071
				1072	/* remounting is not allowed for populated hierarchies */
				1073	if (!list_empty(&root->cgrp.self.children)) {
				1074	ret = -EBUSY;
				1075	goto out_unlock;
				1076	}
				1077
				1078	ret = rebind_subsystems(root, added_mask);
				1079	if (ret)
				1080	goto out_unlock;
				1081
				1082	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
				1083
				1084	if (opts.release_agent) {
				1085	spin_lock(&release_agent_path_lock);
				1086	strcpy(root->release_agent_path, opts.release_agent);
				1087	spin_unlock(&release_agent_path_lock);
				1088	}
				1089
				1090	trace_cgroup_remount(root);
				1091
				1092	out_unlock:
				1093	kfree(opts.release_agent);
				1094	kfree(opts.name);
				1095	mutex_unlock(&cgroup_mutex);
				1096	return ret;
				1097	}
				1098
				1099	struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
				1100	.rename = cgroup1_rename,
				1101	.show_options = cgroup1_show_options,
				1102	.remount_fs = cgroup1_remount,
				1103	.mkdir = cgroup_mkdir,
				1104	.rmdir = cgroup_rmdir,
				1105	.show_path = cgroup_show_path,
				1106	};
				1107
				1108	struct dentry cgroup1_mount(struct file_system_type fs_type, int flags,
				1109	void *data, unsigned long magic,
				1110	struct cgroup_namespace *ns)
				1111	{
				1112	struct super_block *pinned_sb = NULL;
				1113	struct cgroup_sb_opts opts;
				1114	struct cgroup_root *root;
				1115	struct cgroup_subsys *ss;
				1116	struct dentry *dentry;
				1117	int i, ret;
Zefan Li	9732adc	2017-04-19 10:15:59 +0800	[diff] [blame]	1118	bool new_root = false;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1119
				1120	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
				1121
				1122	/* First find the desired set of subsystems */
				1123	ret = parse_cgroupfs_options(data, &opts);
				1124	if (ret)
				1125	goto out_unlock;
				1126
				1127	/*
				1128	* Destruction of cgroup root is asynchronous, so subsystems may
				1129	* still be dying after the previous unmount. Let's drain the
				1130	* dying subsystems. We just need to ensure that the ones
				1131	* unmounted previously finish dying and don't care about new ones
				1132	* starting. Testing ref liveliness is good enough.
				1133	*/
				1134	for_each_subsys(ss, i) {
				1135	if (!(opts.subsys_mask & (1 << i)) \|\|
				1136	ss->root == &cgrp_dfl_root)
				1137	continue;
				1138
				1139	if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
				1140	mutex_unlock(&cgroup_mutex);
				1141	msleep(10);
				1142	ret = restart_syscall();
				1143	goto out_free;
				1144	}
				1145	cgroup_put(&ss->root->cgrp);
				1146	}
				1147
				1148	for_each_root(root) {
				1149	bool name_match = false;
				1150
				1151	if (root == &cgrp_dfl_root)
				1152	continue;
				1153
				1154	/*
				1155	* If we asked for a name then it must match. Also, if
				1156	* name matches but sybsys_mask doesn't, we should fail.
				1157	* Remember whether name matched.
				1158	*/
				1159	if (opts.name) {
				1160	if (strcmp(opts.name, root->name))
				1161	continue;
				1162	name_match = true;
				1163	}
				1164
				1165	/*
				1166	* If we asked for subsystems (or explicitly for no
				1167	* subsystems) then they must match.
				1168	*/
				1169	if ((opts.subsys_mask \|\| opts.none) &&
				1170	(opts.subsys_mask != root->subsys_mask)) {
				1171	if (!name_match)
				1172	continue;
				1173	ret = -EBUSY;
				1174	goto out_unlock;
				1175	}
				1176
				1177	if (root->flags ^ opts.flags)
				1178	pr_warn("new mount options do not match the existing superblock, will be ignored\n");
				1179
				1180	/*
				1181	* We want to reuse @root whose lifetime is governed by its
				1182	* ->cgrp. Let's check whether @root is alive and keep it
				1183	* that way. As cgroup_kill_sb() can happen anytime, we
				1184	* want to block it by pinning the sb so that @root doesn't
				1185	* get killed before mount is complete.
				1186	*
				1187	* With the sb pinned, tryget_live can reliably indicate
				1188	* whether @root can be reused. If it's being killed,
				1189	* drain it. We can use wait_queue for the wait but this
				1190	* path is super cold. Let's just sleep a bit and retry.
				1191	*/
				1192	pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
Tejun Heo	330c418	2017-04-16 23:17:37 +0900	[diff] [blame]	1193	if (IS_ERR(pinned_sb) \|\|
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1194	!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
				1195	mutex_unlock(&cgroup_mutex);
				1196	if (!IS_ERR_OR_NULL(pinned_sb))
				1197	deactivate_super(pinned_sb);
				1198	msleep(10);
				1199	ret = restart_syscall();
				1200	goto out_free;
				1201	}
				1202
				1203	ret = 0;
				1204	goto out_unlock;
				1205	}
				1206
				1207	/*
				1208	* No such thing, create a new one. name= matching without subsys
				1209	* specification is allowed for already existing hierarchies but we
				1210	* can't create new one without subsys specification.
				1211	*/
				1212	if (!opts.subsys_mask && !opts.none) {
				1213	ret = -EINVAL;
				1214	goto out_unlock;
				1215	}
				1216
				1217	/* Hierarchies may only be created in the initial cgroup namespace. */
				1218	if (ns != &init_cgroup_ns) {
				1219	ret = -EPERM;
				1220	goto out_unlock;
				1221	}
				1222
				1223	root = kzalloc(sizeof(*root), GFP_KERNEL);
				1224	if (!root) {
				1225	ret = -ENOMEM;
				1226	goto out_unlock;
				1227	}
Zefan Li	9732adc	2017-04-19 10:15:59 +0800	[diff] [blame]	1228	new_root = true;
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1229
				1230	init_cgroup_root(root, &opts);
				1231
Zefan Li	9732adc	2017-04-19 10:15:59 +0800	[diff] [blame]	1232	ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1233	if (ret)
				1234	cgroup_free_root(root);
				1235
				1236	out_unlock:
				1237	mutex_unlock(&cgroup_mutex);
				1238	out_free:
				1239	kfree(opts.release_agent);
				1240	kfree(opts.name);
				1241
				1242	if (ret)
				1243	return ERR_PTR(ret);
				1244
				1245	dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
				1246	CGROUP_SUPER_MAGIC, ns);
				1247
				1248	/*
Zefan Li	9732adc	2017-04-19 10:15:59 +0800	[diff] [blame]	1249	* There's a race window after we release cgroup_mutex and before
				1250	* allocating a superblock. Make sure a concurrent process won't
				1251	* be able to re-use the root during this window by delaying the
				1252	* initialization of root refcnt.
				1253	*/
				1254	if (new_root) {
				1255	mutex_lock(&cgroup_mutex);
				1256	percpu_ref_reinit(&root->cgrp.self.refcnt);
				1257	mutex_unlock(&cgroup_mutex);
				1258	}
				1259
				1260	/*
Tejun Heo	1592c9b	2016-12-27 14:49:08 -0500	[diff] [blame]	1261	* If @pinned_sb, we're reusing an existing root and holding an
				1262	* extra ref on its sb. Mount is complete. Put the extra ref.
				1263	*/
				1264	if (pinned_sb)
				1265	deactivate_super(pinned_sb);
				1266
				1267	return dentry;
				1268	}
				1269
Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame]	1270	static int __init cgroup1_wq_init(void)
				1271	{
				1272	/*
				1273	* Used to destroy pidlists and separate to serve as flush domain.
				1274	* Cap @max_active to 1 too.
				1275	*/
				1276	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
				1277	0, 1);
				1278	BUG_ON(!cgroup_pidlist_destroy_wq);
				1279	return 0;
				1280	}
				1281	core_initcall(cgroup1_wq_init);
				1282
				1283	static int __init cgroup_no_v1(char *str)
				1284	{
				1285	struct cgroup_subsys *ss;
				1286	char *token;
				1287	int i;
				1288
				1289	while ((token = strsep(&str, ",")) != NULL) {
				1290	if (!*token)
				1291	continue;
				1292
				1293	if (!strcmp(token, "all")) {
				1294	cgroup_no_v1_mask = U16_MAX;
				1295	break;
				1296	}
				1297
				1298	for_each_subsys(ss, i) {
				1299	if (strcmp(token, ss->name) &&
				1300	strcmp(token, ss->legacy_name))
				1301	continue;
				1302
				1303	cgroup_no_v1_mask \|= 1 << i;
				1304	}
				1305	}
				1306	return 1;
				1307	}
				1308	__setup("cgroup_no_v1=", cgroup_no_v1);