Blame - kernel/cgroup/cgroup-v1.c - SHIFTPHONES/mainline/linux

blob: 7af745a46f917b214e924f08f958e140d45297a2 [file] [log] [blame]

Tejun Heo	0a268db	2016-12-27 14:49:06 -0500	[diff] [blame^]	1	#include "cgroup-internal.h"
				2
				3	#include <linux/kmod.h>
				4	#include <linux/sort.h>
				5	#include <linux/mm.h>
				6	#include <linux/slab.h>
				7	#include <linux/vmalloc.h>
				8	#include <linux/delayacct.h>
				9	#include <linux/pid_namespace.h>
				10	#include <linux/cgroupstats.h>
				11
				12	#include <trace/events/cgroup.h>
				13
				14	/*
				15	* pidlists linger the following amount before being destroyed. The goal
				16	* is avoiding frequent destruction in the middle of consecutive read calls
				17	* Expiring in the middle is a performance problem not a correctness one.
				18	* 1 sec should be enough.
				19	*/
				20	#define CGROUP_PIDLIST_DESTROY_DELAY HZ
				21
				22	/* Controllers blocked by the commandline in v1 */
				23	static u16 cgroup_no_v1_mask;
				24
				25	/*
				26	* pidlist destructions need to be flushed on cgroup destruction. Use a
				27	* separate workqueue as flush domain.
				28	*/
				29	static struct workqueue_struct *cgroup_pidlist_destroy_wq;
				30
				31	/*
				32	* Protects cgroup_subsys->release_agent_path. Modifying it also requires
				33	* cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
				34	*/
				35	DEFINE_SPINLOCK(release_agent_path_lock);
				36
				37	bool cgroup_ssid_no_v1(int ssid)
				38	{
				39	return cgroup_no_v1_mask & (1 << ssid);
				40	}
				41
				42	/**
				43	* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
				44	* @from: attach to all cgroups of a given task
				45	* @tsk: the task to be attached
				46	*/
				47	int cgroup_attach_task_all(struct task_struct from, struct task_struct tsk)
				48	{
				49	struct cgroup_root *root;
				50	int retval = 0;
				51
				52	mutex_lock(&cgroup_mutex);
				53	percpu_down_write(&cgroup_threadgroup_rwsem);
				54	for_each_root(root) {
				55	struct cgroup *from_cgrp;
				56
				57	if (root == &cgrp_dfl_root)
				58	continue;
				59
				60	spin_lock_irq(&css_set_lock);
				61	from_cgrp = task_cgroup_from_root(from, root);
				62	spin_unlock_irq(&css_set_lock);
				63
				64	retval = cgroup_attach_task(from_cgrp, tsk, false);
				65	if (retval)
				66	break;
				67	}
				68	percpu_up_write(&cgroup_threadgroup_rwsem);
				69	mutex_unlock(&cgroup_mutex);
				70
				71	return retval;
				72	}
				73	EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
				74
				75	/**
				76	* cgroup_trasnsfer_tasks - move tasks from one cgroup to another
				77	* @to: cgroup to which the tasks will be moved
				78	* @from: cgroup in which the tasks currently reside
				79	*
				80	* Locking rules between cgroup_post_fork() and the migration path
				81	* guarantee that, if a task is forking while being migrated, the new child
				82	* is guaranteed to be either visible in the source cgroup after the
				83	* parent's migration is complete or put into the target cgroup. No task
				84	* can slip out of migration through forking.
				85	*/
				86	int cgroup_transfer_tasks(struct cgroup to, struct cgroup from)
				87	{
				88	LIST_HEAD(preloaded_csets);
				89	struct cgrp_cset_link *link;
				90	struct css_task_iter it;
				91	struct task_struct *task;
				92	int ret;
				93
				94	if (cgroup_on_dfl(to))
				95	return -EINVAL;
				96
				97	if (!cgroup_may_migrate_to(to))
				98	return -EBUSY;
				99
				100	mutex_lock(&cgroup_mutex);
				101
				102	percpu_down_write(&cgroup_threadgroup_rwsem);
				103
				104	/* all tasks in @from are being moved, all csets are source */
				105	spin_lock_irq(&css_set_lock);
				106	list_for_each_entry(link, &from->cset_links, cset_link)
				107	cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
				108	spin_unlock_irq(&css_set_lock);
				109
				110	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
				111	if (ret)
				112	goto out_err;
				113
				114	/*
				115	* Migrate tasks one-by-one until @from is empty. This fails iff
				116	* ->can_attach() fails.
				117	*/
				118	do {
				119	css_task_iter_start(&from->self, &it);
				120	task = css_task_iter_next(&it);
				121	if (task)
				122	get_task_struct(task);
				123	css_task_iter_end(&it);
				124
				125	if (task) {
				126	ret = cgroup_migrate(task, false, to->root);
				127	if (!ret)
				128	trace_cgroup_transfer_tasks(to, task, false);
				129	put_task_struct(task);
				130	}
				131	} while (task && !ret);
				132	out_err:
				133	cgroup_migrate_finish(&preloaded_csets);
				134	percpu_up_write(&cgroup_threadgroup_rwsem);
				135	mutex_unlock(&cgroup_mutex);
				136	return ret;
				137	}
				138
				139	/*
				140	* Stuff for reading the 'tasks'/'procs' files.
				141	*
				142	* Reading this file can return large amounts of data if a cgroup has
				143	* lots of attached tasks. So it may need several calls to read(),
				144	* but we cannot guarantee that the information we produce is correct
				145	* unless we produce it entirely atomically.
				146	*
				147	*/
				148
				149	/* which pidlist file are we talking about? */
				150	enum cgroup_filetype {
				151	CGROUP_FILE_PROCS,
				152	CGROUP_FILE_TASKS,
				153	};
				154
				155	/*
				156	* A pidlist is a list of pids that virtually represents the contents of one
				157	* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
				158	* a pair (one each for procs, tasks) for each pid namespace that's relevant
				159	* to the cgroup.
				160	*/
				161	struct cgroup_pidlist {
				162	/*
				163	* used to find which pidlist is wanted. doesn't change as long as
				164	* this particular list stays in the list.
				165	*/
				166	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
				167	/* array of xids */
				168	pid_t *list;
				169	/* how many elements the above list has */
				170	int length;
				171	/* each of these stored in a list by its cgroup */
				172	struct list_head links;
				173	/* pointer to the cgroup we belong to, for list removal purposes */
				174	struct cgroup *owner;
				175	/* for delayed destruction */
				176	struct delayed_work destroy_dwork;
				177	};
				178
				179	/*
				180	* The following two functions "fix" the issue where there are more pids
				181	* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
				182	* TODO: replace with a kernel-wide solution to this problem
				183	*/
				184	#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
				185	static void *pidlist_allocate(int count)
				186	{
				187	if (PIDLIST_TOO_LARGE(count))
				188	return vmalloc(count * sizeof(pid_t));
				189	else
				190	return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
				191	}
				192
				193	static void pidlist_free(void *p)
				194	{
				195	kvfree(p);
				196	}
				197
				198	/*
				199	* Used to destroy all pidlists lingering waiting for destroy timer. None
				200	* should be left afterwards.
				201	*/
				202	void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
				203	{
				204	struct cgroup_pidlist l, tmp_l;
				205
				206	mutex_lock(&cgrp->pidlist_mutex);
				207	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
				208	mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
				209	mutex_unlock(&cgrp->pidlist_mutex);
				210
				211	flush_workqueue(cgroup_pidlist_destroy_wq);
				212	BUG_ON(!list_empty(&cgrp->pidlists));
				213	}
				214
				215	static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
				216	{
				217	struct delayed_work *dwork = to_delayed_work(work);
				218	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
				219	destroy_dwork);
				220	struct cgroup_pidlist *tofree = NULL;
				221
				222	mutex_lock(&l->owner->pidlist_mutex);
				223
				224	/*
				225	* Destroy iff we didn't get queued again. The state won't change
				226	* as destroy_dwork can only be queued while locked.
				227	*/
				228	if (!delayed_work_pending(dwork)) {
				229	list_del(&l->links);
				230	pidlist_free(l->list);
				231	put_pid_ns(l->key.ns);
				232	tofree = l;
				233	}
				234
				235	mutex_unlock(&l->owner->pidlist_mutex);
				236	kfree(tofree);
				237	}
				238
				239	/*
				240	* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
				241	* Returns the number of unique elements.
				242	*/
				243	static int pidlist_uniq(pid_t *list, int length)
				244	{
				245	int src, dest = 1;
				246
				247	/*
				248	* we presume the 0th element is unique, so i starts at 1. trivial
				249	* edge cases first; no work needs to be done for either
				250	*/
				251	if (length == 0 \|\| length == 1)
				252	return length;
				253	/* src and dest walk down the list; dest counts unique elements */
				254	for (src = 1; src < length; src++) {
				255	/* find next unique element */
				256	while (list[src] == list[src-1]) {
				257	src++;
				258	if (src == length)
				259	goto after;
				260	}
				261	/* dest always points to where the next unique element goes */
				262	list[dest] = list[src];
				263	dest++;
				264	}
				265	after:
				266	return dest;
				267	}
				268
				269	/*
				270	* The two pid files - task and cgroup.procs - guaranteed that the result
				271	* is sorted, which forced this whole pidlist fiasco. As pid order is
				272	* different per namespace, each namespace needs differently sorted list,
				273	* making it impossible to use, for example, single rbtree of member tasks
				274	* sorted by task pointer. As pidlists can be fairly large, allocating one
				275	* per open file is dangerous, so cgroup had to implement shared pool of
				276	* pidlists keyed by cgroup and namespace.
				277	*/
				278	static int cmppid(const void a, const void b)
				279	{
				280	return (pid_t )a - (pid_t )b;
				281	}
				282
				283	static struct cgroup_pidlist cgroup_pidlist_find(struct cgroup cgrp,
				284	enum cgroup_filetype type)
				285	{
				286	struct cgroup_pidlist *l;
				287	/* don't need task_nsproxy() if we're looking at ourself */
				288	struct pid_namespace *ns = task_active_pid_ns(current);
				289
				290	lockdep_assert_held(&cgrp->pidlist_mutex);
				291
				292	list_for_each_entry(l, &cgrp->pidlists, links)
				293	if (l->key.type == type && l->key.ns == ns)
				294	return l;
				295	return NULL;
				296	}
				297
				298	/*
				299	* find the appropriate pidlist for our purpose (given procs vs tasks)
				300	* returns with the lock on that pidlist already held, and takes care
				301	* of the use count, or returns NULL with no locks held if we're out of
				302	* memory.
				303	*/
				304	static struct cgroup_pidlist cgroup_pidlist_find_create(struct cgroup cgrp,
				305	enum cgroup_filetype type)
				306	{
				307	struct cgroup_pidlist *l;
				308
				309	lockdep_assert_held(&cgrp->pidlist_mutex);
				310
				311	l = cgroup_pidlist_find(cgrp, type);
				312	if (l)
				313	return l;
				314
				315	/* entry not found; create a new one */
				316	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
				317	if (!l)
				318	return l;
				319
				320	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
				321	l->key.type = type;
				322	/* don't need task_nsproxy() if we're looking at ourself */
				323	l->key.ns = get_pid_ns(task_active_pid_ns(current));
				324	l->owner = cgrp;
				325	list_add(&l->links, &cgrp->pidlists);
				326	return l;
				327	}
				328
				329	/**
				330	* cgroup_task_count - count the number of tasks in a cgroup.
				331	* @cgrp: the cgroup in question
				332	*
				333	* Return the number of tasks in the cgroup. The returned number can be
				334	* higher than the actual number of tasks due to css_set references from
				335	* namespace roots and temporary usages.
				336	*/
				337	static int cgroup_task_count(const struct cgroup *cgrp)
				338	{
				339	int count = 0;
				340	struct cgrp_cset_link *link;
				341
				342	spin_lock_irq(&css_set_lock);
				343	list_for_each_entry(link, &cgrp->cset_links, cset_link)
				344	count += atomic_read(&link->cset->refcount);
				345	spin_unlock_irq(&css_set_lock);
				346	return count;
				347	}
				348
				349	/*
				350	* Load a cgroup's pidarray with either procs' tgids or tasks' pids
				351	*/
				352	static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
				353	struct cgroup_pidlist **lp)
				354	{
				355	pid_t *array;
				356	int length;
				357	int pid, n = 0; /* used for populating the array */
				358	struct css_task_iter it;
				359	struct task_struct *tsk;
				360	struct cgroup_pidlist *l;
				361
				362	lockdep_assert_held(&cgrp->pidlist_mutex);
				363
				364	/*
				365	* If cgroup gets more users after we read count, we won't have
				366	* enough space - tough. This race is indistinguishable to the
				367	* caller from the case that the additional cgroup users didn't
				368	* show up until sometime later on.
				369	*/
				370	length = cgroup_task_count(cgrp);
				371	array = pidlist_allocate(length);
				372	if (!array)
				373	return -ENOMEM;
				374	/* now, populate the array */
				375	css_task_iter_start(&cgrp->self, &it);
				376	while ((tsk = css_task_iter_next(&it))) {
				377	if (unlikely(n == length))
				378	break;
				379	/* get tgid or pid for procs or tasks file respectively */
				380	if (type == CGROUP_FILE_PROCS)
				381	pid = task_tgid_vnr(tsk);
				382	else
				383	pid = task_pid_vnr(tsk);
				384	if (pid > 0) /* make sure to only use valid results */
				385	array[n++] = pid;
				386	}
				387	css_task_iter_end(&it);
				388	length = n;
				389	/* now sort & (if procs) strip out duplicates */
				390	sort(array, length, sizeof(pid_t), cmppid, NULL);
				391	if (type == CGROUP_FILE_PROCS)
				392	length = pidlist_uniq(array, length);
				393
				394	l = cgroup_pidlist_find_create(cgrp, type);
				395	if (!l) {
				396	pidlist_free(array);
				397	return -ENOMEM;
				398	}
				399
				400	/* store array, freeing old if necessary */
				401	pidlist_free(l->list);
				402	l->list = array;
				403	l->length = length;
				404	*lp = l;
				405	return 0;
				406	}
				407
				408	/*
				409	* seq_file methods for the tasks/procs files. The seq_file position is the
				410	* next pid to display; the seq_file iterator is a pointer to the pid
				411	* in the cgroup->l->list array.
				412	*/
				413
				414	static void cgroup_pidlist_start(struct seq_file s, loff_t *pos)
				415	{
				416	/*
				417	* Initially we receive a position value that corresponds to
				418	* one more than the last pid shown (or 0 on the first call or
				419	* after a seek to the start). Use a binary-search to find the
				420	* next pid to display, if any
				421	*/
				422	struct kernfs_open_file *of = s->private;
				423	struct cgroup *cgrp = seq_css(s)->cgroup;
				424	struct cgroup_pidlist *l;
				425	enum cgroup_filetype type = seq_cft(s)->private;
				426	int index = 0, pid = *pos;
				427	int *iter, ret;
				428
				429	mutex_lock(&cgrp->pidlist_mutex);
				430
				431	/*
				432	* !NULL @of->priv indicates that this isn't the first start()
				433	* after open. If the matching pidlist is around, we can use that.
				434	* Look for it. Note that @of->priv can't be used directly. It
				435	* could already have been destroyed.
				436	*/
				437	if (of->priv)
				438	of->priv = cgroup_pidlist_find(cgrp, type);
				439
				440	/*
				441	* Either this is the first start() after open or the matching
				442	* pidlist has been destroyed inbetween. Create a new one.
				443	*/
				444	if (!of->priv) {
				445	ret = pidlist_array_load(cgrp, type,
				446	(struct cgroup_pidlist **)&of->priv);
				447	if (ret)
				448	return ERR_PTR(ret);
				449	}
				450	l = of->priv;
				451
				452	if (pid) {
				453	int end = l->length;
				454
				455	while (index < end) {
				456	int mid = (index + end) / 2;
				457	if (l->list[mid] == pid) {
				458	index = mid;
				459	break;
				460	} else if (l->list[mid] <= pid)
				461	index = mid + 1;
				462	else
				463	end = mid;
				464	}
				465	}
				466	/* If we're off the end of the array, we're done */
				467	if (index >= l->length)
				468	return NULL;
				469	/* Update the abstract position to be the actual pid that we found */
				470	iter = l->list + index;
				471	pos = iter;
				472	return iter;
				473	}
				474
				475	static void cgroup_pidlist_stop(struct seq_file s, void v)
				476	{
				477	struct kernfs_open_file *of = s->private;
				478	struct cgroup_pidlist *l = of->priv;
				479
				480	if (l)
				481	mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
				482	CGROUP_PIDLIST_DESTROY_DELAY);
				483	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
				484	}
				485
				486	static void cgroup_pidlist_next(struct seq_file s, void v, loff_t pos)
				487	{
				488	struct kernfs_open_file *of = s->private;
				489	struct cgroup_pidlist *l = of->priv;
				490	pid_t *p = v;
				491	pid_t *end = l->list + l->length;
				492	/*
				493	* Advance to the next pid in the array. If this goes off the
				494	* end, we're done
				495	*/
				496	p++;
				497	if (p >= end) {
				498	return NULL;
				499	} else {
				500	pos = p;
				501	return p;
				502	}
				503	}
				504
				505	static int cgroup_pidlist_show(struct seq_file s, void v)
				506	{
				507	seq_printf(s, "%d\n", (int )v);
				508
				509	return 0;
				510	}
				511
				512	static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
				513	char *buf, size_t nbytes, loff_t off)
				514	{
				515	return __cgroup_procs_write(of, buf, nbytes, off, false);
				516	}
				517
				518	static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
				519	char *buf, size_t nbytes, loff_t off)
				520	{
				521	struct cgroup *cgrp;
				522
				523	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
				524
				525	cgrp = cgroup_kn_lock_live(of->kn, false);
				526	if (!cgrp)
				527	return -ENODEV;
				528	spin_lock(&release_agent_path_lock);
				529	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
				530	sizeof(cgrp->root->release_agent_path));
				531	spin_unlock(&release_agent_path_lock);
				532	cgroup_kn_unlock(of->kn);
				533	return nbytes;
				534	}
				535
				536	static int cgroup_release_agent_show(struct seq_file seq, void v)
				537	{
				538	struct cgroup *cgrp = seq_css(seq)->cgroup;
				539
				540	spin_lock(&release_agent_path_lock);
				541	seq_puts(seq, cgrp->root->release_agent_path);
				542	spin_unlock(&release_agent_path_lock);
				543	seq_putc(seq, '\n');
				544	return 0;
				545	}
				546
				547	static int cgroup_sane_behavior_show(struct seq_file seq, void v)
				548	{
				549	seq_puts(seq, "0\n");
				550	return 0;
				551	}
				552
				553	static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
				554	struct cftype *cft)
				555	{
				556	return notify_on_release(css->cgroup);
				557	}
				558
				559	static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
				560	struct cftype *cft, u64 val)
				561	{
				562	if (val)
				563	set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
				564	else
				565	clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
				566	return 0;
				567	}
				568
				569	static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
				570	struct cftype *cft)
				571	{
				572	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
				573	}
				574
				575	static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
				576	struct cftype *cft, u64 val)
				577	{
				578	if (val)
				579	set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
				580	else
				581	clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
				582	return 0;
				583	}
				584
				585	/* cgroup core interface files for the legacy hierarchies */
				586	struct cftype cgroup_legacy_base_files[] = {
				587	{
				588	.name = "cgroup.procs",
				589	.seq_start = cgroup_pidlist_start,
				590	.seq_next = cgroup_pidlist_next,
				591	.seq_stop = cgroup_pidlist_stop,
				592	.seq_show = cgroup_pidlist_show,
				593	.private = CGROUP_FILE_PROCS,
				594	.write = cgroup_procs_write,
				595	},
				596	{
				597	.name = "cgroup.clone_children",
				598	.read_u64 = cgroup_clone_children_read,
				599	.write_u64 = cgroup_clone_children_write,
				600	},
				601	{
				602	.name = "cgroup.sane_behavior",
				603	.flags = CFTYPE_ONLY_ON_ROOT,
				604	.seq_show = cgroup_sane_behavior_show,
				605	},
				606	{
				607	.name = "tasks",
				608	.seq_start = cgroup_pidlist_start,
				609	.seq_next = cgroup_pidlist_next,
				610	.seq_stop = cgroup_pidlist_stop,
				611	.seq_show = cgroup_pidlist_show,
				612	.private = CGROUP_FILE_TASKS,
				613	.write = cgroup_tasks_write,
				614	},
				615	{
				616	.name = "notify_on_release",
				617	.read_u64 = cgroup_read_notify_on_release,
				618	.write_u64 = cgroup_write_notify_on_release,
				619	},
				620	{
				621	.name = "release_agent",
				622	.flags = CFTYPE_ONLY_ON_ROOT,
				623	.seq_show = cgroup_release_agent_show,
				624	.write = cgroup_release_agent_write,
				625	.max_write_len = PATH_MAX - 1,
				626	},
				627	{ } /* terminate */
				628	};
				629
				630	/* Display information about each subsystem and each hierarchy */
				631	static int proc_cgroupstats_show(struct seq_file m, void v)
				632	{
				633	struct cgroup_subsys *ss;
				634	int i;
				635
				636	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
				637	/*
				638	* ideally we don't want subsystems moving around while we do this.
				639	* cgroup_mutex is also necessary to guarantee an atomic snapshot of
				640	* subsys/hierarchy state.
				641	*/
				642	mutex_lock(&cgroup_mutex);
				643
				644	for_each_subsys(ss, i)
				645	seq_printf(m, "%s\t%d\t%d\t%d\n",
				646	ss->legacy_name, ss->root->hierarchy_id,
				647	atomic_read(&ss->root->nr_cgrps),
				648	cgroup_ssid_enabled(i));
				649
				650	mutex_unlock(&cgroup_mutex);
				651	return 0;
				652	}
				653
				654	static int cgroupstats_open(struct inode inode, struct file file)
				655	{
				656	return single_open(file, proc_cgroupstats_show, NULL);
				657	}
				658
				659	const struct file_operations proc_cgroupstats_operations = {
				660	.open = cgroupstats_open,
				661	.read = seq_read,
				662	.llseek = seq_lseek,
				663	.release = single_release,
				664	};
				665
				666	/**
				667	* cgroupstats_build - build and fill cgroupstats
				668	* @stats: cgroupstats to fill information into
				669	* @dentry: A dentry entry belonging to the cgroup for which stats have
				670	* been requested.
				671	*
				672	* Build and fill cgroupstats so that taskstats can export it to user
				673	* space.
				674	*/
				675	int cgroupstats_build(struct cgroupstats stats, struct dentry dentry)
				676	{
				677	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
				678	struct cgroup *cgrp;
				679	struct css_task_iter it;
				680	struct task_struct *tsk;
				681
				682	/* it should be kernfs_node belonging to cgroupfs and is a directory */
				683	if (dentry->d_sb->s_type != &cgroup_fs_type \|\| !kn \|\|
				684	kernfs_type(kn) != KERNFS_DIR)
				685	return -EINVAL;
				686
				687	mutex_lock(&cgroup_mutex);
				688
				689	/*
				690	* We aren't being called from kernfs and there's no guarantee on
				691	* @kn->priv's validity. For this and css_tryget_online_from_dir(),
				692	* @kn->priv is RCU safe. Let's do the RCU dancing.
				693	*/
				694	rcu_read_lock();
				695	cgrp = rcu_dereference(kn->priv);
				696	if (!cgrp \|\| cgroup_is_dead(cgrp)) {
				697	rcu_read_unlock();
				698	mutex_unlock(&cgroup_mutex);
				699	return -ENOENT;
				700	}
				701	rcu_read_unlock();
				702
				703	css_task_iter_start(&cgrp->self, &it);
				704	while ((tsk = css_task_iter_next(&it))) {
				705	switch (tsk->state) {
				706	case TASK_RUNNING:
				707	stats->nr_running++;
				708	break;
				709	case TASK_INTERRUPTIBLE:
				710	stats->nr_sleeping++;
				711	break;
				712	case TASK_UNINTERRUPTIBLE:
				713	stats->nr_uninterruptible++;
				714	break;
				715	case TASK_STOPPED:
				716	stats->nr_stopped++;
				717	break;
				718	default:
				719	if (delayacct_is_task_waiting_on_io(tsk))
				720	stats->nr_io_wait++;
				721	break;
				722	}
				723	}
				724	css_task_iter_end(&it);
				725
				726	mutex_unlock(&cgroup_mutex);
				727	return 0;
				728	}
				729
				730	void check_for_release(struct cgroup *cgrp)
				731	{
				732	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
				733	!css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
				734	schedule_work(&cgrp->release_agent_work);
				735	}
				736
				737	/*
				738	* Notify userspace when a cgroup is released, by running the
				739	* configured release agent with the name of the cgroup (path
				740	* relative to the root of cgroup file system) as the argument.
				741	*
				742	* Most likely, this user command will try to rmdir this cgroup.
				743	*
				744	* This races with the possibility that some other task will be
				745	* attached to this cgroup before it is removed, or that some other
				746	* user task will 'mkdir' a child cgroup of this cgroup. That's ok.
				747	* The presumed 'rmdir' will fail quietly if this cgroup is no longer
				748	* unused, and this cgroup will be reprieved from its death sentence,
				749	* to continue to serve a useful existence. Next time it's released,
				750	* we will get notified again, if it still has 'notify_on_release' set.
				751	*
				752	* The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
				753	* means only wait until the task is successfully execve()'d. The
				754	* separate release agent task is forked by call_usermodehelper(),
				755	* then control in this thread returns here, without waiting for the
				756	* release agent task. We don't bother to wait because the caller of
				757	* this routine has no use for the exit status of the release agent
				758	* task, so no sense holding our caller up for that.
				759	*/
				760	void cgroup_release_agent(struct work_struct *work)
				761	{
				762	struct cgroup *cgrp =
				763	container_of(work, struct cgroup, release_agent_work);
				764	char pathbuf = NULL, agentbuf = NULL;
				765	char argv[3], envp[3];
				766	int ret;
				767
				768	mutex_lock(&cgroup_mutex);
				769
				770	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
				771	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
				772	if (!pathbuf \|\| !agentbuf)
				773	goto out;
				774
				775	spin_lock_irq(&css_set_lock);
				776	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
				777	spin_unlock_irq(&css_set_lock);
				778	if (ret < 0 \|\| ret >= PATH_MAX)
				779	goto out;
				780
				781	argv[0] = agentbuf;
				782	argv[1] = pathbuf;
				783	argv[2] = NULL;
				784
				785	/* minimal command environment */
				786	envp[0] = "HOME=/";
				787	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
				788	envp[2] = NULL;
				789
				790	mutex_unlock(&cgroup_mutex);
				791	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
				792	goto out_free;
				793	out:
				794	mutex_unlock(&cgroup_mutex);
				795	out_free:
				796	kfree(agentbuf);
				797	kfree(pathbuf);
				798	}
				799
				800	/*
				801	* cgroup_rename - Only allow simple rename of directories in place.
				802	*/
				803	int cgroup_rename(struct kernfs_node kn, struct kernfs_node new_parent,
				804	const char *new_name_str)
				805	{
				806	struct cgroup *cgrp = kn->priv;
				807	int ret;
				808
				809	if (kernfs_type(kn) != KERNFS_DIR)
				810	return -ENOTDIR;
				811	if (kn->parent != new_parent)
				812	return -EIO;
				813
				814	/*
				815	* This isn't a proper migration and its usefulness is very
				816	* limited. Disallow on the default hierarchy.
				817	*/
				818	if (cgroup_on_dfl(cgrp))
				819	return -EPERM;
				820
				821	/*
				822	* We're gonna grab cgroup_mutex which nests outside kernfs
				823	* active_ref. kernfs_rename() doesn't require active_ref
				824	* protection. Break them before grabbing cgroup_mutex.
				825	*/
				826	kernfs_break_active_protection(new_parent);
				827	kernfs_break_active_protection(kn);
				828
				829	mutex_lock(&cgroup_mutex);
				830
				831	ret = kernfs_rename(kn, new_parent, new_name_str);
				832	if (!ret)
				833	trace_cgroup_rename(cgrp);
				834
				835	mutex_unlock(&cgroup_mutex);
				836
				837	kernfs_unbreak_active_protection(kn);
				838	kernfs_unbreak_active_protection(new_parent);
				839	return ret;
				840	}
				841
				842	static int __init cgroup1_wq_init(void)
				843	{
				844	/*
				845	* Used to destroy pidlists and separate to serve as flush domain.
				846	* Cap @max_active to 1 too.
				847	*/
				848	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
				849	0, 1);
				850	BUG_ON(!cgroup_pidlist_destroy_wq);
				851	return 0;
				852	}
				853	core_initcall(cgroup1_wq_init);
				854
				855	static int __init cgroup_no_v1(char *str)
				856	{
				857	struct cgroup_subsys *ss;
				858	char *token;
				859	int i;
				860
				861	while ((token = strsep(&str, ",")) != NULL) {
				862	if (!*token)
				863	continue;
				864
				865	if (!strcmp(token, "all")) {
				866	cgroup_no_v1_mask = U16_MAX;
				867	break;
				868	}
				869
				870	for_each_subsys(ss, i) {
				871	if (strcmp(token, ss->name) &&
				872	strcmp(token, ss->legacy_name))
				873	continue;
				874
				875	cgroup_no_v1_mask \|= 1 << i;
				876	}
				877	}
				878	return 1;
				879	}
				880	__setup("cgroup_no_v1=", cgroup_no_v1);
				881
				882
				883	#ifdef CONFIG_CGROUP_DEBUG
				884	static struct cgroup_subsys_state *
				885	debug_css_alloc(struct cgroup_subsys_state *parent_css)
				886	{
				887	struct cgroup_subsys_state css = kzalloc(sizeof(css), GFP_KERNEL);
				888
				889	if (!css)
				890	return ERR_PTR(-ENOMEM);
				891
				892	return css;
				893	}
				894
				895	static void debug_css_free(struct cgroup_subsys_state *css)
				896	{
				897	kfree(css);
				898	}
				899
				900	static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
				901	struct cftype *cft)
				902	{
				903	return cgroup_task_count(css->cgroup);
				904	}
				905
				906	static u64 current_css_set_read(struct cgroup_subsys_state *css,
				907	struct cftype *cft)
				908	{
				909	return (u64)(unsigned long)current->cgroups;
				910	}
				911
				912	static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
				913	struct cftype *cft)
				914	{
				915	u64 count;
				916
				917	rcu_read_lock();
				918	count = atomic_read(&task_css_set(current)->refcount);
				919	rcu_read_unlock();
				920	return count;
				921	}
				922
				923	static int current_css_set_cg_links_read(struct seq_file seq, void v)
				924	{
				925	struct cgrp_cset_link *link;
				926	struct css_set *cset;
				927	char *name_buf;
				928
				929	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
				930	if (!name_buf)
				931	return -ENOMEM;
				932
				933	spin_lock_irq(&css_set_lock);
				934	rcu_read_lock();
				935	cset = rcu_dereference(current->cgroups);
				936	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
				937	struct cgroup *c = link->cgrp;
				938
				939	cgroup_name(c, name_buf, NAME_MAX + 1);
				940	seq_printf(seq, "Root %d group %s\n",
				941	c->root->hierarchy_id, name_buf);
				942	}
				943	rcu_read_unlock();
				944	spin_unlock_irq(&css_set_lock);
				945	kfree(name_buf);
				946	return 0;
				947	}
				948
				949	#define MAX_TASKS_SHOWN_PER_CSS 25
				950	static int cgroup_css_links_read(struct seq_file seq, void v)
				951	{
				952	struct cgroup_subsys_state *css = seq_css(seq);
				953	struct cgrp_cset_link *link;
				954
				955	spin_lock_irq(&css_set_lock);
				956	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
				957	struct css_set *cset = link->cset;
				958	struct task_struct *task;
				959	int count = 0;
				960
				961	seq_printf(seq, "css_set %p\n", cset);
				962
				963	list_for_each_entry(task, &cset->tasks, cg_list) {
				964	if (count++ > MAX_TASKS_SHOWN_PER_CSS)
				965	goto overflow;
				966	seq_printf(seq, " task %d\n", task_pid_vnr(task));
				967	}
				968
				969	list_for_each_entry(task, &cset->mg_tasks, cg_list) {
				970	if (count++ > MAX_TASKS_SHOWN_PER_CSS)
				971	goto overflow;
				972	seq_printf(seq, " task %d\n", task_pid_vnr(task));
				973	}
				974	continue;
				975	overflow:
				976	seq_puts(seq, " ...\n");
				977	}
				978	spin_unlock_irq(&css_set_lock);
				979	return 0;
				980	}
				981
				982	static u64 releasable_read(struct cgroup_subsys_state css, struct cftype cft)
				983	{
				984	return (!cgroup_is_populated(css->cgroup) &&
				985	!css_has_online_children(&css->cgroup->self));
				986	}
				987
				988	static struct cftype debug_files[] = {
				989	{
				990	.name = "taskcount",
				991	.read_u64 = debug_taskcount_read,
				992	},
				993
				994	{
				995	.name = "current_css_set",
				996	.read_u64 = current_css_set_read,
				997	},
				998
				999	{
				1000	.name = "current_css_set_refcount",
				1001	.read_u64 = current_css_set_refcount_read,
				1002	},
				1003
				1004	{
				1005	.name = "current_css_set_cg_links",
				1006	.seq_show = current_css_set_cg_links_read,
				1007	},
				1008
				1009	{
				1010	.name = "cgroup_css_links",
				1011	.seq_show = cgroup_css_links_read,
				1012	},
				1013
				1014	{
				1015	.name = "releasable",
				1016	.read_u64 = releasable_read,
				1017	},
				1018
				1019	{ } /* terminate */
				1020	};
				1021
				1022	struct cgroup_subsys debug_cgrp_subsys = {
				1023	.css_alloc = debug_css_alloc,
				1024	.css_free = debug_css_free,
				1025	.legacy_cftypes = debug_files,
				1026	};
				1027	#endif /* CONFIG_CGROUP_DEBUG */