/*
 * Generic process-grouping system.
 *
 * Based originally on the cpuset system, extracted by Paul Menage
 * Copyright (C) 2006 Google, Inc
 *
 * Notifications support
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Copyright notices from the original cpuset code:
 * --------------------------------------------------
 * Copyright (C) 2003 BULL SA.
 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 * Portions derived from Patrick Mochel's sysfs code.
 * sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 * 2003-10-10 Written by Simon Derr.
 * 2003-10-22 Updates by Stephen Hemminger.
 * 2004 May-July Rework by Paul Jackson.
 * ---------------------------------------------------
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file COPYING in the main directory of the Linux
 * distribution for more details.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "cgroup-internal.h"

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/hashtable.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
#include <linux/psi.h>
#include <net/sock.h>

#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>

#define CGROUP_FILE_NAME_MAX	(MAX_CGROUP_TYPE_NAMELEN +	\
				 MAX_CFTYPE_NAME + 2)
/* let's not notify more than 100 times per second */
#define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)

/*
 * cgroup_mutex is the master lock. Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_lock protects task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks off each css_set.
 *
 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
 * cgroup.h can use them for lockdep annotations.
 */
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif

DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
bool cgroup_debug __read_mostly;

/*
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_file->kn for !self csses. It synchronizes notifications
 * against file removal/re-creation across css hiding.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);

#define cgroup_assert_mutex_or_rcu_locked()				\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			   !lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_mutex or RCU read lock required");

/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions. Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 */
static struct workqueue_struct *cgroup_destroy_wq;

/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
#define SUBSYS(_x)							\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);	\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);		\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);		\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);

/* the default hierarchy */
struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);

/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time. This is for backward compatibility.
 */
static bool cgrp_dfl_visible;

/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;

/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;

/* some controllers can be threaded on the default hierarchy */
static u16 cgrp_dfl_threaded_ss_mask;

/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Assign a monotonically increasing serial number to csses. It guarantees
 * cgroups with bigger numbers are newer than those with smaller numbers.
 * Also, as csses are always appended to the parent's ->children list, it
 * guarantees that sibling csses are always sorted in the ascending serial
 * number order on the list. Protected by cgroup_mutex.
 */
static u64 css_serial_nr_next = 1;

/*
 * These bitmasks identify subsystems with specific features to avoid
 * having to do iterative checks repeatedly.
 */
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;

/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
	.ns.count	= REFCOUNT_INIT(2),
	.user_ns	= &init_user_ns,
	.ns.ops		= &cgroupns_operations,
	.ns.inum	= PROC_CGROUP_INIT_INO,
	.root_cset	= &init_css_set,
};

static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];

static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
			       struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);

/**
 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 * @ssid: subsys ID of interest
 *
 * cgroup_subsys_enabled() can only be used with literal subsys names which
 * is fine for individual subsystems but unsuitable for cgroup core. This
 * is a slower static_key_enabled() based test indexed by @ssid.
 */
bool cgroup_ssid_enabled(int ssid)
{
	if (CGROUP_SUBSYS_COUNT == 0)
		return false;

	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
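
/*
 * Illustrative sketch (editorial, not in the original source): code
 * that knows the controller at compile time can use the static-branch
 * test directly, e.g.
 *
 *	cgroup_subsys_enabled(cpu_cgrp_subsys)
 *
 * while core code that only has a numeric subsystem ID falls back to
 *
 *	cgroup_ssid_enabled(cpu_cgrp_id)
 *
 * (cpu_cgrp_subsys/cpu_cgrp_id assume the CPU controller is built in.)
 */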
240
Tejun Heo9e10a132015-09-18 11:56:28 -0400241/**
242 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
243 * @cgrp: the cgroup of interest
244 *
245 * The default hierarchy is the v2 interface of cgroup and this function
246 * can be used to test whether a cgroup is on the default hierarchy for
Bhaskar Chowdhury58315c92020-11-09 16:01:11 +0530247 * cases where a subsystem should behave differently depending on the
Tejun Heo9e10a132015-09-18 11:56:28 -0400248 * interface version.
249 *
Tejun Heo9e10a132015-09-18 11:56:28 -0400250 * List of changed behaviors:
251 *
252 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
253 * and "name" are disallowed.
254 *
255 * - When mounting an existing superblock, mount options should match.
256 *
257 * - Remount is disallowed.
258 *
259 * - rename(2) is disallowed.
260 *
261 * - "tasks" is removed. Everything should be at process granularity. Use
262 * "cgroup.procs" instead.
263 *
264 * - "cgroup.procs" is not sorted. pids will be unique unless they got
Bhaskar Chowdhury58315c92020-11-09 16:01:11 +0530265 * recycled in-between reads.
Tejun Heo9e10a132015-09-18 11:56:28 -0400266 *
267 * - "release_agent" and "notify_on_release" are removed. Replacement
268 * notification mechanism will be implemented.
269 *
270 * - "cgroup.clone_children" is removed.
271 *
272 * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
273 * and its descendants contain no task; otherwise, 1. The file also
274 * generates kernfs notification which can be monitored through poll and
275 * [di]notify when the value of the file changes.
276 *
277 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
278 * take masks of ancestors with non-empty cpus/mems, instead of being
279 * moved to an ancestor.
280 *
281 * - cpuset: a task can be moved into an empty cpuset, and again it takes
282 * masks of ancestors.
283 *
Tejun Heo9e10a132015-09-18 11:56:28 -0400284 * - blkcg: blk-throttle becomes properly hierarchical.
285 *
286 * - debug: disallowed on the default hierarchy.
287 */
Tejun Heo0a268db2016-12-27 14:49:06 -0500288bool cgroup_on_dfl(const struct cgroup *cgrp)
Tejun Heo9e10a132015-09-18 11:56:28 -0400289{
290 return cgrp->root == &cgrp_dfl_root;
291}
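
/*
 * Illustrative sketch (editorial, not in the original source): a
 * controller that must behave differently per interface version would
 * branch on this predicate, e.g.
 *
 *	if (cgroup_on_dfl(cgrp))
 *		apply v2 (default hierarchy) semantics;
 *	else
 *		apply v1 (legacy hierarchy) semantics;
 */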

/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
			    gfp_t gfp_mask)
{
	int ret;

	idr_preload(gfp_mask);
	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
	spin_unlock_bh(&cgroup_idr_lock);
	idr_preload_end();
	return ret;
}

static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
	void *ret;

	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_replace(idr, ptr, id);
	spin_unlock_bh(&cgroup_idr_lock);
	return ret;
}

static void cgroup_idr_remove(struct idr *idr, int id)
{
	spin_lock_bh(&cgroup_idr_lock);
	idr_remove(idr, id);
	spin_unlock_bh(&cgroup_idr_lock);
}
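
/*
 * Editorial note (not in the original source): cgroup_idr_alloc()
 * masks out __GFP_DIRECT_RECLAIM because idr_preload() has already
 * preallocated the memory and the idr_alloc() call itself runs under
 * a BH-disabling spinlock where sleeping is forbidden. A minimal
 * usage sketch, with "example_idr" as a hypothetical IDR:
 *
 *	int id = cgroup_idr_alloc(&example_idr, ptr, 1, 0, GFP_KERNEL);
 *
 *	if (id < 0)
 *		return id;
 *	...
 *	cgroup_idr_remove(&example_idr, id);
 */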

static bool cgroup_has_tasks(struct cgroup *cgrp)
{
	return cgrp->nr_populated_csets;
}

bool cgroup_is_threaded(struct cgroup *cgrp)
{
	return cgrp->dom_cgrp != cgrp;
}

/* can @cgrp host both domain and threaded children? */
static bool cgroup_is_mixable(struct cgroup *cgrp)
{
	/*
	 * Root isn't under domain-level resource control, exempting it
	 * from the no-internal-process constraint, so it can serve as a
	 * thread root and a parent of resource domains at the same time.
	 */
	return !cgroup_parent(cgrp);
}

/* can @cgrp become a thread root? Should always be true for a thread root */
static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
	/* mixables don't care */
	if (cgroup_is_mixable(cgrp))
		return true;

	/* domain roots can't be nested under threaded */
	if (cgroup_is_threaded(cgrp))
		return false;

	/* can only have either domain or threaded children */
	if (cgrp->nr_populated_domain_children)
		return false;

	/* and no domain controllers can be enabled */
	if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
		return false;

	return true;
}

/* is @cgrp the root of a threaded subtree? */
bool cgroup_is_thread_root(struct cgroup *cgrp)
{
	/* thread root should be a domain */
	if (cgroup_is_threaded(cgrp))
		return false;

	/* a domain w/ threaded children is a thread root */
	if (cgrp->nr_threaded_children)
		return true;

	/*
	 * A domain which has tasks and explicit threaded controllers
	 * enabled is a thread root.
	 */
	if (cgroup_has_tasks(cgrp) &&
	    (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
		return true;

	return false;
}

/* a domain which isn't connected to the root w/o breakage can't be used */
static bool cgroup_is_valid_domain(struct cgroup *cgrp)
{
	/* the cgroup itself can be a thread root */
	if (cgroup_is_threaded(cgrp))
		return false;

	/* but the ancestors can't be unless mixable */
	while ((cgrp = cgroup_parent(cgrp))) {
		if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
			return false;
		if (cgroup_is_threaded(cgrp))
			return false;
	}

	return true;
}

/* subsystems visibly enabled on a cgroup */
static u16 cgroup_control(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	u16 root_ss_mask = cgrp->root->subsys_mask;

	if (parent) {
		u16 ss_mask = parent->subtree_control;

		/* threaded cgroups can only have threaded controllers */
		if (cgroup_is_threaded(cgrp))
			ss_mask &= cgrp_dfl_threaded_ss_mask;
		return ss_mask;
	}

	if (cgroup_on_dfl(cgrp))
		root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
				  cgrp_dfl_implicit_ss_mask);
	return root_ss_mask;
}

/* subsystems enabled on a cgroup */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);

	if (parent) {
		u16 ss_mask = parent->subtree_ss_mask;

		/* threaded cgroups can only have threaded controllers */
		if (cgroup_is_threaded(cgrp))
			ss_mask &= cgrp_dfl_threaded_ss_mask;
		return ss_mask;
	}

	return cgrp->root->subsys_mask;
}

/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
 * function must be called either under cgroup_mutex or rcu_read_lock() and
 * the caller is responsible for pinning the returned css if it wants to
 * keep accessing it outside the said locks. This function may return
 * %NULL if @cgrp doesn't have @ss enabled.
 */
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
					      struct cgroup_subsys *ss)
{
	if (ss)
		return rcu_dereference_check(cgrp->subsys[ss->id],
					lockdep_is_held(&cgroup_mutex));
	else
		return &cgrp->self;
}

/**
 * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get @cgrp's css associated with @ss. If the css doesn't exist
 * or is offline, %NULL is returned.
 */
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
						     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgroup_css(cgrp, ss);
	if (css && !css_tryget_online(css))
		css = NULL;
	rcu_read_unlock();

	return css;
}

/**
 * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Similar to cgroup_css() but returns the effective css, which is defined
 * as the matching css of the nearest ancestor including self which has @ss
 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
 * function is guaranteed to return a non-NULL css.
 */
static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
							struct cgroup_subsys *ss)
{
	lockdep_assert_held(&cgroup_mutex);

	if (!ss)
		return &cgrp->self;

	/*
	 * This function is used while updating css associations and thus
	 * can't test the csses directly. Test ss_mask.
	 */
	while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
		cgrp = cgroup_parent(cgrp);
		if (!cgrp)
			return NULL;
	}

	return cgroup_css(cgrp, ss);
}

Tejun Heoeeecbd12014-11-18 02:49:52 -0500519/**
Dennis Zhoufc5a8282018-12-05 12:10:36 -0500520 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
521 * @cgrp: the cgroup of interest
522 * @ss: the subsystem of interest
523 *
524 * Find and get the effective css of @cgrp for @ss. The effective css is
525 * defined as the matching css of the nearest ancestor including self which
526 * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
527 * the root css is returned, so this function always returns a valid css.
528 *
529 * The returned css is not guaranteed to be online, and therefore it is the
Bhaskar Chowdhury58315c92020-11-09 16:01:11 +0530530 * callers responsibility to try get a reference for it.
Dennis Zhoufc5a8282018-12-05 12:10:36 -0500531 */
532struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
533 struct cgroup_subsys *ss)
534{
535 struct cgroup_subsys_state *css;
536
537 do {
538 css = cgroup_css(cgrp, ss);
539
540 if (css)
541 return css;
542 cgrp = cgroup_parent(cgrp);
543 } while (cgrp);
544
545 return init_css_set.subsys[ss->id];
546}
547
/**
 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get the effective css of @cgrp for @ss. The effective css is
 * defined as the matching css of the nearest ancestor including self which
 * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
 * the root css is returned, so this function always returns a valid css.
 * The returned css must be put using css_put().
 */
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
					     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();

	do {
		css = cgroup_css(cgrp, ss);

		if (css && css_tryget_online(css))
			goto out_unlock;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);

	css = init_css_set.subsys[ss->id];
	css_get(css);
out_unlock:
	rcu_read_unlock();
	return css;
}
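
/*
 * Illustrative sketch (editorial, not in the original source): a
 * typical caller pins the effective css, uses it, then drops the
 * reference, e.g. for the memory controller (assuming it is built in):
 *
 *	struct cgroup_subsys_state *css;
 *
 *	css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
 *	... use css ...
 *	css_put(css);
 */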

static void cgroup_get_live(struct cgroup *cgrp)
{
	WARN_ON_ONCE(cgroup_is_dead(cgrp));
	css_get(&cgrp->self);
}

/**
 * __cgroup_task_count - count the number of tasks in a cgroup. The caller
 * is responsible for taking the css_set_lock.
 * @cgrp: the cgroup in question
 */
int __cgroup_task_count(const struct cgroup *cgrp)
{
	int count = 0;
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		count += link->cset->nr_tasks;

	return count;
}

/**
 * cgroup_task_count - count the number of tasks in a cgroup.
 * @cgrp: the cgroup in question
 */
int cgroup_task_count(const struct cgroup *cgrp)
{
	int count;

	spin_lock_irq(&css_set_lock);
	count = __cgroup_task_count(cgrp);
	spin_unlock_irq(&css_set_lock);

	return count;
}
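
/*
 * Editorial note (not in the original source): __cgroup_task_count()
 * is the variant for callers that already hold css_set_lock;
 * cgroup_task_count() takes and releases the lock itself and must not
 * be called with css_set_lock held.
 */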

struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of_cft(of);

	/*
	 * This is an open and unprotected implementation of cgroup_css().
	 * seq_css() is only called from a kernfs file operation which has
	 * an active reference on the file. Because all the subsystem
	 * files are drained before a css is disassociated from a cgroup,
	 * the matching css from the cgroup's subsys table is guaranteed to
	 * be and stay valid until the enclosing operation is complete.
	 */
	if (cft->ss)
		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
	else
		return &cgrp->self;
}
EXPORT_SYMBOL_GPL(of_css);

/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_[tree_]mutex.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else

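/*
 * Illustrative sketch (editorial, not in the original source):
 * iterating every enabled css of @cgrp under cgroup_mutex:
 *
 *	struct cgroup_subsys_state *css;
 *	int ssid;
 *
 *	for_each_css(css, ssid, cgrp)
 *		pr_info("%s is enabled\n", cgroup_subsys_name[ssid]);
 */
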
/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_[tree_]mutex.
 */
#define for_each_e_css(css, ssid, cgrp)					    \
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	    \
		if (!((css) = cgroup_e_css_by_mask(cgrp,		    \
						   cgroup_subsys[(ssid)]))) \
			;						    \
		else

/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
 * @ss_mask is set.
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */	\
		(ssid) = 0;						\
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)

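/*
 * Illustrative sketch (editorial, not in the original source): the
 * macro pair brackets a block that runs only for subsystems whose bit
 * is set in the mask, mirroring how the fork path invokes callbacks:
 *
 *	struct cgroup_subsys *ss;
 *	int ssid;
 *
 *	do_each_subsys_mask(ss, ssid, have_fork_callback) {
 *		ss->fork(child);
 *	} while_each_subsys_mask();
 */
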
/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else

/* walk live descendants in preorder */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/* walk live descendants in postorder */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)	\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
struct css_set init_css_set = {
	.refcount		= REFCOUNT_INIT(1),
	.dom_cset		= &init_css_set,
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.dying_tasks		= LIST_HEAD_INIT(init_css_set.dying_tasks),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
	.threaded_csets		= LIST_HEAD_INIT(init_css_set.threaded_csets),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),

	/*
	 * The following field is re-initialized when this cset gets linked
	 * in cgroup_init(). However, let's initialize the field
	 * statically too so that the default cgroup can be accessed safely
	 * early during boot.
	 */
	.dfl_cgrp		= &cgrp_dfl_root.cgrp,
};

static int css_set_count	= 1;	/* 1 for init_css_set */

static bool css_set_threaded(struct css_set *cset)
{
	return cset->dom_cset != cset;
}

/**
 * css_set_populated - does a css_set contain any tasks?
 * @cset: target css_set
 *
 * css_set_populated() should be the same as !!cset->nr_tasks at steady
 * state. However, css_set_populated() can be called while a task is being
 * added to or removed from the linked list before the nr_tasks is
 * properly updated. Hence, we can't just look at ->nr_tasks here.
 */
static bool css_set_populated(struct css_set *cset)
{
	lockdep_assert_held(&css_set_lock);

	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
}

/**
 * cgroup_update_populated - update the populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing the last. Update @cgrp->nr_populated_* accordingly. The
 * count is propagated towards root so that a given cgroup's
 * nr_populated_children is zero iff none of its descendants contain any
 * tasks.
 *
 * @cgrp's interface file "cgroup.populated" is zero if both
 * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
 * 1 otherwise. When the sum changes from or to zero, userland is notified
 * that the content of the interface file has changed. This can be used to
 * detect when @cgrp and its descendants become populated or empty.
 */
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
	struct cgroup *child = NULL;
	int adj = populated ? 1 : -1;

	lockdep_assert_held(&css_set_lock);

	do {
		bool was_populated = cgroup_is_populated(cgrp);

		if (!child) {
			cgrp->nr_populated_csets += adj;
		} else {
			if (cgroup_is_threaded(child))
				cgrp->nr_populated_threaded_children += adj;
			else
				cgrp->nr_populated_domain_children += adj;
		}

		if (was_populated == cgroup_is_populated(cgrp))
			break;

		cgroup1_check_for_release(cgrp);
		TRACE_CGROUP_PATH(notify_populated, cgrp,
				  cgroup_is_populated(cgrp));
		cgroup_file_notify(&cgrp->events_file);

		child = cgrp;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);
}

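/*
 * Editorial example (not in the original source): when the first task
 * enters an empty leaf cgroup, the leaf's nr_populated_csets goes
 * 0 -> 1, each ancestor's nr_populated_domain_children (or
 * nr_populated_threaded_children) is bumped in turn, and propagation
 * stops at the first ancestor that was already populated;
 * cgroup_file_notify() is issued at each level whose populated state
 * flipped.
 */
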
/**
 * css_set_update_populated - update populated state of a css_set
 * @cset: target css_set
 * @populated: whether @cset is populated or depopulated
 *
 * @cset is either getting the first task or losing the last. Update the
 * populated counters of all associated cgroups accordingly.
 */
static void css_set_update_populated(struct css_set *cset, bool populated)
{
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
		cgroup_update_populated(link->cgrp, populated);
}

/*
 * @task is leaving, advance task iterators which are pointing to it so
 * that they can resume at the next position. Advancing an iterator might
 * remove it from the list, use safe walk. See css_task_iter_skip() for
 * details.
 */
static void css_set_skip_task_iters(struct css_set *cset,
				    struct task_struct *task)
{
	struct css_task_iter *it, *pos;

	list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
		css_task_iter_skip(it, task);
}

/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be NULL)
 * @to_cset: new css_set @task is being moved to (may be NULL)
 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 *
 * Move @task from @from_cset to @to_cset. If @task didn't belong to any
 * css_set, @from_cset can be NULL. If @task is being disassociated
 * instead of moved, @to_cset can be NULL.
 *
 * This function automatically handles populated counter updates and
 * css_task_iter adjustments but the caller is responsible for managing
 * @from_cset and @to_cset's reference counts.
 */
static void css_set_move_task(struct task_struct *task,
			      struct css_set *from_cset, struct css_set *to_cset,
			      bool use_mg_tasks)
{
	lockdep_assert_held(&css_set_lock);

	if (to_cset && !css_set_populated(to_cset))
		css_set_update_populated(to_cset, true);

	if (from_cset) {
		WARN_ON_ONCE(list_empty(&task->cg_list));

		css_set_skip_task_iters(from_cset, task);
		list_del_init(&task->cg_list);
		if (!css_set_populated(from_cset))
			css_set_update_populated(from_cset, false);
	} else {
		WARN_ON_ONCE(!list_empty(&task->cg_list));
	}

	if (to_cset) {
		/*
		 * We are synchronized through cgroup_threadgroup_rwsem
		 * against PF_EXITING setting such that we can't race
		 * against cgroup_exit()/cgroup_free() dropping the css_set.
		 */
		WARN_ON_ONCE(task->flags & PF_EXITING);

		cgroup_move_task(task, to_cset);
		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
							     &to_cset->tasks);
	}
}

/*
 * hash table for cgroup groups. This improves the performance of finding
 * an existing css_set. This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 */
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
	unsigned long key = 0UL;
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		key += (unsigned long)css[i];
	key = (key >> 16) ^ key;

	return key;
}

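/*
 * Illustrative sketch (editorial, not in the original source): the key
 * feeds the css_set_table lookup, exactly as find_existing_css_set()
 * below does:
 *
 *	key = css_set_hash(template);
 *	hash_for_each_possible(css_set_table, cset, hlist, key) {
 *		...
 *	}
 */
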
void put_css_set_locked(struct css_set *cset)
{
	struct cgrp_cset_link *link, *tmp_link;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&css_set_lock);

	if (!refcount_dec_and_test(&cset->refcount))
		return;

	WARN_ON_ONCE(!list_empty(&cset->threaded_csets));

	/* This css_set is dead. Unlink it and release cgroup and css refs */
	for_each_subsys(ss, ssid) {
		list_del(&cset->e_cset_node[ssid]);
		css_put(cset->subsys[ssid]);
	}
	hash_del(&cset->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		if (cgroup_parent(link->cgrp))
			cgroup_put(link->cgrp);
		kfree(link);
	}

	if (css_set_threaded(cset)) {
		list_del(&cset->threaded_csets_node);
		put_css_set_locked(cset->dom_cset);
	}

	kfree_rcu(cset, rcu_head);
}

/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct cgroup *new_dfl_cgrp;
	struct list_head *l1, *l2;

	/*
	 * On the default hierarchy, there can be csets which are
	 * associated with the same set of cgroups but different csses.
	 * Let's first ensure that csses match.
	 */
	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
		return false;

	/* @cset's domain should match the default cgroup's */
	if (cgroup_on_dfl(new_cgrp))
		new_dfl_cgrp = new_cgrp;
	else
		new_dfl_cgrp = old_cset->dfl_cgrp;

	if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
		return false;

	/*
	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in hierarchies. As different cgroups may
	 * share the same effective css, this comparison is always
	 * necessary.
	 */
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
		struct cgrp_cset_link *link1, *link2;
		struct cgroup *cgrp1, *cgrp2;

		l1 = l1->next;
		l2 = l2->next;
		/* See if we reached the end - both lists are equal length. */
		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
			break;
		} else {
			BUG_ON(l2 == &old_cset->cgrp_links);
		}
		/* Locate the cgroups associated with these links. */
		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;
		/* Hierarchies should be linked in the same order. */
		BUG_ON(cgrp1->root != cgrp2->root);

		/*
		 * If this hierarchy is the hierarchy of the cgroup
		 * that's changing, then we need to check that this
		 * css_set points to the new cgroup; if it's any other
		 * hierarchy, then this css_set should point to the
		 * same cgroup as the old css_set.
		 */
		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
				return false;
		} else {
			if (cgrp1 != cgrp2)
				return false;
		}
	}
	return true;
}

/**
 * find_existing_css_set - init css array and find the matching css_set
 * @old_cset: the css_set that we're using before the cgroup transition
 * @cgrp: the cgroup that we're moving into
 * @template: out param for the new set of csses, should be clear on entry
 */
static struct css_set *find_existing_css_set(struct css_set *old_cset,
					struct cgroup *cgrp,
					struct cgroup_subsys_state *template[])
{
	struct cgroup_root *root = cgrp->root;
	struct cgroup_subsys *ss;
	struct css_set *cset;
	unsigned long key;
	int i;

	/*
	 * Build the set of subsystem state objects that we want to see in the
	 * new css_set. While subsystems can change globally, the entries here
	 * won't change, so no need for locking.
	 */
	for_each_subsys(ss, i) {
		if (root->subsys_mask & (1UL << i)) {
			/*
			 * @ss is in this hierarchy, so we want the
			 * effective css from @cgrp.
			 */
			template[i] = cgroup_e_css_by_mask(cgrp, ss);
		} else {
			/*
			 * @ss is not in this hierarchy, so we don't want
			 * to change the css.
			 */
			template[i] = old_cset->subsys[i];
		}
	}

	key = css_set_hash(template);
	hash_for_each_possible(css_set_table, cset, hlist, key) {
		if (!compare_css_sets(cset, old_cset, cgrp, template))
			continue;

		/* This css_set matches what we need */
		return cset;
	}

	/* No existing cgroup group matched */
	return NULL;
}

static void free_cgrp_cset_links(struct list_head *links_to_free)
{
	struct cgrp_cset_link *link, *tmp_link;

	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
		list_del(&link->cset_link);
		kfree(link);
	}
}

/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 * through ->cset_link. Returns 0 on success or -errno.
 */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
	struct cgrp_cset_link *link;
	int i;

	INIT_LIST_HEAD(tmp_links);

	for (i = 0; i < count; i++) {
		link = kzalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cgrp_cset_links(tmp_links);
			return -ENOMEM;
		}
		list_add(&link->cset_link, tmp_links);
	}
	return 0;
}

/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 * @cset: the css_set to be linked
 * @cgrp: the destination cgroup
 */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
			 struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;

	BUG_ON(list_empty(tmp_links));

	if (cgroup_on_dfl(cgrp))
		cset->dfl_cgrp = cgrp;

	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
	link->cgrp = cgrp;

	/*
	 * Always add links to the tail of the lists so that the lists are
	 * in chronological order.
	 */
	list_move_tail(&link->cset_link, &cgrp->cset_links);
	list_add_tail(&link->cgrp_link, &cset->cgrp_links);

	if (cgroup_parent(cgrp))
		cgroup_get_live(cgrp);
}

Tejun Heob326f9d2013-06-24 15:21:48 -07001160/**
1161 * find_css_set - return a new css_set with one cgroup updated
1162 * @old_cset: the baseline css_set
1163 * @cgrp: the cgroup to be updated
1164 *
1165 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
1166 * substituted into the appropriate hierarchy.
Paul Menage817929e2007-10-18 23:39:36 -07001167 */
Tejun Heo5abb8852013-06-12 21:04:49 -07001168static struct css_set *find_css_set(struct css_set *old_cset,
1169 struct cgroup *cgrp)
Paul Menage817929e2007-10-18 23:39:36 -07001170{
Tejun Heob326f9d2013-06-24 15:21:48 -07001171 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
Tejun Heo5abb8852013-06-12 21:04:49 -07001172 struct css_set *cset;
Tejun Heo69d02062013-06-12 21:04:50 -07001173 struct list_head tmp_links;
1174 struct cgrp_cset_link *link;
Tejun Heo2d8f2432014-04-23 11:13:15 -04001175 struct cgroup_subsys *ss;
Li Zefan0ac801f2013-01-10 11:49:27 +08001176 unsigned long key;
Tejun Heo2d8f2432014-04-23 11:13:15 -04001177 int ssid;
Li Zefan472b1052008-04-29 01:00:11 -07001178
Tejun Heob326f9d2013-06-24 15:21:48 -07001179 lockdep_assert_held(&cgroup_mutex);
1180
Paul Menage817929e2007-10-18 23:39:36 -07001181	/* First see if we already have a css_set that matches
1182 * the desired set */
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03001183 spin_lock_irq(&css_set_lock);
Tejun Heo5abb8852013-06-12 21:04:49 -07001184 cset = find_existing_css_set(old_cset, cgrp, template);
1185 if (cset)
1186 get_css_set(cset);
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03001187 spin_unlock_irq(&css_set_lock);
Paul Menage817929e2007-10-18 23:39:36 -07001188
Tejun Heo5abb8852013-06-12 21:04:49 -07001189 if (cset)
1190 return cset;
Paul Menage817929e2007-10-18 23:39:36 -07001191
Tejun Heof4f4be22013-06-12 21:04:51 -07001192 cset = kzalloc(sizeof(*cset), GFP_KERNEL);
Tejun Heo5abb8852013-06-12 21:04:49 -07001193 if (!cset)
Paul Menage817929e2007-10-18 23:39:36 -07001194 return NULL;
1195
Tejun Heo69d02062013-06-12 21:04:50 -07001196 /* Allocate all the cgrp_cset_link objects that we'll need */
Tejun Heo9871bf92013-06-24 15:21:47 -07001197 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
Tejun Heo5abb8852013-06-12 21:04:49 -07001198 kfree(cset);
Paul Menage817929e2007-10-18 23:39:36 -07001199 return NULL;
1200 }
1201
Elena Reshetova4b9502e62017-03-08 10:00:40 +02001202 refcount_set(&cset->refcount, 1);
Tejun Heo454000a2017-05-15 09:34:02 -04001203 cset->dom_cset = cset;
Tejun Heo5abb8852013-06-12 21:04:49 -07001204 INIT_LIST_HEAD(&cset->tasks);
Tejun Heoc7561122014-02-25 10:04:01 -05001205 INIT_LIST_HEAD(&cset->mg_tasks);
Tejun Heoc03cd772019-05-31 10:38:58 -07001206 INIT_LIST_HEAD(&cset->dying_tasks);
Tejun Heoed27b9f2015-10-15 16:41:52 -04001207 INIT_LIST_HEAD(&cset->task_iters);
Tejun Heo454000a2017-05-15 09:34:02 -04001208 INIT_LIST_HEAD(&cset->threaded_csets);
Tejun Heo5abb8852013-06-12 21:04:49 -07001209 INIT_HLIST_NODE(&cset->hlist);
Tejun Heo5f617ebb2016-12-27 14:49:05 -05001210 INIT_LIST_HEAD(&cset->cgrp_links);
1211 INIT_LIST_HEAD(&cset->mg_preload_node);
1212 INIT_LIST_HEAD(&cset->mg_node);
Paul Menage817929e2007-10-18 23:39:36 -07001213
1214 /* Copy the set of subsystem state objects generated in
1215 * find_existing_css_set() */
Tejun Heo5abb8852013-06-12 21:04:49 -07001216 memcpy(cset->subsys, template, sizeof(cset->subsys));
Paul Menage817929e2007-10-18 23:39:36 -07001217
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03001218 spin_lock_irq(&css_set_lock);
Paul Menage817929e2007-10-18 23:39:36 -07001219 /* Add reference counts and links from the new css_set. */
Tejun Heo69d02062013-06-12 21:04:50 -07001220 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
Paul Menage7717f7b2009-09-23 15:56:22 -07001221 struct cgroup *c = link->cgrp;
Tejun Heo69d02062013-06-12 21:04:50 -07001222
Paul Menage7717f7b2009-09-23 15:56:22 -07001223 if (c->root == cgrp->root)
1224 c = cgrp;
Tejun Heo69d02062013-06-12 21:04:50 -07001225 link_css_set(&tmp_links, cset, c);
Paul Menage7717f7b2009-09-23 15:56:22 -07001226 }
Paul Menage817929e2007-10-18 23:39:36 -07001227
Tejun Heo69d02062013-06-12 21:04:50 -07001228 BUG_ON(!list_empty(&tmp_links));
Paul Menage817929e2007-10-18 23:39:36 -07001229
Paul Menage817929e2007-10-18 23:39:36 -07001230 css_set_count++;
Li Zefan472b1052008-04-29 01:00:11 -07001231
Tejun Heo2d8f2432014-04-23 11:13:15 -04001232 /* Add @cset to the hash table */
Tejun Heo5abb8852013-06-12 21:04:49 -07001233 key = css_set_hash(cset->subsys);
1234 hash_add(css_set_table, &cset->hlist, key);
Li Zefan472b1052008-04-29 01:00:11 -07001235
Tejun Heo53254f92015-11-23 14:55:41 -05001236 for_each_subsys(ss, ssid) {
1237 struct cgroup_subsys_state *css = cset->subsys[ssid];
1238
Tejun Heo2d8f2432014-04-23 11:13:15 -04001239 list_add_tail(&cset->e_cset_node[ssid],
Tejun Heo53254f92015-11-23 14:55:41 -05001240 &css->cgroup->e_csets[ssid]);
1241 css_get(css);
1242 }
Tejun Heo2d8f2432014-04-23 11:13:15 -04001243
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03001244 spin_unlock_irq(&css_set_lock);
Paul Menage817929e2007-10-18 23:39:36 -07001245
Tejun Heo454000a2017-05-15 09:34:02 -04001246 /*
1247 * If @cset should be threaded, look up the matching dom_cset and
1248 * link them up. We first fully initialize @cset then look for the
1249 * dom_cset. It's simpler this way and safe as @cset is guaranteed
1250 * to stay empty until we return.
1251 */
1252 if (cgroup_is_threaded(cset->dfl_cgrp)) {
1253 struct css_set *dcset;
1254
1255 dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
1256 if (!dcset) {
1257 put_css_set(cset);
1258 return NULL;
1259 }
1260
1261 spin_lock_irq(&css_set_lock);
1262 cset->dom_cset = dcset;
1263 list_add_tail(&cset->threaded_csets_node,
1264 &dcset->threaded_csets);
1265 spin_unlock_irq(&css_set_lock);
1266 }
1267
Tejun Heo5abb8852013-06-12 21:04:49 -07001268 return cset;
Paul Menageb4f48b62007-10-18 23:39:33 -07001269}
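/*
 * Minimal caller sketch (hypothetical; the real users are the task
 * migration paths such as cgroup_migrate_prepare_dst()): with
 * cgroup_mutex held, resolve the css_set a task would end up in if
 * moved to @dst_cgrp, then drop the reference when done.
 *
 *	struct css_set *new_cset;
 *
 *	new_cset = find_css_set(task_css_set(task), dst_cgrp);
 *	if (!new_cset)
 *		return -ENOMEM;
 *	...
 *	put_css_set(new_cset);
 */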
1270
Tejun Heo0a268db2016-12-27 14:49:06 -05001271struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
Paul Menage7717f7b2009-09-23 15:56:22 -07001272{
Tejun Heo3dd06ff2014-03-19 10:23:54 -04001273 struct cgroup *root_cgrp = kf_root->kn->priv;
Tejun Heo2bd59d42014-02-11 11:52:49 -05001274
Tejun Heo3dd06ff2014-03-19 10:23:54 -04001275 return root_cgrp->root;
Tejun Heo2bd59d42014-02-11 11:52:49 -05001276}
1277
Tejun Heo3dd06ff2014-03-19 10:23:54 -04001278static int cgroup_init_root_id(struct cgroup_root *root)
Tejun Heof2e85d52014-02-11 11:52:49 -05001279{
1280 int id;
1281
1282 lockdep_assert_held(&cgroup_mutex);
1283
Tejun Heo985ed672014-03-19 10:23:53 -04001284 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
Tejun Heof2e85d52014-02-11 11:52:49 -05001285 if (id < 0)
1286 return id;
1287
1288 root->hierarchy_id = id;
1289 return 0;
1290}
1291
Tejun Heo3dd06ff2014-03-19 10:23:54 -04001292static void cgroup_exit_root_id(struct cgroup_root *root)
Tejun Heof2e85d52014-02-11 11:52:49 -05001293{
1294 lockdep_assert_held(&cgroup_mutex);
1295
Johannes Weiner8c8a5502016-06-17 12:23:59 -04001296 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
Tejun Heof2e85d52014-02-11 11:52:49 -05001297}
1298
Tejun Heo1592c9b2016-12-27 14:49:08 -05001299void cgroup_free_root(struct cgroup_root *root)
Tejun Heof2e85d52014-02-11 11:52:49 -05001300{
Tejun Heo74321032019-11-04 15:54:30 -08001301 kfree(root);
Tejun Heof2e85d52014-02-11 11:52:49 -05001302}
1303
Tejun Heo3dd06ff2014-03-19 10:23:54 -04001304static void cgroup_destroy_root(struct cgroup_root *root)
Tejun Heo59f52962014-02-11 11:52:49 -05001305{
Tejun Heo3dd06ff2014-03-19 10:23:54 -04001306 struct cgroup *cgrp = &root->cgrp;
Tejun Heof2e85d52014-02-11 11:52:49 -05001307 struct cgrp_cset_link *link, *tmp_link;
Tejun Heof2e85d52014-02-11 11:52:49 -05001308
Tejun Heoed1777d2016-08-10 11:23:44 -04001309 trace_cgroup_destroy_root(root);
1310
Tejun Heo334c3672016-03-03 09:58:01 -05001311 cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
Tejun Heof2e85d52014-02-11 11:52:49 -05001312
Tejun Heo776f02f2014-02-12 09:29:50 -05001313 BUG_ON(atomic_read(&root->nr_cgrps));
Tejun Heod5c419b2014-05-16 13:22:48 -04001314 BUG_ON(!list_empty(&cgrp->self.children));
Tejun Heof2e85d52014-02-11 11:52:49 -05001315
Tejun Heof2e85d52014-02-11 11:52:49 -05001316 /* Rebind all subsystems back to the default hierarchy */
Tejun Heo334c3672016-03-03 09:58:01 -05001317 WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
Tejun Heof2e85d52014-02-11 11:52:49 -05001318
1319 /*
1320 * Release all the links from cset_links to this hierarchy's
1321 * root cgroup
1322 */
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03001323 spin_lock_irq(&css_set_lock);
Tejun Heof2e85d52014-02-11 11:52:49 -05001324
1325 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1326 list_del(&link->cset_link);
1327 list_del(&link->cgrp_link);
1328 kfree(link);
1329 }
Tejun Heof0d9a5f2015-10-15 16:41:53 -04001330
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03001331 spin_unlock_irq(&css_set_lock);
Tejun Heof2e85d52014-02-11 11:52:49 -05001332
1333 if (!list_empty(&root->root_list)) {
1334 list_del(&root->root_list);
1335 cgroup_root_count--;
1336 }
1337
1338 cgroup_exit_root_id(root);
1339
1340 mutex_unlock(&cgroup_mutex);
Tejun Heof2e85d52014-02-11 11:52:49 -05001341
Johannes Weinera7df69b2021-04-29 22:56:20 -07001342 cgroup_rstat_exit(cgrp);
Tejun Heo2bd59d42014-02-11 11:52:49 -05001343 kernfs_destroy_root(root->kf_root);
Tejun Heof2e85d52014-02-11 11:52:49 -05001344 cgroup_free_root(root);
1345}
1346
Serge E. Hallyn4f41fc52016-05-09 09:59:55 -05001347/*
1348 * look up cgroup associated with current task's cgroup namespace on the
1349 * specified hierarchy
1350 */
1351static struct cgroup *
1352current_cgns_cgroup_from_root(struct cgroup_root *root)
1353{
1354 struct cgroup *res = NULL;
1355 struct css_set *cset;
1356
1357 lockdep_assert_held(&css_set_lock);
1358
1359 rcu_read_lock();
1360
1361 cset = current->nsproxy->cgroup_ns->root_cset;
1362 if (cset == &init_css_set) {
1363 res = &root->cgrp;
Miaohe Lin61e867f2019-09-29 16:06:58 +08001364 } else if (root == &cgrp_dfl_root) {
1365 res = cset->dfl_cgrp;
Serge E. Hallyn4f41fc52016-05-09 09:59:55 -05001366 } else {
1367 struct cgrp_cset_link *link;
1368
1369 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1370 struct cgroup *c = link->cgrp;
1371
1372 if (c->root == root) {
1373 res = c;
1374 break;
1375 }
1376 }
1377 }
1378 rcu_read_unlock();
1379
1380 BUG_ON(!res);
1381 return res;
1382}
1383
Tejun Heoceb6a082014-02-25 10:04:02 -05001384/* look up cgroup associated with given css_set on the specified hierarchy */
1385static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
Tejun Heo3dd06ff2014-03-19 10:23:54 -04001386 struct cgroup_root *root)
Paul Menage7717f7b2009-09-23 15:56:22 -07001387{
Paul Menage7717f7b2009-09-23 15:56:22 -07001388 struct cgroup *res = NULL;
1389
Tejun Heo96d365e2014-02-13 06:58:40 -05001390 lockdep_assert_held(&cgroup_mutex);
Tejun Heof0d9a5f2015-10-15 16:41:53 -04001391 lockdep_assert_held(&css_set_lock);
Tejun Heo96d365e2014-02-13 06:58:40 -05001392
Tejun Heo5abb8852013-06-12 21:04:49 -07001393 if (cset == &init_css_set) {
Tejun Heo3dd06ff2014-03-19 10:23:54 -04001394 res = &root->cgrp;
Tejun Heo13d82fb2017-08-02 15:39:38 -07001395 } else if (root == &cgrp_dfl_root) {
1396 res = cset->dfl_cgrp;
Paul Menage7717f7b2009-09-23 15:56:22 -07001397 } else {
Tejun Heo69d02062013-06-12 21:04:50 -07001398 struct cgrp_cset_link *link;
1399
1400 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
Paul Menage7717f7b2009-09-23 15:56:22 -07001401 struct cgroup *c = link->cgrp;
Tejun Heo69d02062013-06-12 21:04:50 -07001402
Paul Menage7717f7b2009-09-23 15:56:22 -07001403 if (c->root == root) {
1404 res = c;
1405 break;
1406 }
1407 }
1408 }
Tejun Heo96d365e2014-02-13 06:58:40 -05001409
Paul Menage7717f7b2009-09-23 15:56:22 -07001410 BUG_ON(!res);
1411 return res;
1412}
1413
1414/*
Tejun Heoceb6a082014-02-25 10:04:02 -05001415 * Return the cgroup for "task" from the given hierarchy. Must be
Tejun Heof0d9a5f2015-10-15 16:41:53 -04001416 * called with cgroup_mutex and css_set_lock held.
Tejun Heoceb6a082014-02-25 10:04:02 -05001417 */
Tejun Heo0a268db2016-12-27 14:49:06 -05001418struct cgroup *task_cgroup_from_root(struct task_struct *task,
1419 struct cgroup_root *root)
Tejun Heoceb6a082014-02-25 10:04:02 -05001420{
1421 /*
Michal Koutnýe7c7b1d2019-10-04 12:57:39 +02001422 * No need to lock the task - since we hold css_set_lock the
1423 * task can't change groups.
Tejun Heoceb6a082014-02-25 10:04:02 -05001424 */
1425 return cset_cgroup_from_root(task_css_set(task), root);
1426}
1427
1428/*
Paul Menageddbcc7e2007-10-18 23:39:30 -07001429 * A task must hold cgroup_mutex to modify cgroups.
1430 *
1431 * Any task can increment and decrement the count field without lock.
1432 * So in general, code holding cgroup_mutex can't rely on the count
1433 * field not changing. However, if the count goes to zero, then only
Cliff Wickman956db3c2008-02-07 00:14:43 -08001434 * cgroup_attach_task() can increment it again. Because a count of zero
Paul Menageddbcc7e2007-10-18 23:39:30 -07001435 * means that no tasks are currently attached, there is no
1436 * way a task attached to that cgroup can fork (the other way to
1437 * increment the count). So code holding cgroup_mutex can safely
1438 * assume that if the count is zero, it will stay zero. Similarly, if
1439 * a task holds cgroup_mutex on a cgroup with zero count, it
1440 * knows that the cgroup won't be removed, as cgroup_rmdir()
1441 * needs that mutex.
1442 *
Paul Menageddbcc7e2007-10-18 23:39:30 -07001443 * A cgroup can only be deleted if both its 'count' of using tasks
1444 * is zero and its list of 'children' cgroups is empty. Since all
1445 * tasks in the system use _some_ cgroup, and since there is always at
Tejun Heo3dd06ff2014-03-19 10:23:54 -04001446 * least one task in the system (init, pid == 1), the root cgroup
Paul Menageddbcc7e2007-10-18 23:39:30 -07001447 * always has either child cgroups or attached tasks. So we don't
Tejun Heo3dd06ff2014-03-19 10:23:54 -04001448 * need a special hack to ensure that the root cgroup cannot be deleted.
Paul Menageddbcc7e2007-10-18 23:39:30 -07001449 *
1450 * P.S. One more locking exception. RCU is used to guard the
Cliff Wickman956db3c2008-02-07 00:14:43 -08001451 * update of a task's cgroup pointer by cgroup_attach_task()
Paul Menageddbcc7e2007-10-18 23:39:30 -07001452 */
1453
Tejun Heo2bd59d42014-02-11 11:52:49 -05001454static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
Paul Menagea4243162007-10-18 23:39:35 -07001455
Jens Axboecf892982019-06-10 03:35:41 -06001456static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1457 char *buf)
Paul Menageddbcc7e2007-10-18 23:39:30 -07001458{
Tejun Heo3e1d2ee2015-08-18 13:58:16 -07001459 struct cgroup_subsys *ss = cft->ss;
1460
Tejun Heo8d7e6fb2014-02-11 11:52:48 -05001461 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
Tejun Heoc1bbd932018-11-13 12:06:41 -08001462 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
1463 const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
1464
1465 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
1466 dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
Jens Axboecf892982019-06-10 03:35:41 -06001467 cft->name);
Tejun Heoc1bbd932018-11-13 12:06:41 -08001468 } else {
Jens Axboecf892982019-06-10 03:35:41 -06001469 strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
Tejun Heoc1bbd932018-11-13 12:06:41 -08001470 }
Tejun Heo8d7e6fb2014-02-11 11:52:48 -05001471 return buf;
Paul Menageddbcc7e2007-10-18 23:39:30 -07001472}
1473
Tejun Heof2e85d52014-02-11 11:52:49 -05001474/**
1475 * cgroup_file_mode - deduce file mode of a control file
1476 * @cft: the control file in question
1477 *
Tejun Heo7dbdb192015-09-18 17:54:23 -04001478 * S_IRUGO for read, S_IWUSR for write.
Tejun Heof2e85d52014-02-11 11:52:49 -05001479 */
1480static umode_t cgroup_file_mode(const struct cftype *cft)
Li Zefan65dff752013-03-01 15:01:56 +08001481{
Tejun Heof2e85d52014-02-11 11:52:49 -05001482 umode_t mode = 0;
Li Zefan65dff752013-03-01 15:01:56 +08001483
Tejun Heof2e85d52014-02-11 11:52:49 -05001484 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1485 mode |= S_IRUGO;
1486
Tejun Heo7dbdb192015-09-18 17:54:23 -04001487 if (cft->write_u64 || cft->write_s64 || cft->write) {
1488 if (cft->flags & CFTYPE_WORLD_WRITABLE)
1489 mode |= S_IWUGO;
1490 else
1491 mode |= S_IWUSR;
1492 }
Tejun Heof2e85d52014-02-11 11:52:49 -05001493
1494 return mode;
Li Zefan65dff752013-03-01 15:01:56 +08001495}
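/*
 * Worked example, derived directly from the logic above: a cftype with
 * .seq_show and .write gets S_IRUGO | S_IWUSR (0644); the same file
 * with CFTYPE_WORLD_WRITABLE set gets S_IRUGO | S_IWUGO (0666); one
 * with only .read_u64 gets S_IRUGO (0444).
 */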
1496
Tejun Heoa9746d82014-05-13 12:19:22 -04001497/**
Tejun Heo8699b772016-02-22 22:25:46 -05001498 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
Tejun Heo0f060de2014-11-18 02:49:50 -05001499 * @subtree_control: the new subtree_control mask to consider
Tejun Heo5ced2512016-03-03 09:58:01 -05001500 * @this_ss_mask: available subsystems
Tejun Heoaf0ba672014-07-08 18:02:57 -04001501 *
1502 * On the default hierarchy, a subsystem may request other subsystems to be
1503 * enabled together through its ->depends_on mask. In such cases, more
1504 * subsystems than specified in "cgroup.subtree_control" may be enabled.
1505 *
Tejun Heo0f060de2014-11-18 02:49:50 -05001506 * This function calculates which subsystems need to be enabled if
Tejun Heo5ced2512016-03-03 09:58:01 -05001507 * @subtree_control is to be applied while restricted to @this_ss_mask.
Tejun Heoaf0ba672014-07-08 18:02:57 -04001508 */
Tejun Heo5ced2512016-03-03 09:58:01 -05001509static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
Tejun Heo667c2492014-07-08 18:02:56 -04001510{
Tejun Heo6e5c8302016-02-22 22:25:47 -05001511 u16 cur_ss_mask = subtree_control;
Tejun Heoaf0ba672014-07-08 18:02:57 -04001512 struct cgroup_subsys *ss;
1513 int ssid;
1514
1515 lockdep_assert_held(&cgroup_mutex);
1516
Tejun Heof6d635ad2016-03-08 11:51:26 -05001517 cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
1518
Tejun Heoaf0ba672014-07-08 18:02:57 -04001519 while (true) {
Tejun Heo6e5c8302016-02-22 22:25:47 -05001520 u16 new_ss_mask = cur_ss_mask;
Tejun Heoaf0ba672014-07-08 18:02:57 -04001521
Tejun Heob4e0eea2016-02-22 22:25:46 -05001522 do_each_subsys_mask(ss, ssid, cur_ss_mask) {
Aleksa Saraia966a4e2015-06-06 10:02:15 +10001523 new_ss_mask |= ss->depends_on;
Tejun Heob4e0eea2016-02-22 22:25:46 -05001524 } while_each_subsys_mask();
Tejun Heoaf0ba672014-07-08 18:02:57 -04001525
1526 /*
1527 * Mask out subsystems which aren't available. This can
1528 * happen only if some depended-upon subsystems were bound
1529 * to non-default hierarchies.
1530 */
Tejun Heo5ced2512016-03-03 09:58:01 -05001531 new_ss_mask &= this_ss_mask;
Tejun Heoaf0ba672014-07-08 18:02:57 -04001532
1533 if (new_ss_mask == cur_ss_mask)
1534 break;
1535 cur_ss_mask = new_ss_mask;
1536 }
1537
Tejun Heo0f060de2014-11-18 02:49:50 -05001538 return cur_ss_mask;
1539}
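/*
 * Worked example with a made-up dependency chain (A, B and C are
 * hypothetical controllers): if @subtree_control contains only A,
 * A->depends_on contains B, and B->depends_on contains C, the loop
 * above converges as {A} -> {A, B} -> {A, B, C} and then terminates
 * because new_ss_mask == cur_ss_mask.  Any of the three missing from
 * @this_ss_mask (e.g. bound to a legacy hierarchy) is masked back out
 * on every pass.
 */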
1540
1541/**
Tejun Heoa9746d82014-05-13 12:19:22 -04001542 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1543 * @kn: the kernfs_node being serviced
1544 *
1545 * This helper undoes cgroup_kn_lock_live() and should be invoked before
1546 * the method finishes if locking succeeded. Note that once this function
1547 * returns, the cgroup returned by cgroup_kn_lock_live() may become
1548 * inaccessible at any time. If the caller intends to continue to access the
1549 * cgroup, it should pin it before invoking this function.
1550 */
Tejun Heo0a268db2016-12-27 14:49:06 -05001551void cgroup_kn_unlock(struct kernfs_node *kn)
Tejun Heoa9746d82014-05-13 12:19:22 -04001552{
1553 struct cgroup *cgrp;
1554
1555 if (kernfs_type(kn) == KERNFS_DIR)
1556 cgrp = kn->priv;
1557 else
1558 cgrp = kn->parent->priv;
1559
1560 mutex_unlock(&cgroup_mutex);
Tejun Heoa9746d82014-05-13 12:19:22 -04001561
1562 kernfs_unbreak_active_protection(kn);
1563 cgroup_put(cgrp);
1564}
1565
1566/**
1567 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1568 * @kn: the kernfs_node being serviced
Tejun Heo945ba192016-03-03 09:58:00 -05001569 * @drain_offline: perform offline draining on the cgroup
Tejun Heoa9746d82014-05-13 12:19:22 -04001570 *
1571 * This helper is to be used by a cgroup kernfs method currently servicing
1572 * @kn. It breaks the active protection, performs cgroup locking and
1573 * verifies that the associated cgroup is alive. Returns the cgroup if
1574 * alive; otherwise, %NULL. A successful return should be undone by a
Tejun Heo945ba192016-03-03 09:58:00 -05001575 * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the
1576 * cgroup is drained of offlining csses before return.
Tejun Heoa9746d82014-05-13 12:19:22 -04001577 *
1578 * Any cgroup kernfs method implementation which requires locking the
1579 * associated cgroup should use this helper. It avoids nesting cgroup
1580 * locking under kernfs active protection and allows all kernfs operations
1581 * including self-removal.
1582 */
Tejun Heo0a268db2016-12-27 14:49:06 -05001583struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
Tejun Heoa9746d82014-05-13 12:19:22 -04001584{
1585 struct cgroup *cgrp;
1586
1587 if (kernfs_type(kn) == KERNFS_DIR)
1588 cgrp = kn->priv;
1589 else
1590 cgrp = kn->parent->priv;
Paul Menageddbcc7e2007-10-18 23:39:30 -07001591
Tejun Heo2bd59d42014-02-11 11:52:49 -05001592 /*
Tejun Heo01f64742014-05-13 12:19:23 -04001593 * We're gonna grab cgroup_mutex which nests outside kernfs
Tejun Heoa9746d82014-05-13 12:19:22 -04001594 * active_ref. cgroup liveness check alone provides enough
1595 * protection against removal. Ensure @cgrp stays accessible and
1596 * break the active_ref protection.
Tejun Heo2bd59d42014-02-11 11:52:49 -05001597 */
Li Zefanaa323622014-09-04 14:43:38 +08001598 if (!cgroup_tryget(cgrp))
1599 return NULL;
Tejun Heoa9746d82014-05-13 12:19:22 -04001600 kernfs_break_active_protection(kn);
Paul Menageddbcc7e2007-10-18 23:39:30 -07001601
Tejun Heo945ba192016-03-03 09:58:00 -05001602 if (drain_offline)
1603 cgroup_lock_and_drain_offline(cgrp);
1604 else
1605 mutex_lock(&cgroup_mutex);
Tejun Heoa9746d82014-05-13 12:19:22 -04001606
1607 if (!cgroup_is_dead(cgrp))
1608 return cgrp;
1609
1610 cgroup_kn_unlock(kn);
1611 return NULL;
Paul Menageddbcc7e2007-10-18 23:39:30 -07001612}
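/*
 * Skeleton of a typical kernfs method built on the pair above (a
 * sketch; compare cgroup_subtree_control_write() later in this file;
 * example_write() is a hypothetical handler name):
 *
 *	static ssize_t example_write(struct kernfs_open_file *of, char *buf,
 *				     size_t nbytes, loff_t off)
 *	{
 *		struct cgroup *cgrp;
 *
 *		cgrp = cgroup_kn_lock_live(of->kn, false);
 *		if (!cgrp)
 *			return -ENODEV;
 *		... operate on @cgrp under cgroup_mutex ...
 *		cgroup_kn_unlock(of->kn);
 *		return nbytes;
 *	}
 */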
1613
Li Zefan2739d3c2013-01-21 18:18:33 +08001614static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
Paul Menageddbcc7e2007-10-18 23:39:30 -07001615{
Tejun Heo2bd59d42014-02-11 11:52:49 -05001616 char name[CGROUP_FILE_NAME_MAX];
Paul Menageddbcc7e2007-10-18 23:39:30 -07001617
Tejun Heo01f64742014-05-13 12:19:23 -04001618 lockdep_assert_held(&cgroup_mutex);
Tejun Heo34c06252015-11-05 00:12:24 -05001619
1620 if (cft->file_offset) {
1621 struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
1622 struct cgroup_file *cfile = (void *)css + cft->file_offset;
1623
1624 spin_lock_irq(&cgroup_file_kn_lock);
1625 cfile->kn = NULL;
1626 spin_unlock_irq(&cgroup_file_kn_lock);
Tejun Heob12e3582018-04-26 14:29:04 -07001627
1628 del_timer_sync(&cfile->notify_timer);
Tejun Heo34c06252015-11-05 00:12:24 -05001629 }
1630
Tejun Heo2bd59d42014-02-11 11:52:49 -05001631 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
Tejun Heo05ef1d72012-04-01 12:09:56 -07001632}
1633
Aristeu Rozanski13af07d2012-08-23 16:53:29 -04001634/**
Tejun Heo4df8dc92015-09-18 17:54:23 -04001635 * css_clear_dir - remove subsys files in a cgroup directory
1636 * @css: target css
Aristeu Rozanski13af07d2012-08-23 16:53:29 -04001637 */
Tejun Heo334c3672016-03-03 09:58:01 -05001638static void css_clear_dir(struct cgroup_subsys_state *css)
Tejun Heo05ef1d72012-04-01 12:09:56 -07001639{
Tejun Heo334c3672016-03-03 09:58:01 -05001640 struct cgroup *cgrp = css->cgroup;
Tejun Heo4df8dc92015-09-18 17:54:23 -04001641 struct cftype *cfts;
Tejun Heo05ef1d72012-04-01 12:09:56 -07001642
Tejun Heo88cb04b2016-03-03 09:57:58 -05001643 if (!(css->flags & CSS_VISIBLE))
1644 return;
1645
1646 css->flags &= ~CSS_VISIBLE;
1647
Tejun Heo5faaf052018-04-26 14:29:04 -07001648 if (!css->ss) {
1649 if (cgroup_on_dfl(cgrp))
1650 cfts = cgroup_base_files;
1651 else
1652 cfts = cgroup1_base_files;
1653
Tejun Heo4df8dc92015-09-18 17:54:23 -04001654 cgroup_addrm_files(css, cgrp, cfts, false);
Tejun Heo5faaf052018-04-26 14:29:04 -07001655 } else {
1656 list_for_each_entry(cfts, &css->ss->cfts, node)
1657 cgroup_addrm_files(css, cgrp, cfts, false);
1658 }
Paul Menageddbcc7e2007-10-18 23:39:30 -07001659}
1660
Tejun Heoccdca212015-09-18 17:54:23 -04001661/**
Tejun Heo4df8dc92015-09-18 17:54:23 -04001662 * css_populate_dir - create subsys files in a cgroup directory
1663 * @css: target css
Tejun Heoccdca212015-09-18 17:54:23 -04001664 *
1665 * On failure, no file is added.
1666 */
Tejun Heo334c3672016-03-03 09:58:01 -05001667static int css_populate_dir(struct cgroup_subsys_state *css)
Tejun Heoccdca212015-09-18 17:54:23 -04001668{
Tejun Heo334c3672016-03-03 09:58:01 -05001669 struct cgroup *cgrp = css->cgroup;
Tejun Heo4df8dc92015-09-18 17:54:23 -04001670 struct cftype *cfts, *failed_cfts;
1671 int ret;
Tejun Heoccdca212015-09-18 17:54:23 -04001672
Tejun Heo03970d32016-03-03 09:58:00 -05001673 if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
Tejun Heo88cb04b2016-03-03 09:57:58 -05001674 return 0;
1675
Tejun Heo4df8dc92015-09-18 17:54:23 -04001676 if (!css->ss) {
1677 if (cgroup_on_dfl(cgrp))
Tejun Heod62beb72016-12-27 14:49:08 -05001678 cfts = cgroup_base_files;
Tejun Heo4df8dc92015-09-18 17:54:23 -04001679 else
Tejun Heod62beb72016-12-27 14:49:08 -05001680 cfts = cgroup1_base_files;
Tejun Heoccdca212015-09-18 17:54:23 -04001681
Tejun Heo5faaf052018-04-26 14:29:04 -07001682 ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
1683 if (ret < 0)
1684 return ret;
1685 } else {
1686 list_for_each_entry(cfts, &css->ss->cfts, node) {
1687 ret = cgroup_addrm_files(css, cgrp, cfts, true);
1688 if (ret < 0) {
1689 failed_cfts = cfts;
1690 goto err;
1691 }
Tejun Heoccdca212015-09-18 17:54:23 -04001692 }
1693 }
Tejun Heo88cb04b2016-03-03 09:57:58 -05001694
1695 css->flags |= CSS_VISIBLE;
1696
Tejun Heoccdca212015-09-18 17:54:23 -04001697 return 0;
1698err:
Tejun Heo4df8dc92015-09-18 17:54:23 -04001699 list_for_each_entry(cfts, &css->ss->cfts, node) {
1700 if (cfts == failed_cfts)
1701 break;
1702 cgroup_addrm_files(css, cgrp, cfts, false);
1703 }
Tejun Heoccdca212015-09-18 17:54:23 -04001704 return ret;
1705}
1706
Tejun Heo0a268db2016-12-27 14:49:06 -05001707int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
Paul Menageddbcc7e2007-10-18 23:39:30 -07001708{
Tejun Heo1ada4832015-09-18 17:54:23 -04001709 struct cgroup *dcgrp = &dst_root->cgrp;
Tejun Heo30159ec2013-06-25 11:53:37 -07001710 struct cgroup_subsys *ss;
Tejun Heo2d8f2432014-04-23 11:13:15 -04001711 int ssid, i, ret;
Paul Menageddbcc7e2007-10-18 23:39:30 -07001712
Tejun Heoace2bee2014-02-11 11:52:47 -05001713 lockdep_assert_held(&cgroup_mutex);
Ben Blumaae8aab2010-03-10 15:22:07 -08001714
Tejun Heob4e0eea2016-02-22 22:25:46 -05001715 do_each_subsys_mask(ss, ssid, ss_mask) {
Tejun Heof6d635ad2016-03-08 11:51:26 -05001716 /*
1717 * If @ss has non-root csses attached to it, can't move.
1718 * If @ss is an implicit controller, it is exempt from this
1719 * rule and can be stolen.
1720 */
1721 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
1722 !ss->implicit_on_dfl)
Tejun Heo3ed80a62014-02-08 10:36:58 -05001723 return -EBUSY;
Paul Menageddbcc7e2007-10-18 23:39:30 -07001724
Tejun Heo5df36032014-03-19 10:23:54 -04001725 /* can't move between two non-dummy roots either */
Tejun Heo7fd8c562014-04-23 11:13:16 -04001726 if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
Tejun Heo5df36032014-03-19 10:23:54 -04001727 return -EBUSY;
Tejun Heob4e0eea2016-02-22 22:25:46 -05001728 } while_each_subsys_mask();
Paul Menageddbcc7e2007-10-18 23:39:30 -07001729
Tejun Heob4e0eea2016-02-22 22:25:46 -05001730 do_each_subsys_mask(ss, ssid, ss_mask) {
Tejun Heo1ada4832015-09-18 17:54:23 -04001731 struct cgroup_root *src_root = ss->root;
1732 struct cgroup *scgrp = &src_root->cgrp;
1733 struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
Tejun Heo2d8f2432014-04-23 11:13:15 -04001734 struct css_set *cset;
Tejun Heo30159ec2013-06-25 11:53:37 -07001735
Tejun Heo1ada4832015-09-18 17:54:23 -04001736 WARN_ON(!css || cgroup_css(dcgrp, ss));
Tejun Heo73e80ed2013-08-13 11:01:55 -04001737
Tejun Heo334c3672016-03-03 09:58:01 -05001738 /* disable from the source */
1739 src_root->subsys_mask &= ~(1 << ssid);
1740 WARN_ON(cgroup_apply_control(scgrp));
1741 cgroup_finalize_control(scgrp, 0);
Tejun Heo4df8dc92015-09-18 17:54:23 -04001742
Tejun Heo334c3672016-03-03 09:58:01 -05001743 /* rebind */
Tejun Heo1ada4832015-09-18 17:54:23 -04001744 RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
1745 rcu_assign_pointer(dcgrp->subsys[ssid], css);
Tejun Heo5df36032014-03-19 10:23:54 -04001746 ss->root = dst_root;
Tejun Heo1ada4832015-09-18 17:54:23 -04001747 css->cgroup = dcgrp;
Tejun Heoa8a648c2013-06-24 15:21:47 -07001748
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03001749 spin_lock_irq(&css_set_lock);
Tejun Heo2d8f2432014-04-23 11:13:15 -04001750 hash_for_each(css_set_table, i, cset, hlist)
1751 list_move_tail(&cset->e_cset_node[ss->id],
Tejun Heo1ada4832015-09-18 17:54:23 -04001752 &dcgrp->e_csets[ss->id]);
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03001753 spin_unlock_irq(&css_set_lock);
Tejun Heo2d8f2432014-04-23 11:13:15 -04001754
Johannes Weinera7df69b2021-04-29 22:56:20 -07001755 if (ss->css_rstat_flush) {
1756 list_del_rcu(&css->rstat_css_node);
1757 list_add_rcu(&css->rstat_css_node,
1758 &dcgrp->rstat_css_list);
1759 }
1760
Tejun Heobd53d612014-04-23 11:13:16 -04001761 /* default hierarchy doesn't enable controllers by default */
Tejun Heof392e512014-04-23 11:13:14 -04001762 dst_root->subsys_mask |= 1 << ssid;
Tejun Heo49d1dc42015-09-18 11:56:28 -04001763 if (dst_root == &cgrp_dfl_root) {
1764 static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
1765 } else {
Tejun Heo1ada4832015-09-18 17:54:23 -04001766 dcgrp->subtree_control |= 1 << ssid;
Tejun Heo49d1dc42015-09-18 11:56:28 -04001767 static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
Tejun Heo667c2492014-07-08 18:02:56 -04001768 }
Tejun Heo73e80ed2013-08-13 11:01:55 -04001769
Tejun Heo334c3672016-03-03 09:58:01 -05001770 ret = cgroup_apply_control(dcgrp);
1771 if (ret)
1772 pr_warn("partial failure to rebind %s controller (err=%d)\n",
1773 ss->name, ret);
1774
Tejun Heo5df36032014-03-19 10:23:54 -04001775 if (ss->bind)
1776 ss->bind(css);
Tejun Heob4e0eea2016-02-22 22:25:46 -05001777 } while_each_subsys_mask();
Paul Menageddbcc7e2007-10-18 23:39:30 -07001778
Tejun Heo1ada4832015-09-18 17:54:23 -04001779 kernfs_activate(dcgrp->kn);
Paul Menageddbcc7e2007-10-18 23:39:30 -07001780 return 0;
1781}
1782
Tejun Heo1592c9b2016-12-27 14:49:08 -05001783int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1784 struct kernfs_root *kf_root)
Serge E. Hallyn4f41fc52016-05-09 09:59:55 -05001785{
Felipe Balbi09be4c82016-05-12 12:34:38 +03001786 int len = 0;
Serge E. Hallyn4f41fc52016-05-09 09:59:55 -05001787 char *buf = NULL;
1788 struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
1789 struct cgroup *ns_cgroup;
1790
1791 buf = kmalloc(PATH_MAX, GFP_KERNEL);
1792 if (!buf)
1793 return -ENOMEM;
1794
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03001795 spin_lock_irq(&css_set_lock);
Serge E. Hallyn4f41fc52016-05-09 09:59:55 -05001796 ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
1797 len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03001798 spin_unlock_irq(&css_set_lock);
Serge E. Hallyn4f41fc52016-05-09 09:59:55 -05001799
1800 if (len >= PATH_MAX)
1801 len = -ERANGE;
1802 else if (len > 0) {
1803 seq_escape(sf, buf, " \t\n\\");
1804 len = 0;
1805 }
1806 kfree(buf);
1807 return len;
1808}
1809
Al Viroe34a98d2019-01-17 00:22:58 -05001810enum cgroup2_param {
1811 Opt_nsdelegate,
Chris Down9852ae32019-05-31 22:30:22 -07001812 Opt_memory_localevents,
Johannes Weiner8a931f82020-04-01 21:07:07 -07001813 Opt_memory_recursiveprot,
Al Viroe34a98d2019-01-17 00:22:58 -05001814 nr__cgroup2_params
1815};
1816
Al Virod7167b12019-09-07 07:23:15 -04001817static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
Chris Down9852ae32019-05-31 22:30:22 -07001818 fsparam_flag("nsdelegate", Opt_nsdelegate),
1819 fsparam_flag("memory_localevents", Opt_memory_localevents),
Johannes Weiner8a931f82020-04-01 21:07:07 -07001820 fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
Al Viroe34a98d2019-01-17 00:22:58 -05001821 {}
1822};
1823
Al Viroe34a98d2019-01-17 00:22:58 -05001824static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
Tejun Heo5136f632017-06-27 14:30:28 -04001825{
Al Viroe34a98d2019-01-17 00:22:58 -05001826 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1827 struct fs_parse_result result;
1828 int opt;
Tejun Heo5136f632017-06-27 14:30:28 -04001829
Al Virod7167b12019-09-07 07:23:15 -04001830 opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
Al Viroe34a98d2019-01-17 00:22:58 -05001831 if (opt < 0)
1832 return opt;
Tejun Heo5136f632017-06-27 14:30:28 -04001833
Al Viroe34a98d2019-01-17 00:22:58 -05001834 switch (opt) {
1835 case Opt_nsdelegate:
1836 ctx->flags |= CGRP_ROOT_NS_DELEGATE;
Tejun Heo5136f632017-06-27 14:30:28 -04001837 return 0;
Chris Down9852ae32019-05-31 22:30:22 -07001838 case Opt_memory_localevents:
1839 ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1840 return 0;
Johannes Weiner8a931f82020-04-01 21:07:07 -07001841 case Opt_memory_recursiveprot:
1842 ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1843 return 0;
Tejun Heo5136f632017-06-27 14:30:28 -04001844 }
Al Viroe34a98d2019-01-17 00:22:58 -05001845 return -EINVAL;
Tejun Heo5136f632017-06-27 14:30:28 -04001846}
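/*
 * These flags map one-to-one to cgroup2 mount options, e.g. a mount
 * such as (illustrative command line):
 *
 *	mount -t cgroup2 -o nsdelegate,memory_recursiveprot none /sys/fs/cgroup
 *
 * sets CGRP_ROOT_NS_DELEGATE and CGRP_ROOT_MEMORY_RECURSIVE_PROT in
 * ctx->flags; an unrecognized option makes fs_parse() fail and the
 * mount is rejected.
 */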
1847
1848static void apply_cgroup_root_flags(unsigned int root_flags)
1849{
1850 if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
1851 if (root_flags & CGRP_ROOT_NS_DELEGATE)
1852 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1853 else
1854 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
Chris Down9852ae32019-05-31 22:30:22 -07001855
1856 if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1857 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1858 else
1859 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
Johannes Weiner8a931f82020-04-01 21:07:07 -07001860
1861 if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1862 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1863 else
1864 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
Tejun Heo5136f632017-06-27 14:30:28 -04001865 }
1866}
1867
1868static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
1869{
1870 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1871 seq_puts(seq, ",nsdelegate");
Chris Down9852ae32019-05-31 22:30:22 -07001872 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1873 seq_puts(seq, ",memory_localevents");
Johannes Weiner8a931f82020-04-01 21:07:07 -07001874 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1875 seq_puts(seq, ",memory_recursiveprot");
Tejun Heo5136f632017-06-27 14:30:28 -04001876 return 0;
1877}
1878
Al Viro90129622019-01-05 00:38:03 -05001879static int cgroup_reconfigure(struct fs_context *fc)
Paul Menageddbcc7e2007-10-18 23:39:30 -07001880{
Al Viro90129622019-01-05 00:38:03 -05001881 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
Tejun Heo5136f632017-06-27 14:30:28 -04001882
Al Virof5dfb532019-01-16 23:42:38 -05001883 apply_cgroup_root_flags(ctx->flags);
Tejun Heo5136f632017-06-27 14:30:28 -04001884 return 0;
Paul Menageddbcc7e2007-10-18 23:39:30 -07001885}
1886
Paul Menagecc31edc2008-10-18 20:28:04 -07001887static void init_cgroup_housekeeping(struct cgroup *cgrp)
1888{
Tejun Heo2d8f2432014-04-23 11:13:15 -04001889 struct cgroup_subsys *ss;
1890 int ssid;
1891
Tejun Heod5c419b2014-05-16 13:22:48 -04001892 INIT_LIST_HEAD(&cgrp->self.sibling);
1893 INIT_LIST_HEAD(&cgrp->self.children);
Tejun Heo69d02062013-06-12 21:04:50 -07001894 INIT_LIST_HEAD(&cgrp->cset_links);
Ben Blum72a8cb32009-09-23 15:56:27 -07001895 INIT_LIST_HEAD(&cgrp->pidlists);
1896 mutex_init(&cgrp->pidlist_mutex);
Tejun Heo9d800df2014-05-14 09:15:00 -04001897 cgrp->self.cgroup = cgrp;
Tejun Heo184faf32014-05-16 13:22:51 -04001898 cgrp->self.flags |= CSS_ONLINE;
Tejun Heo454000a2017-05-15 09:34:02 -04001899 cgrp->dom_cgrp = cgrp;
Roman Gushchin1a926e02017-07-28 18:28:44 +01001900 cgrp->max_descendants = INT_MAX;
1901 cgrp->max_depth = INT_MAX;
Tejun Heo8f534702018-04-26 14:29:05 -07001902 INIT_LIST_HEAD(&cgrp->rstat_css_list);
Tejun Heod4ff7492018-04-26 14:29:04 -07001903 prev_cputime_init(&cgrp->prev_cputime);
Tejun Heo2d8f2432014-04-23 11:13:15 -04001904
1905 for_each_subsys(ss, ssid)
1906 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
Tejun Heof8f22e52014-04-23 11:13:16 -04001907
1908 init_waitqueue_head(&cgrp->offline_waitq);
Tejun Heod62beb72016-12-27 14:49:08 -05001909 INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
Paul Menagecc31edc2008-10-18 20:28:04 -07001910}
Paul Menagec6d57f32009-09-23 15:56:19 -07001911
Al Virocf6299b12019-01-17 02:25:51 -05001912void init_cgroup_root(struct cgroup_fs_context *ctx)
Paul Menageddbcc7e2007-10-18 23:39:30 -07001913{
Al Virocf6299b12019-01-17 02:25:51 -05001914 struct cgroup_root *root = ctx->root;
Tejun Heo3dd06ff2014-03-19 10:23:54 -04001915 struct cgroup *cgrp = &root->cgrp;
Tejun Heob0ca5a82012-04-01 12:09:54 -07001916
Paul Menageddbcc7e2007-10-18 23:39:30 -07001917 INIT_LIST_HEAD(&root->root_list);
Tejun Heo3c9c8252014-02-12 09:29:50 -05001918 atomic_set(&root->nr_cgrps, 1);
Paul Menagebd89aab2007-10-18 23:40:44 -07001919 cgrp->root = root;
Paul Menagecc31edc2008-10-18 20:28:04 -07001920 init_cgroup_housekeeping(cgrp);
Paul Menageddbcc7e2007-10-18 23:39:30 -07001921
Al Virof5dfb532019-01-16 23:42:38 -05001922 root->flags = ctx->flags;
1923 if (ctx->release_agent)
1924 strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
1925 if (ctx->name)
1926 strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
1927 if (ctx->cpuset_clone_children)
Tejun Heo3dd06ff2014-03-19 10:23:54 -04001928 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
Paul Menagec6d57f32009-09-23 15:56:19 -07001929}
1930
Al Viro35ac1182019-01-12 00:20:54 -05001931int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
Paul Menage2c6ab6d2009-09-23 15:56:23 -07001932{
Tejun Heod427dfe2014-02-11 11:52:48 -05001933 LIST_HEAD(tmp_links);
Tejun Heo3dd06ff2014-03-19 10:23:54 -04001934 struct cgroup *root_cgrp = &root->cgrp;
Tejun Heofa069902016-12-27 14:49:07 -05001935 struct kernfs_syscall_ops *kf_sops;
Tejun Heod427dfe2014-02-11 11:52:48 -05001936 struct css_set *cset;
Tejun Heod427dfe2014-02-11 11:52:48 -05001937 int i, ret;
Paul Menage2c6ab6d2009-09-23 15:56:23 -07001938
Tejun Heod427dfe2014-02-11 11:52:48 -05001939 lockdep_assert_held(&cgroup_mutex);
Paul Menage2c6ab6d2009-09-23 15:56:23 -07001940
Zefan Li9732adc2017-04-19 10:15:59 +08001941 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
Al Viro35ac1182019-01-12 00:20:54 -05001942 0, GFP_KERNEL);
Tejun Heo9d755d32014-05-14 09:15:02 -04001943 if (ret)
1944 goto out;
1945
Tejun Heod427dfe2014-02-11 11:52:48 -05001946 /*
Tejun Heof0d9a5f2015-10-15 16:41:53 -04001947 * We're accessing css_set_count without locking css_set_lock here,
Tejun Heod427dfe2014-02-11 11:52:48 -05001948 * but that's OK - it can only be increased by someone holding
Tejun Heo04313592016-03-03 09:58:01 -05001949 * cgroup_lock, and that's us. Later rebinding may disable
1950 * controllers on the default hierarchy and thus create new csets,
1951 * which can't be more than the existing ones. Allocate 2x.
Tejun Heod427dfe2014-02-11 11:52:48 -05001952 */
Tejun Heo04313592016-03-03 09:58:01 -05001953 ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
Paul Menageddbcc7e2007-10-18 23:39:30 -07001954 if (ret)
Tejun Heo9d755d32014-05-14 09:15:02 -04001955 goto cancel_ref;
Paul Menageddbcc7e2007-10-18 23:39:30 -07001956
Tejun Heo985ed672014-03-19 10:23:53 -04001957 ret = cgroup_init_root_id(root);
Tejun Heod427dfe2014-02-11 11:52:48 -05001958 if (ret)
Tejun Heo9d755d32014-05-14 09:15:02 -04001959 goto cancel_ref;
Paul Menageddbcc7e2007-10-18 23:39:30 -07001960
Tejun Heofa069902016-12-27 14:49:07 -05001961 kf_sops = root == &cgrp_dfl_root ?
1962 &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
1963
1964 root->kf_root = kernfs_create_root(kf_sops,
Shaohua Liaa818822017-07-12 11:49:51 -07001965 KERNFS_ROOT_CREATE_DEACTIVATED |
Daniel Xu38aca302020-03-12 13:03:17 -07001966 KERNFS_ROOT_SUPPORT_EXPORTOP |
1967 KERNFS_ROOT_SUPPORT_USER_XATTR,
Tejun Heo2bd59d42014-02-11 11:52:49 -05001968 root_cgrp);
1969 if (IS_ERR(root->kf_root)) {
1970 ret = PTR_ERR(root->kf_root);
1971 goto exit_root_id;
1972 }
1973 root_cgrp->kn = root->kf_root->kn;
Tejun Heod7495342019-11-14 14:46:51 -08001974 WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
Tejun Heo74321032019-11-04 15:54:30 -08001975 root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
Paul Menageddbcc7e2007-10-18 23:39:30 -07001976
Tejun Heo334c3672016-03-03 09:58:01 -05001977 ret = css_populate_dir(&root_cgrp->self);
Tejun Heod427dfe2014-02-11 11:52:48 -05001978 if (ret)
Tejun Heo2bd59d42014-02-11 11:52:49 -05001979 goto destroy_root;
Paul Menageddbcc7e2007-10-18 23:39:30 -07001980
Johannes Weinera7df69b2021-04-29 22:56:20 -07001981 ret = cgroup_rstat_init(root_cgrp);
Tejun Heod427dfe2014-02-11 11:52:48 -05001982 if (ret)
Tejun Heo2bd59d42014-02-11 11:52:49 -05001983 goto destroy_root;
Al Viro0df6a632010-12-21 13:29:29 -05001984
Johannes Weinera7df69b2021-04-29 22:56:20 -07001985 ret = rebind_subsystems(root, ss_mask);
1986 if (ret)
1987 goto exit_stats;
1988
Alexei Starovoitov324bda9e62017-10-02 22:50:21 -07001989 ret = cgroup_bpf_inherit(root_cgrp);
1990 WARN_ON_ONCE(ret);
1991
Tejun Heoed1777d2016-08-10 11:23:44 -04001992 trace_cgroup_setup_root(root);
1993
Tejun Heod427dfe2014-02-11 11:52:48 -05001994 /*
1995 * There must be no failure case after here, since rebinding takes
1996 * care of subsystems' refcounts, which are explicitly dropped in
1997 * the failure exit path.
1998 */
1999 list_add(&root->root_list, &cgroup_roots);
2000 cgroup_root_count++;
Paul Menageddbcc7e2007-10-18 23:39:30 -07002001
Tejun Heod427dfe2014-02-11 11:52:48 -05002002 /*
Tejun Heo3dd06ff2014-03-19 10:23:54 -04002003 * Link the root cgroup in this hierarchy into all the css_set
Tejun Heod427dfe2014-02-11 11:52:48 -05002004 * objects.
2005 */
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03002006 spin_lock_irq(&css_set_lock);
Tejun Heo0de09422015-10-15 16:41:49 -04002007 hash_for_each(css_set_table, i, cset, hlist) {
Tejun Heod427dfe2014-02-11 11:52:48 -05002008 link_css_set(&tmp_links, cset, root_cgrp);
Tejun Heo0de09422015-10-15 16:41:49 -04002009 if (css_set_populated(cset))
2010 cgroup_update_populated(root_cgrp, true);
2011 }
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03002012 spin_unlock_irq(&css_set_lock);
Paul Menageddbcc7e2007-10-18 23:39:30 -07002013
Tejun Heod5c419b2014-05-16 13:22:48 -04002014 BUG_ON(!list_empty(&root_cgrp->self.children));
Tejun Heo3c9c8252014-02-12 09:29:50 -05002015 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
Tejun Heod427dfe2014-02-11 11:52:48 -05002016
2017 ret = 0;
Tejun Heo2bd59d42014-02-11 11:52:49 -05002018 goto out;
Tejun Heod427dfe2014-02-11 11:52:48 -05002019
Johannes Weinera7df69b2021-04-29 22:56:20 -07002020exit_stats:
2021 cgroup_rstat_exit(root_cgrp);
Tejun Heo2bd59d42014-02-11 11:52:49 -05002022destroy_root:
2023 kernfs_destroy_root(root->kf_root);
2024 root->kf_root = NULL;
2025exit_root_id:
Tejun Heod427dfe2014-02-11 11:52:48 -05002026 cgroup_exit_root_id(root);
Tejun Heo9d755d32014-05-14 09:15:02 -04002027cancel_ref:
Tejun Heo9a1049d2014-06-28 08:10:14 -04002028 percpu_ref_exit(&root_cgrp->self.refcnt);
Tejun Heo2bd59d42014-02-11 11:52:49 -05002029out:
Tejun Heod427dfe2014-02-11 11:52:48 -05002030 free_cgrp_cset_links(&tmp_links);
2031 return ret;
Paul Menageddbcc7e2007-10-18 23:39:30 -07002032}
2033
Al Virocca8f322019-01-17 10:14:26 -05002034int cgroup_do_get_tree(struct fs_context *fc)
Paul Menageddbcc7e2007-10-18 23:39:30 -07002035{
Al Viro71d883c2019-01-17 02:44:07 -05002036 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
David Howells23bf1b62018-11-01 23:07:26 +00002037 int ret;
Paul Menagec6d57f32009-09-23 15:56:19 -07002038
David Howells23bf1b62018-11-01 23:07:26 +00002039 ctx->kfc.root = ctx->root->kf_root;
Al Virocca8f322019-01-17 10:14:26 -05002040 if (fc->fs_type == &cgroup2_fs_type)
David Howells23bf1b62018-11-01 23:07:26 +00002041 ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
Al Virocca8f322019-01-17 10:14:26 -05002042 else
David Howells23bf1b62018-11-01 23:07:26 +00002043 ctx->kfc.magic = CGROUP_SUPER_MAGIC;
2044 ret = kernfs_get_tree(fc);
Serge Hallyned825712016-01-29 02:54:09 -06002045
Paul Menagec6d57f32009-09-23 15:56:19 -07002046 /*
Tejun Heo633feee32016-12-27 14:49:07 -05002047	 * In a non-init cgroup namespace, instead of the root cgroup's dentry,
2048 * we return the dentry corresponding to the cgroupns->root_cgrp.
Serge Hallyned825712016-01-29 02:54:09 -06002049 */
Al Virocca8f322019-01-17 10:14:26 -05002050 if (!ret && ctx->ns != &init_cgroup_ns) {
Serge Hallyned825712016-01-29 02:54:09 -06002051 struct dentry *nsdentry;
Al Viro71d883c2019-01-17 02:44:07 -05002052 struct super_block *sb = fc->root->d_sb;
Serge Hallyned825712016-01-29 02:54:09 -06002053 struct cgroup *cgrp;
2054
2055 mutex_lock(&cgroup_mutex);
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03002056 spin_lock_irq(&css_set_lock);
Serge Hallyned825712016-01-29 02:54:09 -06002057
Al Virocca8f322019-01-17 10:14:26 -05002058 cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
Serge Hallyned825712016-01-29 02:54:09 -06002059
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03002060 spin_unlock_irq(&css_set_lock);
Serge Hallyned825712016-01-29 02:54:09 -06002061 mutex_unlock(&cgroup_mutex);
2062
Al Viro399504e2019-01-06 11:41:29 -05002063 nsdentry = kernfs_node_dentry(cgrp->kn, sb);
Al Viro71d883c2019-01-17 02:44:07 -05002064 dput(fc->root);
Al Viro71d883c2019-01-17 02:44:07 -05002065 if (IS_ERR(nsdentry)) {
Al Viro399504e2019-01-06 11:41:29 -05002066 deactivate_locked_super(sb);
Al Viro630faf82019-11-10 11:53:27 -05002067 ret = PTR_ERR(nsdentry);
2068 nsdentry = NULL;
Al Viro71d883c2019-01-17 02:44:07 -05002069 }
Al Viro630faf82019-11-10 11:53:27 -05002070 fc->root = nsdentry;
Serge Hallyned825712016-01-29 02:54:09 -06002071 }
2072
David Howells23bf1b62018-11-01 23:07:26 +00002073 if (!ctx->kfc.new_sb_created)
Al Viro71d883c2019-01-17 02:44:07 -05002074 cgroup_put(&ctx->root->cgrp);
Li Zefan3a32bd72014-06-30 11:50:59 +08002075
Al Viro71d883c2019-01-17 02:44:07 -05002076 return ret;
Tejun Heo633feee32016-12-27 14:49:07 -05002077}
2078
Al Viro90129622019-01-05 00:38:03 -05002079/*
2080 * Destroy a cgroup filesystem context.
2081 */
2082static void cgroup_fs_context_free(struct fs_context *fc)
Tejun Heo633feee32016-12-27 14:49:07 -05002083{
Al Viro90129622019-01-05 00:38:03 -05002084 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2085
Al Virof5dfb532019-01-16 23:42:38 -05002086 kfree(ctx->name);
2087 kfree(ctx->release_agent);
Al Virocca8f322019-01-17 10:14:26 -05002088 put_cgroup_ns(ctx->ns);
David Howells23bf1b62018-11-01 23:07:26 +00002089 kernfs_free_fs_context(fc);
Al Viro90129622019-01-05 00:38:03 -05002090 kfree(ctx);
2091}
2092
Al Viro90129622019-01-05 00:38:03 -05002093static int cgroup_get_tree(struct fs_context *fc)
Tejun Heo633feee32016-12-27 14:49:07 -05002094{
Al Viro90129622019-01-05 00:38:03 -05002095 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
Tejun Heo5136f632017-06-27 14:30:28 -04002096 int ret;
Tejun Heo633feee32016-12-27 14:49:07 -05002097
Al Viro90129622019-01-05 00:38:03 -05002098 cgrp_dfl_visible = true;
2099 cgroup_get_live(&cgrp_dfl_root.cgrp);
Al Virocf6299b12019-01-17 02:25:51 -05002100 ctx->root = &cgrp_dfl_root;
Tejun Heo633feee32016-12-27 14:49:07 -05002101
Al Virocca8f322019-01-17 10:14:26 -05002102 ret = cgroup_do_get_tree(fc);
Al Viro71d883c2019-01-17 02:44:07 -05002103 if (!ret)
2104 apply_cgroup_root_flags(ctx->flags);
2105 return ret;
Al Viro90129622019-01-05 00:38:03 -05002106}
2107
Al Viro90129622019-01-05 00:38:03 -05002108static const struct fs_context_operations cgroup_fs_context_ops = {
2109 .free = cgroup_fs_context_free,
Al Viroe34a98d2019-01-17 00:22:58 -05002110 .parse_param = cgroup2_parse_param,
Al Viro90129622019-01-05 00:38:03 -05002111 .get_tree = cgroup_get_tree,
2112 .reconfigure = cgroup_reconfigure,
2113};
2114
2115static const struct fs_context_operations cgroup1_fs_context_ops = {
2116 .free = cgroup_fs_context_free,
Al Viro8d2451f2019-01-17 00:15:11 -05002117 .parse_param = cgroup1_parse_param,
Al Viro90129622019-01-05 00:38:03 -05002118 .get_tree = cgroup1_get_tree,
2119 .reconfigure = cgroup1_reconfigure,
2120};
2121
2122/*
David Howells23bf1b62018-11-01 23:07:26 +00002123 * Initialise the cgroup filesystem creation/reconfiguration context. Notably,
2124 * we select the namespace we're going to use.
Al Viro90129622019-01-05 00:38:03 -05002125 */
2126static int cgroup_init_fs_context(struct fs_context *fc)
2127{
2128 struct cgroup_fs_context *ctx;
2129
2130 ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
2131 if (!ctx)
2132 return -ENOMEM;
Tejun Heo633feee32016-12-27 14:49:07 -05002133
Al Virocca8f322019-01-17 10:14:26 -05002134 ctx->ns = current->nsproxy->cgroup_ns;
2135 get_cgroup_ns(ctx->ns);
David Howells23bf1b62018-11-01 23:07:26 +00002136 fc->fs_private = &ctx->kfc;
Al Viro90129622019-01-05 00:38:03 -05002137 if (fc->fs_type == &cgroup2_fs_type)
2138 fc->ops = &cgroup_fs_context_ops;
2139 else
2140 fc->ops = &cgroup1_fs_context_ops;
Al Virof7a99452019-05-12 12:42:58 -04002141 put_user_ns(fc->user_ns);
David Howells23bf1b62018-11-01 23:07:26 +00002142 fc->user_ns = get_user_ns(ctx->ns->user_ns);
2143 fc->global = true;
Al Viro90129622019-01-05 00:38:03 -05002144 return 0;
Paul Menageddbcc7e2007-10-18 23:39:30 -07002145}
2146
SeongJae Parkdd4b0a42014-01-18 16:56:47 +09002147static void cgroup_kill_sb(struct super_block *sb)
2148{
Tejun Heo2bd59d42014-02-11 11:52:49 -05002149 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
Tejun Heo3dd06ff2014-03-19 10:23:54 -04002150 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
Paul Menageddbcc7e2007-10-18 23:39:30 -07002151
Tejun Heo9d755d32014-05-14 09:15:02 -04002152 /*
Al Viro35ac1182019-01-12 00:20:54 -05002153 * If @root doesn't have any children, start killing it.
Tejun Heo9d755d32014-05-14 09:15:02 -04002154 * This prevents new mounts by disabling percpu_ref_tryget_live().
2155 * cgroup_mount() may wait for @root's release.
Li Zefan1f779fb2014-06-04 16:48:15 +08002156 *
2157 * And don't kill the default root.
Tejun Heo9d755d32014-05-14 09:15:02 -04002158 */
Al Viro35ac1182019-01-12 00:20:54 -05002159 if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
2160 !percpu_ref_is_dying(&root->cgrp.self.refcnt))
Tejun Heo9d755d32014-05-14 09:15:02 -04002161 percpu_ref_kill(&root->cgrp.self.refcnt);
Al Viro35ac1182019-01-12 00:20:54 -05002162 cgroup_put(&root->cgrp);
Tejun Heo2bd59d42014-02-11 11:52:49 -05002163 kernfs_kill_sb(sb);
Paul Menageddbcc7e2007-10-18 23:39:30 -07002164}
2165
Tejun Heo0a268db2016-12-27 14:49:06 -05002166struct file_system_type cgroup_fs_type = {
Al Viro8d2451f2019-01-17 00:15:11 -05002167 .name = "cgroup",
2168 .init_fs_context = cgroup_init_fs_context,
Al Virod7167b12019-09-07 07:23:15 -04002169 .parameters = cgroup1_fs_parameters,
Al Viro8d2451f2019-01-17 00:15:11 -05002170 .kill_sb = cgroup_kill_sb,
2171 .fs_flags = FS_USERNS_MOUNT,
Paul Menageddbcc7e2007-10-18 23:39:30 -07002172};
2173
Tejun Heo67e9c742015-11-16 11:13:34 -05002174static struct file_system_type cgroup2_fs_type = {
Al Viroe34a98d2019-01-17 00:22:58 -05002175 .name = "cgroup2",
2176 .init_fs_context = cgroup_init_fs_context,
Al Virod7167b12019-09-07 07:23:15 -04002177 .parameters = cgroup2_fs_parameters,
Al Viroe34a98d2019-01-17 00:22:58 -05002178 .kill_sb = cgroup_kill_sb,
2179 .fs_flags = FS_USERNS_MOUNT,
Tejun Heo67e9c742015-11-16 11:13:34 -05002180};
2181
Al Virod5f68d32019-05-13 12:33:22 -04002182#ifdef CONFIG_CPUSETS
2183static const struct fs_context_operations cpuset_fs_context_ops = {
2184 .get_tree = cgroup1_get_tree,
2185 .free = cgroup_fs_context_free,
2186};
2187
2188/*
2189 * This is ugly, but preserves the userspace API for existing cpuset
2190 * users. If someone tries to mount the "cpuset" filesystem, we
2191 * silently switch it to mount "cgroup" instead
2192 */
2193static int cpuset_init_fs_context(struct fs_context *fc)
2194{
2195 char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
2196 struct cgroup_fs_context *ctx;
2197 int err;
2198
2199 err = cgroup_init_fs_context(fc);
2200 if (err) {
2201 kfree(agent);
2202 return err;
2203 }
2204
2205 fc->ops = &cpuset_fs_context_ops;
2206
2207 ctx = cgroup_fc2context(fc);
2208 ctx->subsys_mask = 1 << cpuset_cgrp_id;
2209 ctx->flags |= CGRP_ROOT_NOPREFIX;
2210 ctx->release_agent = agent;
2211
2212 get_filesystem(&cgroup_fs_type);
2213 put_filesystem(fc->fs_type);
2214 fc->fs_type = &cgroup_fs_type;
2215
2216 return 0;
2217}
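/*
 * Net effect (sketch): a legacy
 *
 *	mount -t cpuset none /dev/cpuset
 *
 * ends up behaving like
 *
 *	mount -t cgroup -o cpuset,noprefix,release_agent=/sbin/cpuset_release_agent none /dev/cpuset
 */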
2218
2219static struct file_system_type cpuset_fs_type = {
2220 .name = "cpuset",
2221 .init_fs_context = cpuset_init_fs_context,
2222 .fs_flags = FS_USERNS_MOUNT,
2223};
2224#endif
2225
Tejun Heo0a268db2016-12-27 14:49:06 -05002226int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
2227 struct cgroup_namespace *ns)
Aditya Kalia79a9082016-01-29 02:54:06 -06002228{
2229 struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
Aditya Kalia79a9082016-01-29 02:54:06 -06002230
Tejun Heo4c737b42016-08-10 11:23:44 -04002231 return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
Aditya Kalia79a9082016-01-29 02:54:06 -06002232}
2233
Tejun Heo4c737b42016-08-10 11:23:44 -04002234int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
2235 struct cgroup_namespace *ns)
Aditya Kalia79a9082016-01-29 02:54:06 -06002236{
Tejun Heo4c737b42016-08-10 11:23:44 -04002237 int ret;
Aditya Kalia79a9082016-01-29 02:54:06 -06002238
2239 mutex_lock(&cgroup_mutex);
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03002240 spin_lock_irq(&css_set_lock);
Aditya Kalia79a9082016-01-29 02:54:06 -06002241
2242 ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
2243
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03002244 spin_unlock_irq(&css_set_lock);
Aditya Kalia79a9082016-01-29 02:54:06 -06002245 mutex_unlock(&cgroup_mutex);
2246
2247 return ret;
2248}
2249EXPORT_SYMBOL_GPL(cgroup_path_ns);
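/*
 * Usage sketch (hypothetical caller): render @cgrp's path relative to
 * the initial cgroup namespace.  A negative return indicates failure;
 * a return >= the buffer length means the buffer was too small.
 *
 *	char *buf = kmalloc(PATH_MAX, GFP_KERNEL);
 *
 *	if (buf && cgroup_path_ns(cgrp, buf, PATH_MAX, &init_cgroup_ns) >= 0)
 *		pr_debug("cgroup path: %s\n", buf);
 *	kfree(buf);
 */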
2250
Li Zefana043e3b2008-02-23 15:24:09 -08002251/**
Tejun Heo913ffdb2013-07-11 16:34:48 -07002252 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
Tejun Heo857a2be2013-04-14 20:50:08 -07002253 * @task: target task
Tejun Heo857a2be2013-04-14 20:50:08 -07002254 * @buf: the buffer to write the path into
2255 * @buflen: the length of the buffer
2256 *
Tejun Heo913ffdb2013-07-11 16:34:48 -07002257 * Determine @task's cgroup on the first (the one with the lowest non-zero
2258 * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
2259 * function grabs cgroup_mutex and shouldn't be used inside locks used by
2260 * cgroup controller callbacks.
2261 *
Tejun Heoe61734c2014-02-12 09:29:50 -05002262 * Return value is the same as kernfs_path().
Tejun Heo857a2be2013-04-14 20:50:08 -07002263 */
Tejun Heo4c737b42016-08-10 11:23:44 -04002264int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
Tejun Heo857a2be2013-04-14 20:50:08 -07002265{
Tejun Heo3dd06ff2014-03-19 10:23:54 -04002266 struct cgroup_root *root;
Tejun Heo913ffdb2013-07-11 16:34:48 -07002267 struct cgroup *cgrp;
Tejun Heoe61734c2014-02-12 09:29:50 -05002268 int hierarchy_id = 1;
Tejun Heo4c737b42016-08-10 11:23:44 -04002269 int ret;
Tejun Heo857a2be2013-04-14 20:50:08 -07002270
2271 mutex_lock(&cgroup_mutex);
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03002272 spin_lock_irq(&css_set_lock);
Tejun Heo857a2be2013-04-14 20:50:08 -07002273
Tejun Heo913ffdb2013-07-11 16:34:48 -07002274 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
2275
Tejun Heo857a2be2013-04-14 20:50:08 -07002276 if (root) {
2277 cgrp = task_cgroup_from_root(task, root);
Tejun Heo4c737b42016-08-10 11:23:44 -04002278 ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
Tejun Heo913ffdb2013-07-11 16:34:48 -07002279 } else {
2280 /* if no hierarchy exists, everyone is in "/" */
Tejun Heo4c737b42016-08-10 11:23:44 -04002281 ret = strlcpy(buf, "/", buflen);
Tejun Heo857a2be2013-04-14 20:50:08 -07002282 }
2283
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03002284 spin_unlock_irq(&css_set_lock);
Tejun Heo857a2be2013-04-14 20:50:08 -07002285 mutex_unlock(&cgroup_mutex);
Tejun Heo4c737b42016-08-10 11:23:44 -04002286 return ret;
Tejun Heo857a2be2013-04-14 20:50:08 -07002287}
Tejun Heo913ffdb2013-07-11 16:34:48 -07002288EXPORT_SYMBOL_GPL(task_cgroup_path);
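/*
 * Usage sketch (hypothetical caller); with no v1 hierarchy mounted the
 * task is simply reported as being in "/":
 *
 *	char *buf = kmalloc(PATH_MAX, GFP_KERNEL);
 *
 *	if (buf && task_cgroup_path(current, buf, PATH_MAX) >= 0)
 *		pr_debug("first-hierarchy cgroup: %s\n", buf);
 *	kfree(buf);
 */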
Tejun Heo857a2be2013-04-14 20:50:08 -07002289
/**
 * cgroup_migrate_add_task - add a migration target task to a migration context
 * @task: target task
 * @mgctx: target migration context
 *
 * Add @task, which is a migration target, to @mgctx->tset.  This function
 * becomes a noop if @task doesn't need to be migrated.  @task's css_set
 * should have been added as a migration source and @task->cg_list will be
 * moved from the css_set's tasks list to its mg_tasks list.
 */
static void cgroup_migrate_add_task(struct task_struct *task,
				    struct cgroup_mgctx *mgctx)
{
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	/* @task either already exited or can't exit until the end */
	if (task->flags & PF_EXITING)
		return;

	/* cgroup_threadgroup_rwsem protects racing against forks */
	WARN_ON_ONCE(list_empty(&task->cg_list));

	cset = task_css_set(task);
	if (!cset->mg_src_cgrp)
		return;

	mgctx->tset.nr_tasks++;

	list_move_tail(&task->cg_list, &cset->mg_tasks);
	if (list_empty(&cset->mg_node))
		list_add_tail(&cset->mg_node,
			      &mgctx->tset.src_csets);
	if (list_empty(&cset->mg_dst_cset->mg_node))
		list_add_tail(&cset->mg_dst_cset->mg_node,
			      &mgctx->tset.dst_csets);
}

/**
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * @tset iteration is initialized and the first task is returned.
 */
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
					 struct cgroup_subsys_state **dst_cssp)
{
	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
	tset->cur_task = NULL;

	return cgroup_taskset_next(tset, dst_cssp);
}

/**
 * cgroup_taskset_next - iterate to the next task in taskset
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * Return the next task in @tset.  Iteration must have been initialized
 * with cgroup_taskset_first().
 */
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
					struct cgroup_subsys_state **dst_cssp)
{
	struct css_set *cset = tset->cur_cset;
	struct task_struct *task = tset->cur_task;

	while (&cset->mg_node != tset->csets) {
		if (!task)
			task = list_first_entry(&cset->mg_tasks,
						struct task_struct, cg_list);
		else
			task = list_next_entry(task, cg_list);

		if (&task->cg_list != &cset->mg_tasks) {
			tset->cur_cset = cset;
			tset->cur_task = task;

			/*
			 * This function may be called both before and
			 * after cgroup_migrate_execute().  The two cases
			 * can be distinguished by looking at whether @cset
			 * has its ->mg_dst_cset set.
			 */
			if (cset->mg_dst_cset)
				*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
			else
				*dst_cssp = cset->subsys[tset->ssid];

			return task;
		}

		cset = list_next_entry(cset, mg_node);
		task = NULL;
	}

	return NULL;
}

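/*
 * Illustrative sketch, not part of the original file: how a controller's
 * ->attach() method typically consumes a taskset.  cgroup_taskset_for_each()
 * is the real helper from linux/cgroup.h wrapping cgroup_taskset_first() and
 * cgroup_taskset_next(); "ex_attach" and the loop body are hypothetical.
 */
static void __maybe_unused ex_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *dst_css;
	struct task_struct *task;

	cgroup_taskset_for_each(task, dst_css, tset) {
		/* @dst_css is @task's destination css; retune @task here */
	}
}
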
/**
 * cgroup_migrate_execute - migrate a taskset
 * @mgctx: migration context
 *
 * Migrate tasks in @mgctx as set up by migration preparation functions.
 * This function fails iff one of the ->can_attach callbacks fails and
 * guarantees that either all or none of the tasks in @mgctx are migrated.
 * @mgctx is consumed regardless of success.
 */
static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
	struct cgroup_taskset *tset = &mgctx->tset;
	struct cgroup_subsys *ss;
	struct task_struct *task, *tmp_task;
	struct css_set *cset, *tmp_cset;
	int ssid, failed_ssid, ret;

	/* check that we can legitimately attach to the cgroup */
	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ss->can_attach) {
				tset->ssid = ssid;
				ret = ss->can_attach(tset);
				if (ret) {
					failed_ssid = ssid;
					goto out_cancel_attach;
				}
			}
		} while_each_subsys_mask();
	}

	/*
	 * Now that we're guaranteed success, proceed to move all tasks to
	 * the new cgroup.  There are no failure cases after here, so this
	 * is the commit point.
	 */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(cset, &tset->src_csets, mg_node) {
		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
			struct css_set *from_cset = task_css_set(task);
			struct css_set *to_cset = cset->mg_dst_cset;

			get_css_set(to_cset);
			to_cset->nr_tasks++;
			css_set_move_task(task, from_cset, to_cset, true);
			from_cset->nr_tasks--;
			/*
			 * If the source or destination cgroup is frozen,
			 * the task might need to change its state.
			 */
			cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
						    to_cset->dfl_cgrp);
			put_css_set_locked(from_cset);
		}
	}
	spin_unlock_irq(&css_set_lock);

	/*
	 * Migration is committed, all target tasks are now on dst_csets.
	 * Nothing is sensitive to fork() after this point.  Notify
	 * controllers that migration is complete.
	 */
	tset->csets = &tset->dst_csets;

	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ss->attach) {
				tset->ssid = ssid;
				ss->attach(tset);
			}
		} while_each_subsys_mask();
	}

	ret = 0;
	goto out_release_tset;

out_cancel_attach:
	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ssid == failed_ssid)
				break;
			if (ss->cancel_attach) {
				tset->ssid = ssid;
				ss->cancel_attach(tset);
			}
		} while_each_subsys_mask();
	}
out_release_tset:
	spin_lock_irq(&css_set_lock);
	list_splice_init(&tset->dst_csets, &tset->src_csets);
	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
		list_del_init(&cset->mg_node);
	}
	spin_unlock_irq(&css_set_lock);

	/*
	 * Re-initialize the cgroup_taskset structure in case it is reused
	 * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
	 * iteration.
	 */
	tset->nr_tasks = 0;
	tset->csets = &tset->src_csets;
	return ret;
}

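/*
 * Illustrative sketch, not part of the original file: a ->can_attach()
 * veto.  A non-zero return from any ->can_attach() above makes
 * cgroup_migrate_execute() unwind via ->cancel_attach() and fail the whole
 * migration; "ex_can_attach" and its policy are hypothetical.
 */
static int __maybe_unused ex_can_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *dst_css;
	struct task_struct *task;

	cgroup_taskset_for_each(task, dst_css, tset)
		if (task->flags & PF_KTHREAD)	/* made-up policy */
			return -EINVAL;
	return 0;
}
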
/**
 * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
 * @dst_cgrp: destination cgroup to test
 *
 * On the default hierarchy, except for mixable cgroups, (possible) thread
 * roots and threaded cgroups, subtree_control must be zero for a migration
 * destination with tasks so that child cgroups don't compete against
 * tasks.
 */
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
	/* v1 doesn't have any restriction */
	if (!cgroup_on_dfl(dst_cgrp))
		return 0;

	/* verify @dst_cgrp can host resources */
	if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
		return -EOPNOTSUPP;

	/* mixables don't care */
	if (cgroup_is_mixable(dst_cgrp))
		return 0;

	/*
	 * If @dst_cgrp is already or can become a thread root or is
	 * threaded, it doesn't matter.
	 */
	if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
		return 0;

	/* apply no-internal-process constraint */
	if (dst_cgrp->subtree_control)
		return -EBUSY;

	return 0;
}

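/*
 * Illustrative note, not part of the original file: the no-internal-process
 * constraint above is why moving a task into a v2 cgroup fails with -EBUSY
 * once that cgroup has controllers enabled in cgroup.subtree_control;
 * tasks may then live only in the leaves of that subtree.
 */
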
/**
 * cgroup_migrate_finish - cleanup after attach
 * @mgctx: migration context
 *
 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
 * those functions for details.
 */
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
	LIST_HEAD(preloaded);
	struct css_set *cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	spin_lock_irq(&css_set_lock);

	list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
	list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);

	list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
		cset->mg_src_cgrp = NULL;
		cset->mg_dst_cgrp = NULL;
		cset->mg_dst_cset = NULL;
		list_del_init(&cset->mg_preload_node);
		put_css_set_locked(cset);
	}

	spin_unlock_irq(&css_set_lock);
}

/**
 * cgroup_migrate_add_src - add a migration source css_set
 * @src_cset: the source css_set to add
 * @dst_cgrp: the destination cgroup
 * @mgctx: migration context
 *
 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
 * @src_cset and add it to @mgctx->preloaded_src_csets, which should later
 * be cleaned up by cgroup_migrate_finish().
 *
 * This function may be called without holding cgroup_threadgroup_rwsem
 * even if the target is a process.  Threads may be created and destroyed
 * but as long as cgroup_mutex is not dropped, no new css_set can be put
 * into play and the preloaded css_sets are guaranteed to cover all
 * migrations.
 */
void cgroup_migrate_add_src(struct css_set *src_cset,
			    struct cgroup *dst_cgrp,
			    struct cgroup_mgctx *mgctx)
{
	struct cgroup *src_cgrp;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	/*
	 * If ->dead, @src_cset is associated with one or more dead cgroups
	 * and doesn't contain any migratable tasks.  Ignore it early so
	 * that the rest of migration path doesn't get confused by it.
	 */
	if (src_cset->dead)
		return;

	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);

	if (!list_empty(&src_cset->mg_preload_node))
		return;

	WARN_ON(src_cset->mg_src_cgrp);
	WARN_ON(src_cset->mg_dst_cgrp);
	WARN_ON(!list_empty(&src_cset->mg_tasks));
	WARN_ON(!list_empty(&src_cset->mg_node));

	src_cset->mg_src_cgrp = src_cgrp;
	src_cset->mg_dst_cgrp = dst_cgrp;
	get_css_set(src_cset);
	list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
}

/**
 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
 * @mgctx: migration context
 *
 * Tasks are about to be moved and all the source css_sets have been
 * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
 * pins all destination css_sets, links each to its source, and appends
 * them to @mgctx->preloaded_dst_csets.
 *
 * This function must be called after cgroup_migrate_add_src() has been
 * called on each migration source css_set.  After migration is performed
 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
 * @mgctx.
 */
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
	struct css_set *src_cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	/* look up the dst cset for each src cset and link it to src */
	list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
				 mg_preload_node) {
		struct css_set *dst_cset;
		struct cgroup_subsys *ss;
		int ssid;

		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
		if (!dst_cset)
			return -ENOMEM;

		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);

		/*
		 * If src cset equals dst, it's noop.  Drop the src.
		 * cgroup_migrate() will skip the cset too.  Note that we
		 * can't handle src == dst as some nodes are used by both.
		 */
		if (src_cset == dst_cset) {
			src_cset->mg_src_cgrp = NULL;
			src_cset->mg_dst_cgrp = NULL;
			list_del_init(&src_cset->mg_preload_node);
			put_css_set(src_cset);
			put_css_set(dst_cset);
			continue;
		}

		src_cset->mg_dst_cset = dst_cset;

		if (list_empty(&dst_cset->mg_preload_node))
			list_add_tail(&dst_cset->mg_preload_node,
				      &mgctx->preloaded_dst_csets);
		else
			put_css_set(dst_cset);

		for_each_subsys(ss, ssid)
			if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
				mgctx->ss_mask |= 1 << ssid;
	}

	return 0;
}

/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
 * @mgctx: migration context
 *
 * Migrate a process or task denoted by @leader.  If migrating a process,
 * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
 * responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
 *
 * As long as a controller's ->can_attach() doesn't fail, this function is
 * guaranteed to succeed.  This means that, excluding ->can_attach()
 * failure, when migrating multiple targets, the success or failure can be
 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
 * actually starting to migrate.
 */
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
		   struct cgroup_mgctx *mgctx)
{
	struct task_struct *task;

	/*
	 * Prevent freeing of tasks while we take a snapshot.  Tasks that are
	 * already PF_EXITING could be freed from underneath us unless we
	 * take an rcu_read_lock.
	 */
	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_task(task, mgctx);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);

	return cgroup_migrate_execute(mgctx);
}

/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @dst_cgrp: the cgroup to attach to
 * @leader: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
 */
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
		       bool threadgroup)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct task_struct *task;
	int ret = 0;

	/* look up all src csets */
	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);

	/* prepare dst csets and commit */
	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (!ret)
		ret = cgroup_migrate(leader, threadgroup, &mgctx);

	cgroup_migrate_finish(&mgctx);

	if (!ret)
		TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);

	return ret;
}

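/*
 * Illustrative sketch, not part of the original file: the locking a
 * kernel-side caller of cgroup_attach_task() must provide.
 * "ex_move_process" is hypothetical; real callers usually reach here
 * through cgroup_kn_lock_live() and cgroup_procs_write_start() instead of
 * taking the locks directly.
 */
static int __maybe_unused ex_move_process(struct cgroup *dst_cgrp,
					  struct task_struct *leader)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	percpu_down_write(&cgroup_threadgroup_rwsem);
	ret = cgroup_attach_task(dst_cgrp, leader, true);
	percpu_up_write(&cgroup_threadgroup_rwsem);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
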
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
					     bool *locked)
	__acquires(&cgroup_threadgroup_rwsem)
{
	struct task_struct *tsk;
	pid_t pid;

	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
		return ERR_PTR(-EINVAL);

	/*
	 * If we migrate a single thread, we don't care about threadgroup
	 * stability.  If the thread is `current`, it won't exit(2) under our
	 * hands or change PID through exec(2).  We exclude
	 * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
	 * callers by cgroup_mutex.
	 * Therefore, we can skip the global lock.
	 */
	lockdep_assert_held(&cgroup_mutex);
	if (pid || threadgroup) {
		percpu_down_write(&cgroup_threadgroup_rwsem);
		*locked = true;
	} else {
		*locked = false;
	}

	rcu_read_lock();
	if (pid) {
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			tsk = ERR_PTR(-ESRCH);
			goto out_unlock_threadgroup;
		}
	} else {
		tsk = current;
	}

	if (threadgroup)
		tsk = tsk->group_leader;

	/*
	 * kthreads may acquire PF_NO_SETAFFINITY during initialization.
	 * If userland migrates such a kthread to a non-root cgroup, it can
	 * become trapped in a cpuset, or an RT kthread may be born in a
	 * cgroup with no rt_runtime allocated.  Just say no.
	 */
	if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
		tsk = ERR_PTR(-EINVAL);
		goto out_unlock_threadgroup;
	}

	get_task_struct(tsk);
	goto out_unlock_rcu;

out_unlock_threadgroup:
	if (*locked) {
		percpu_up_write(&cgroup_threadgroup_rwsem);
		*locked = false;
	}
out_unlock_rcu:
	rcu_read_unlock();
	return tsk;
}

void cgroup_procs_write_finish(struct task_struct *task, bool locked)
	__releases(&cgroup_threadgroup_rwsem)
{
	struct cgroup_subsys *ss;
	int ssid;

	/* release reference from cgroup_procs_write_start() */
	put_task_struct(task);

	if (locked)
		percpu_up_write(&cgroup_threadgroup_rwsem);
	for_each_subsys(ss, ssid)
		if (ss->post_attach)
			ss->post_attach();
}

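/*
 * Illustrative sketch, not part of the original file: how a cgroup.procs
 * style write handler pairs cgroup_procs_write_start() with
 * cgroup_procs_write_finish().  "ex_procs_write" is hypothetical and elides
 * the permission checks the real handlers perform.
 */
static ssize_t __maybe_unused ex_procs_write(struct kernfs_open_file *of,
					     char *buf, size_t nbytes,
					     loff_t off)
{
	struct cgroup *dst_cgrp;
	struct task_struct *task;
	bool locked;
	ssize_t ret;

	dst_cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!dst_cgrp)
		return -ENODEV;

	task = cgroup_procs_write_start(buf, true, &locked);
	ret = PTR_ERR_OR_ZERO(task);
	if (ret)
		goto out_unlock;

	ret = cgroup_attach_task(dst_cgrp, task, true);
	cgroup_procs_write_finish(task, locked);
out_unlock:
	cgroup_kn_unlock(of->kn);
	return ret ?: nbytes;
}
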
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{
	struct cgroup_subsys *ss;
	bool printed = false;
	int ssid;

	do_each_subsys_mask(ss, ssid, ss_mask) {
		if (printed)
			seq_putc(seq, ' ');
		seq_puts(seq, ss->name);
		printed = true;
	} while_each_subsys_mask();
	if (printed)
		seq_putc(seq, '\n');
}

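/*
 * Illustrative note, not part of the original file: with cpu, io and
 * memory in the mask, cgroup_print_ss_mask() emits a single line such as
 * "cpu io memory\n", which is exactly what reads of cgroup.controllers
 * and cgroup.subtree_control below return.
 */
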
/* show controllers which are enabled from the parent */
static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	cgroup_print_ss_mask(seq, cgroup_control(cgrp));
	return 0;
}

/* show controllers which are enabled for a given cgroup's children */
static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	cgroup_print_ss_mask(seq, cgrp->subtree_control);
	return 0;
}

/**
 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
 * @cgrp: root of the subtree to update csses for
 *
 * @cgrp's control masks have changed and its subtree's css associations
 * need to be updated accordingly.  This function looks up all css_sets
 * which are attached to the subtree, creates the matching updated css_sets
 * and migrates the tasks to the new ones.
 */
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct cgroup_subsys_state *d_css;
	struct cgroup *dsct;
	struct css_set *src_cset;
	int ret;

	lockdep_assert_held(&cgroup_mutex);

	percpu_down_write(&cgroup_threadgroup_rwsem);

	/* look up all csses currently attached to @cgrp's subtree */
	spin_lock_irq(&css_set_lock);
	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &dsct->cset_links, cset_link)
			cgroup_migrate_add_src(link->cset, dsct, &mgctx);
	}
	spin_unlock_irq(&css_set_lock);

	/* NULL dst indicates self on default hierarchy */
	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (ret)
		goto out_finish;

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
		struct task_struct *task, *ntask;

		/* all tasks in src_csets need to be migrated */
		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
			cgroup_migrate_add_task(task, &mgctx);
	}
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_migrate_execute(&mgctx);
out_finish:
	cgroup_migrate_finish(&mgctx);
	percpu_up_write(&cgroup_threadgroup_rwsem);
	return ret;
}

/**
 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
 * @cgrp: root of the target subtree
 *
 * Because css offlining is asynchronous, userland may try to re-enable a
 * controller while the previous css is still around.  This function grabs
 * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
 */
void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
	__acquires(&cgroup_mutex)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid;

restart:
	mutex_lock(&cgroup_mutex);

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
			DEFINE_WAIT(wait);

			if (!css || !percpu_ref_is_dying(&css->refcnt))
				continue;

			cgroup_get_live(dsct);
			prepare_to_wait(&dsct->offline_waitq, &wait,
					TASK_UNINTERRUPTIBLE);

			mutex_unlock(&cgroup_mutex);
			schedule();
			finish_wait(&dsct->offline_waitq, &wait);

			cgroup_put(dsct);
			goto restart;
		}
	}
}

/**
 * cgroup_save_control - save control masks and dom_cgrp of a subtree
 * @cgrp: root of the target subtree
 *
 * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the
 * respective old_ prefixed fields for @cgrp's subtree including @cgrp
 * itself.
 */
static void cgroup_save_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		dsct->old_subtree_control = dsct->subtree_control;
		dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
		dsct->old_dom_cgrp = dsct->dom_cgrp;
	}
}

/**
 * cgroup_propagate_control - refresh control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
 * ->subtree_control and propagate controller availability through the
 * subtree so that descendants don't have unavailable controllers enabled.
 */
static void cgroup_propagate_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		dsct->subtree_control &= cgroup_control(dsct);
		dsct->subtree_ss_mask =
			cgroup_calc_subtree_ss_mask(dsct->subtree_control,
						    cgroup_ss_mask(dsct));
	}
}

/**
 * cgroup_restore_control - restore control masks and dom_cgrp of a subtree
 * @cgrp: root of the target subtree
 *
 * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the
 * respective old_ prefixed fields for @cgrp's subtree including @cgrp
 * itself.
 */
static void cgroup_restore_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		dsct->subtree_control = dsct->old_subtree_control;
		dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
		dsct->dom_cgrp = dsct->old_dom_cgrp;
	}
}

static bool css_visible(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;
	struct cgroup *cgrp = css->cgroup;

	if (cgroup_control(cgrp) & (1 << ss->id))
		return true;
	if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
		return false;
	return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
}

/**
 * cgroup_apply_control_enable - enable or show csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and create new csses or make the existing ones
 * visible.  A css is created invisible if it's being implicitly enabled
 * through dependency.  An invisible css is made visible when the userland
 * explicitly enables it.
 *
 * Returns 0 on success, -errno on failure.  On failure, csses which have
 * been processed already aren't cleaned up.  The caller is responsible for
 * cleaning up with cgroup_apply_control_disable().
 */
static int cgroup_apply_control_enable(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid, ret;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);

			if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
				continue;

			if (!css) {
				css = css_create(dsct, ss);
				if (IS_ERR(css))
					return PTR_ERR(css);
			}

			WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));

			if (css_visible(css)) {
				ret = css_populate_dir(css);
				if (ret)
					return ret;
			}
		}
	}

	return 0;
}

/**
 * cgroup_apply_control_disable - kill or hide csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and kill and hide csses so that they match
 * cgroup_ss_mask() and cgroup_visible_mask().
 *
 * A css is hidden when the userland requests it to be disabled while other
 * subsystems are still depending on it.  The css must not actively control
 * resources and must be in the vanilla state if it's made visible again
 * later.  Controllers which may be depended upon should provide
 * ->css_reset() for this purpose.
 */
static void cgroup_apply_control_disable(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid;

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);

			if (!css)
				continue;

			WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));

			if (css->parent &&
			    !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
				kill_css(css);
			} else if (!css_visible(css)) {
				css_clear_dir(css);
				if (ss->css_reset)
					ss->css_reset(css);
			}
		}
	}
}

3126
Tejun Heof7b28142016-03-03 09:58:00 -05003127/**
3128 * cgroup_apply_control - apply control mask updates to the subtree
3129 * @cgrp: root of the target subtree
3130 *
3131 * subsystems can be enabled and disabled in a subtree using the following
3132 * steps.
3133 *
3134 * 1. Call cgroup_save_control() to stash the current state.
3135 * 2. Update ->subtree_control masks in the subtree as desired.
3136 * 3. Call cgroup_apply_control() to apply the changes.
3137 * 4. Optionally perform other related operations.
3138 * 5. Call cgroup_finalize_control() to finish up.
3139 *
3140 * This function implements step 3 and propagates the mask changes
3141 * throughout @cgrp's subtree, updates csses accordingly and perform
3142 * process migrations.
3143 */
3144static int cgroup_apply_control(struct cgroup *cgrp)
3145{
3146 int ret;
3147
3148 cgroup_propagate_control(cgrp);
3149
3150 ret = cgroup_apply_control_enable(cgrp);
3151 if (ret)
3152 return ret;
3153
3154 /*
Dennis Zhoufc5a8282018-12-05 12:10:36 -05003155 * At this point, cgroup_e_css_by_mask() results reflect the new csses
Tejun Heof7b28142016-03-03 09:58:00 -05003156 * making the following cgroup_update_dfl_csses() properly update
3157 * css associations of all tasks in the subtree.
3158 */
3159 ret = cgroup_update_dfl_csses(cgrp);
3160 if (ret)
3161 return ret;
3162
3163 return 0;
3164}
3165
3166/**
3167 * cgroup_finalize_control - finalize control mask update
3168 * @cgrp: root of the target subtree
3169 * @ret: the result of the update
3170 *
3171 * Finalize control mask update. See cgroup_apply_control() for more info.
3172 */
3173static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
3174{
3175 if (ret) {
3176 cgroup_restore_control(cgrp);
3177 cgroup_propagate_control(cgrp);
3178 }
3179
3180 cgroup_apply_control_disable(cgrp);
3181}
3182
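/*
 * Illustrative sketch, not part of the original file: the five-step
 * protocol from the cgroup_apply_control() comment in one place, modeled
 * on cgroup_subtree_control_write().  "ex_enable_ss" and the single-bit
 * update are hypothetical.
 */
static int __maybe_unused ex_enable_ss(struct cgroup *cgrp, int ssid)
{
	int ret;

	lockdep_assert_held(&cgroup_mutex);

	cgroup_save_control(cgrp);		/* 1. stash current state */
	cgrp->subtree_control |= 1 << ssid;	/* 2. update the masks */
	ret = cgroup_apply_control(cgrp);	/* 3. apply the changes */
						/* 4. other related work */
	cgroup_finalize_control(cgrp, ret);	/* 5. finish up */
	return ret;
}
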
static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
{
	u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;

	/* if nothing is getting enabled, nothing to worry about */
	if (!enable)
		return 0;

	/* can @cgrp host any resources? */
	if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
		return -EOPNOTSUPP;

	/* mixables don't care */
	if (cgroup_is_mixable(cgrp))
		return 0;

	if (domain_enable) {
		/* can't enable domain controllers inside a thread subtree */
		if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
			return -EOPNOTSUPP;
	} else {
		/*
		 * Threaded controllers can handle internal competitions
		 * and are always allowed inside a (prospective) thread
		 * subtree.
		 */
		if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
			return 0;
	}

	/*
	 * Controllers can't be enabled for a cgroup with tasks to avoid
	 * child cgroups competing against tasks.
	 */
	if (cgroup_has_tasks(cgrp))
		return -EBUSY;

	return 0;
}

/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
	u16 enable = 0, disable = 0;
	struct cgroup *cgrp, *child;
	struct cgroup_subsys *ss;
	char *tok;
	int ssid, ret;

	/*
	 * Parse input - space separated list of subsystem names prefixed
	 * with either + or -, e.g. "+cpu +memory -io".
	 */
	buf = strstrip(buf);
	while ((tok = strsep(&buf, " "))) {
		if (tok[0] == '\0')
			continue;
		do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
			if (!cgroup_ssid_enabled(ssid) ||
			    strcmp(tok + 1, ss->name))
				continue;

			if (*tok == '+') {
				enable |= 1 << ssid;
				disable &= ~(1 << ssid);
			} else if (*tok == '-') {
				disable |= 1 << ssid;
				enable &= ~(1 << ssid);
			} else {
				return -EINVAL;
			}
			break;
		} while_each_subsys_mask();
		if (ssid == CGROUP_SUBSYS_COUNT)
			return -EINVAL;
	}

	cgrp = cgroup_kn_lock_live(of->kn, true);
	if (!cgrp)
		return -ENODEV;

	for_each_subsys(ss, ssid) {
		if (enable & (1 << ssid)) {
			if (cgrp->subtree_control & (1 << ssid)) {
				enable &= ~(1 << ssid);
				continue;
			}

			if (!(cgroup_control(cgrp) & (1 << ssid))) {
				ret = -ENOENT;
				goto out_unlock;
			}
		} else if (disable & (1 << ssid)) {
			if (!(cgrp->subtree_control & (1 << ssid))) {
				disable &= ~(1 << ssid);
				continue;
			}

			/* a child has it enabled? */
			cgroup_for_each_live_child(child, cgrp) {
				if (child->subtree_control & (1 << ssid)) {
					ret = -EBUSY;
					goto out_unlock;
				}
			}
		}
	}

	if (!enable && !disable) {
		ret = 0;
		goto out_unlock;
	}

	ret = cgroup_vet_subtree_control_enable(cgrp, enable);
	if (ret)
		goto out_unlock;

	/* save and update control masks and prepare csses */
	cgroup_save_control(cgrp);

	cgrp->subtree_control |= enable;
	cgrp->subtree_control &= ~disable;

	ret = cgroup_apply_control(cgrp);
	cgroup_finalize_control(cgrp, ret);
	if (ret)
		goto out_unlock;

	kernfs_activate(cgrp->kn);
out_unlock:
	cgroup_kn_unlock(of->kn);
	return ret ?: nbytes;
}

/**
 * cgroup_enable_threaded - make @cgrp threaded
 * @cgrp: the target cgroup
 *
 * Called when "threaded" is written to the cgroup.type interface file and
 * tries to make @cgrp threaded and join the parent's resource domain.
 * This function is never called on the root cgroup as cgroup.type doesn't
 * exist on it.
 */
static int cgroup_enable_threaded(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup *dom_cgrp = parent->dom_cgrp;
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	int ret;

	lockdep_assert_held(&cgroup_mutex);

	/* noop if already threaded */
	if (cgroup_is_threaded(cgrp))
		return 0;

	/*
	 * If @cgrp is populated or has domain controllers enabled, it
	 * can't be switched.  While the below cgroup_can_be_thread_root()
	 * test can catch the same conditions, that's only when @parent is
	 * not mixable, so let's check it explicitly.
	 */
	if (cgroup_is_populated(cgrp) ||
	    cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
		return -EOPNOTSUPP;

	/* we're joining the parent's domain, ensure its validity */
	if (!cgroup_is_valid_domain(dom_cgrp) ||
	    !cgroup_can_be_thread_root(dom_cgrp))
		return -EOPNOTSUPP;

	/*
	 * The following shouldn't cause actual migrations and should
	 * always succeed.
	 */
	cgroup_save_control(cgrp);

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
		if (dsct == cgrp || cgroup_is_threaded(dsct))
			dsct->dom_cgrp = dom_cgrp;

	ret = cgroup_apply_control(cgrp);
	if (!ret)
		parent->nr_threaded_children++;

	cgroup_finalize_control(cgrp, ret);
	return ret;
}

static int cgroup_type_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	if (cgroup_is_threaded(cgrp))
		seq_puts(seq, "threaded\n");
	else if (!cgroup_is_valid_domain(cgrp))
		seq_puts(seq, "domain invalid\n");
	else if (cgroup_is_thread_root(cgrp))
		seq_puts(seq, "domain threaded\n");
	else
		seq_puts(seq, "domain\n");

	return 0;
}

static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
				 size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;
	int ret;

	/* only switching to threaded mode is supported */
	if (strcmp(strstrip(buf), "threaded"))
		return -EINVAL;

	/* drain dying csses before we re-apply (threaded) subtree control */
	cgrp = cgroup_kn_lock_live(of->kn, true);
	if (!cgrp)
		return -ENOENT;

	/* threaded can only be enabled */
	ret = cgroup_enable_threaded(cgrp);

	cgroup_kn_unlock(of->kn);
	return ret ?: nbytes;
}

static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	int descendants = READ_ONCE(cgrp->max_descendants);

	if (descendants == INT_MAX)
		seq_puts(seq, "max\n");
	else
		seq_printf(seq, "%d\n", descendants);

	return 0;
}

static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;
	int descendants;
	ssize_t ret;

	buf = strstrip(buf);
	if (!strcmp(buf, "max")) {
		descendants = INT_MAX;
	} else {
		ret = kstrtoint(buf, 0, &descendants);
		if (ret)
			return ret;
	}

	if (descendants < 0)
		return -ERANGE;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENOENT;

	cgrp->max_descendants = descendants;

	cgroup_kn_unlock(of->kn);

	return nbytes;
}

static int cgroup_max_depth_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	int depth = READ_ONCE(cgrp->max_depth);

	if (depth == INT_MAX)
		seq_puts(seq, "max\n");
	else
		seq_printf(seq, "%d\n", depth);

	return 0;
}

static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;
	ssize_t ret;
	int depth;

	buf = strstrip(buf);
	if (!strcmp(buf, "max")) {
		depth = INT_MAX;
	} else {
		ret = kstrtoint(buf, 0, &depth);
		if (ret)
			return ret;
	}

	if (depth < 0)
		return -ERANGE;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENOENT;

	cgrp->max_depth = depth;

	cgroup_kn_unlock(of->kn);

	return nbytes;
}

static int cgroup_events_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
	seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));

	return 0;
}

static int cgroup_stat_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgroup = seq_css(seq)->cgroup;

	seq_printf(seq, "nr_descendants %d\n",
		   cgroup->nr_descendants);
	seq_printf(seq, "nr_dying_descendants %d\n",
		   cgroup->nr_dying_descendants);

	return 0;
}

static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
						 struct cgroup *cgrp, int ssid)
{
	struct cgroup_subsys *ss = cgroup_subsys[ssid];
	struct cgroup_subsys_state *css;
	int ret;

	if (!ss->css_extra_stat_show)
		return 0;

	css = cgroup_tryget_css(cgrp, ss);
	if (!css)
		return 0;

	ret = ss->css_extra_stat_show(seq, css);
	css_put(css);
	return ret;
}

static int cpu_stat_show(struct seq_file *seq, void *v)
{
	struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
	int ret = 0;

	cgroup_base_stat_cputime_show(seq);
#ifdef CONFIG_CGROUP_SCHED
	ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
#endif
	return ret;
}

#ifdef CONFIG_PSI
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;

	return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;

	return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;

	return psi_show(seq, psi, PSI_CPU);
}

static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
				     size_t nbytes, enum psi_res res)
{
	struct psi_trigger *new;
	struct cgroup *cgrp;
	struct psi_group *psi;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;

	cgroup_get(cgrp);
	cgroup_kn_unlock(of->kn);

	psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
	new = psi_trigger_create(psi, buf, nbytes, res);
	if (IS_ERR(new)) {
		cgroup_put(cgrp);
		return PTR_ERR(new);
	}

	psi_trigger_replace(&of->priv, new);

	cgroup_put(cgrp);

	return nbytes;
}

3603static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
3604 char *buf, size_t nbytes,
3605 loff_t off)
3606{
3607 return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
3608}
3609
3610static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
3611 char *buf, size_t nbytes,
3612 loff_t off)
3613{
3614 return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
3615}
3616
3617static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
3618 char *buf, size_t nbytes,
3619 loff_t off)
3620{
3621 return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
3622}
3623
3624static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
3625 poll_table *pt)
3626{
3627 return psi_trigger_poll(&of->priv, of->file, pt);
3628}
3629
3630static void cgroup_pressure_release(struct kernfs_open_file *of)
3631{
3632 psi_trigger_replace(&of->priv, NULL);
3633}
3634#endif /* CONFIG_PSI */
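
/*
 * Userspace consumes the pressure files above by writing a trigger and
 * polling for it. A minimal sketch, assuming a v2 mount at
 * /sys/fs/cgroup and a cgroup named "mygrp"; the trigger format is
 * "<some|full> <stall us> <window us>" per Documentation/accounting/psi.rst.
 */
#if 0 /* illustrative userspace sketch */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char trig[] = "some 150000 1000000"; /* 150ms stall per 1s window */
	struct pollfd fds;
	int fd;

	fd = open("/sys/fs/cgroup/mygrp/memory.pressure", O_RDWR | O_NONBLOCK);
	if (fd < 0)
		return 1;
	if (write(fd, trig, strlen(trig) + 1) < 0)
		return 1;

	fds.fd = fd;
	fds.events = POLLPRI;	/* psi_trigger_poll() signals EPOLLPRI */
	while (poll(&fds, 1, -1) > 0) {
		if (fds.revents & POLLERR)
			break;	/* the trigger was invalidated */
		if (fds.revents & POLLPRI)
			printf("memory pressure threshold crossed\n");
	}
	close(fd);
	return 0;
}
#endif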
Johannes Weiner2ce71352018-10-26 15:06:31 -07003635
Roman Gushchin76f969e2019-04-19 10:03:04 -07003636static int cgroup_freeze_show(struct seq_file *seq, void *v)
3637{
3638 struct cgroup *cgrp = seq_css(seq)->cgroup;
3639
3640 seq_printf(seq, "%d\n", cgrp->freezer.freeze);
3641
3642 return 0;
3643}
3644
3645static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
3646 char *buf, size_t nbytes, loff_t off)
3647{
3648 struct cgroup *cgrp;
3649 ssize_t ret;
3650 int freeze;
3651
3652 ret = kstrtoint(strstrip(buf), 0, &freeze);
3653 if (ret)
3654 return ret;
3655
3656 if (freeze < 0 || freeze > 1)
3657 return -ERANGE;
3658
3659 cgrp = cgroup_kn_lock_live(of->kn, false);
3660 if (!cgrp)
3661 return -ENOENT;
3662
3663 cgroup_freeze(cgrp, freeze);
3664
3665 cgroup_kn_unlock(of->kn);
3666
3667 return nbytes;
3668}
3669
Christian Brauner661ee622021-05-08 14:15:38 +02003670static void __cgroup_kill(struct cgroup *cgrp)
3671{
3672 struct css_task_iter it;
3673 struct task_struct *task;
3674
3675 lockdep_assert_held(&cgroup_mutex);
3676
3677 spin_lock_irq(&css_set_lock);
3678 set_bit(CGRP_KILL, &cgrp->flags);
3679 spin_unlock_irq(&css_set_lock);
3680
3681 css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
3682 while ((task = css_task_iter_next(&it))) {
3683 /* Ignore kernel threads here. */
3684 if (task->flags & PF_KTHREAD)
3685 continue;
3686
3687 /* Skip tasks that are already dying. */
3688 if (__fatal_signal_pending(task))
3689 continue;
3690
3691 send_sig(SIGKILL, task, 0);
3692 }
3693 css_task_iter_end(&it);
3694
3695 spin_lock_irq(&css_set_lock);
3696 clear_bit(CGRP_KILL, &cgrp->flags);
3697 spin_unlock_irq(&css_set_lock);
3698}
3699
3700static void cgroup_kill(struct cgroup *cgrp)
3701{
3702 struct cgroup_subsys_state *css;
3703 struct cgroup *dsct;
3704
3705 lockdep_assert_held(&cgroup_mutex);
3706
3707 cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
3708 __cgroup_kill(dsct);
3709}
3710
3711static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
3712 size_t nbytes, loff_t off)
3713{
3714 ssize_t ret = 0;
3715 int kill;
3716 struct cgroup *cgrp;
3717
3718 ret = kstrtoint(strstrip(buf), 0, &kill);
3719 if (ret)
3720 return ret;
3721
3722 if (kill != 1)
3723 return -ERANGE;
3724
3725 cgrp = cgroup_kn_lock_live(of->kn, false);
3726 if (!cgrp)
3727 return -ENOENT;
3728
3729 /*
3730	 * Killing is a process-directed operation, i.e. the whole thread-group
3731	 * is taken down, so act like we do for cgroup.procs and only make this
3732 * writable in non-threaded cgroups.
3733 */
3734 if (cgroup_is_threaded(cgrp))
3735 ret = -EOPNOTSUPP;
3736 else
3737 cgroup_kill(cgrp);
3738
3739 cgroup_kn_unlock(of->kn);
3740
3741 return ret ?: nbytes;
3742}
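
/*
 * The freezer and killer above share the same one-shot write interface.
 * A hedged userspace sketch (paths are assumptions): freeze a cgroup,
 * then take down every process in its subtree.
 */
#if 0 /* illustrative userspace sketch */
#include <fcntl.h>
#include <unistd.h>

static int write_one(const char *path, const char *val, size_t len)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, len) != (ssize_t)len) {
		close(fd);
		return -1;
	}
	return close(fd);
}

/*
 * write_one("/sys/fs/cgroup/mygrp/cgroup.freeze", "1", 1);
 * write_one("/sys/fs/cgroup/mygrp/cgroup.kill", "1", 1);
 */
#endif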
3743
Tejun Heoe90cbeb2016-12-27 14:49:03 -05003744static int cgroup_file_open(struct kernfs_open_file *of)
3745{
Hui Su5a7b5f32020-11-06 22:47:40 +08003746 struct cftype *cft = of_cft(of);
Tejun Heoe90cbeb2016-12-27 14:49:03 -05003747
3748 if (cft->open)
3749 return cft->open(of);
3750 return 0;
3751}
3752
3753static void cgroup_file_release(struct kernfs_open_file *of)
3754{
Hui Su5a7b5f32020-11-06 22:47:40 +08003755 struct cftype *cft = of_cft(of);
Tejun Heoe90cbeb2016-12-27 14:49:03 -05003756
3757 if (cft->release)
3758 cft->release(of);
3759}
3760
Tejun Heo2bd59d42014-02-11 11:52:49 -05003761static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
3762 size_t nbytes, loff_t off)
Paul Menageddbcc7e2007-10-18 23:39:30 -07003763{
Tejun Heo5136f632017-06-27 14:30:28 -04003764 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
Tejun Heo2bd59d42014-02-11 11:52:49 -05003765 struct cgroup *cgrp = of->kn->parent->priv;
Hui Su5a7b5f32020-11-06 22:47:40 +08003766 struct cftype *cft = of_cft(of);
Tejun Heo2bd59d42014-02-11 11:52:49 -05003767 struct cgroup_subsys_state *css;
Tejun Heoa742c592013-12-05 12:28:03 -05003768 int ret;
Paul Menageddbcc7e2007-10-18 23:39:30 -07003769
Jouni Roivas65026da2020-09-30 19:42:42 +03003770 if (!nbytes)
3771 return 0;
3772
Tejun Heo5136f632017-06-27 14:30:28 -04003773 /*
3774 * If namespaces are delegation boundaries, disallow writes to
3775	 * files in a non-init namespace root from inside the namespace
3776 * except for the files explicitly marked delegatable -
3777 * cgroup.procs and cgroup.subtree_control.
3778 */
3779 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
3780 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3781 ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
3782 return -EPERM;
3783
Tejun Heob4168642014-05-13 12:16:21 -04003784 if (cft->write)
3785 return cft->write(of, buf, nbytes, off);
3786
Tejun Heo2bd59d42014-02-11 11:52:49 -05003787 /*
3788 * kernfs guarantees that a file isn't deleted with operations in
3789 * flight, which means that the matching css is and stays alive and
3790 * doesn't need to be pinned. The RCU locking is not necessary
3791 * either. It's just for the convenience of using cgroup_css().
3792 */
3793 rcu_read_lock();
3794 css = cgroup_css(cgrp, cft->ss);
3795 rcu_read_unlock();
Paul Menageddbcc7e2007-10-18 23:39:30 -07003796
Tejun Heo451af502014-05-13 12:16:21 -04003797 if (cft->write_u64) {
Tejun Heoa742c592013-12-05 12:28:03 -05003798 unsigned long long v;
3799 ret = kstrtoull(buf, 0, &v);
3800 if (!ret)
3801 ret = cft->write_u64(css, cft, v);
3802 } else if (cft->write_s64) {
3803 long long v;
3804 ret = kstrtoll(buf, 0, &v);
3805 if (!ret)
3806 ret = cft->write_s64(css, cft, v);
Tejun Heoa742c592013-12-05 12:28:03 -05003807 } else {
3808 ret = -EINVAL;
3809 }
Tejun Heo2bd59d42014-02-11 11:52:49 -05003810
Tejun Heoa742c592013-12-05 12:28:03 -05003811 return ret ?: nbytes;
Paul Menageddbcc7e2007-10-18 23:39:30 -07003812}
3813
Johannes Weinerdc505372019-03-05 15:45:48 -08003814static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
3815{
Hui Su5a7b5f32020-11-06 22:47:40 +08003816 struct cftype *cft = of_cft(of);
Johannes Weinerdc505372019-03-05 15:45:48 -08003817
3818 if (cft->poll)
3819 return cft->poll(of, pt);
3820
3821 return kernfs_generic_poll(of, pt);
3822}
3823
Tejun Heo6612f052013-12-05 12:28:04 -05003824static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
Paul Menage91796562008-04-29 01:00:01 -07003825{
Tejun Heo2bd59d42014-02-11 11:52:49 -05003826 return seq_cft(seq)->seq_start(seq, ppos);
Tejun Heo6612f052013-12-05 12:28:04 -05003827}
3828
3829static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
3830{
Tejun Heo2bd59d42014-02-11 11:52:49 -05003831 return seq_cft(seq)->seq_next(seq, v, ppos);
Tejun Heo6612f052013-12-05 12:28:04 -05003832}
3833
3834static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
3835{
Tejun Heoe90cbeb2016-12-27 14:49:03 -05003836 if (seq_cft(seq)->seq_stop)
3837 seq_cft(seq)->seq_stop(seq, v);
Paul Menage91796562008-04-29 01:00:01 -07003838}
3839
3840static int cgroup_seqfile_show(struct seq_file *m, void *arg)
3841{
Tejun Heo7da11272013-12-05 12:28:04 -05003842 struct cftype *cft = seq_cft(m);
3843 struct cgroup_subsys_state *css = seq_css(m);
Li Zefane0798ce2013-07-31 17:36:25 +08003844
Tejun Heo2da8ca82013-12-05 12:28:04 -05003845 if (cft->seq_show)
3846 return cft->seq_show(m, arg);
Paul Menage91796562008-04-29 01:00:01 -07003847
Tejun Heo896f5192013-12-05 12:28:04 -05003848 if (cft->read_u64)
3849 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
3850 else if (cft->read_s64)
3851 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
3852 else
3853 return -EINVAL;
3854 return 0;
Paul Menage91796562008-04-29 01:00:01 -07003855}
3856
Tejun Heo2bd59d42014-02-11 11:52:49 -05003857static struct kernfs_ops cgroup_kf_single_ops = {
3858 .atomic_write_len = PAGE_SIZE,
Tejun Heoe90cbeb2016-12-27 14:49:03 -05003859 .open = cgroup_file_open,
3860 .release = cgroup_file_release,
Tejun Heo2bd59d42014-02-11 11:52:49 -05003861 .write = cgroup_file_write,
Johannes Weinerdc505372019-03-05 15:45:48 -08003862 .poll = cgroup_file_poll,
Tejun Heo2bd59d42014-02-11 11:52:49 -05003863 .seq_show = cgroup_seqfile_show,
Paul Menage91796562008-04-29 01:00:01 -07003864};
3865
Tejun Heo2bd59d42014-02-11 11:52:49 -05003866static struct kernfs_ops cgroup_kf_ops = {
3867 .atomic_write_len = PAGE_SIZE,
Tejun Heoe90cbeb2016-12-27 14:49:03 -05003868 .open = cgroup_file_open,
3869 .release = cgroup_file_release,
Tejun Heo2bd59d42014-02-11 11:52:49 -05003870 .write = cgroup_file_write,
Johannes Weinerdc505372019-03-05 15:45:48 -08003871 .poll = cgroup_file_poll,
Tejun Heo2bd59d42014-02-11 11:52:49 -05003872 .seq_start = cgroup_seqfile_start,
3873 .seq_next = cgroup_seqfile_next,
3874 .seq_stop = cgroup_seqfile_stop,
3875 .seq_show = cgroup_seqfile_show,
3876};
Paul Menageddbcc7e2007-10-18 23:39:30 -07003877
Tejun Heo49957f82014-04-07 16:44:47 -04003878/* set uid and gid of cgroup dirs and files to that of the creator */
3879static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3880{
3881 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
3882 .ia_uid = current_fsuid(),
3883 .ia_gid = current_fsgid(), };
3884
3885 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
3886 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
3887 return 0;
3888
3889 return kernfs_setattr(kn, &iattr);
3890}
3891
Tejun Heob12e3582018-04-26 14:29:04 -07003892static void cgroup_file_notify_timer(struct timer_list *timer)
3893{
3894 cgroup_file_notify(container_of(timer, struct cgroup_file,
3895 notify_timer));
3896}
3897
Tejun Heo4df8dc92015-09-18 17:54:23 -04003898static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3899 struct cftype *cft)
Paul Menageddbcc7e2007-10-18 23:39:30 -07003900{
Tejun Heo8d7e6fb2014-02-11 11:52:48 -05003901 char name[CGROUP_FILE_NAME_MAX];
Tejun Heo2bd59d42014-02-11 11:52:49 -05003902 struct kernfs_node *kn;
3903 struct lock_class_key *key = NULL;
Tejun Heo49957f82014-04-07 16:44:47 -04003904 int ret;
Tejun Heo8e3f6542012-04-01 12:09:55 -07003905
Tejun Heo2bd59d42014-02-11 11:52:49 -05003906#ifdef CONFIG_DEBUG_LOCK_ALLOC
3907 key = &cft->lockdep_key;
3908#endif
3909 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
Dmitry Torokhov488dee92018-07-20 21:56:47 +00003910 cgroup_file_mode(cft),
3911 GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
3912 0, cft->kf_ops, cft,
Tejun Heodfeb07502015-02-13 14:36:31 -08003913 NULL, key);
Tejun Heo49957f82014-04-07 16:44:47 -04003914 if (IS_ERR(kn))
3915 return PTR_ERR(kn);
3916
3917 ret = cgroup_kn_set_ugid(kn);
Tejun Heof8f22e52014-04-23 11:13:16 -04003918 if (ret) {
Tejun Heo49957f82014-04-07 16:44:47 -04003919 kernfs_remove(kn);
Tejun Heof8f22e52014-04-23 11:13:16 -04003920 return ret;
3921 }
3922
Tejun Heo6f60ead2015-09-18 17:54:23 -04003923 if (cft->file_offset) {
3924 struct cgroup_file *cfile = (void *)css + cft->file_offset;
3925
Tejun Heob12e3582018-04-26 14:29:04 -07003926 timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
3927
Tejun Heo34c06252015-11-05 00:12:24 -05003928 spin_lock_irq(&cgroup_file_kn_lock);
Tejun Heo6f60ead2015-09-18 17:54:23 -04003929 cfile->kn = kn;
Tejun Heo34c06252015-11-05 00:12:24 -05003930 spin_unlock_irq(&cgroup_file_kn_lock);
Tejun Heo6f60ead2015-09-18 17:54:23 -04003931 }
3932
Tejun Heof8f22e52014-04-23 11:13:16 -04003933 return 0;
Paul Menageddbcc7e2007-10-18 23:39:30 -07003934}
3935
Tejun Heob1f28d32013-06-28 16:24:10 -07003936/**
3937 * cgroup_addrm_files - add or remove files to a cgroup directory
Tejun Heo4df8dc92015-09-18 17:54:23 -04003938 * @css: the target css
3939 * @cgrp: the target cgroup (usually css->cgroup)
Tejun Heob1f28d32013-06-28 16:24:10 -07003940 * @cfts: array of cftypes to be added
3941 * @is_add: whether to add or remove
3942 *
3943 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
Tejun Heo6732ed82015-09-18 17:54:23 -04003944 * For removals, this function never fails.
Tejun Heob1f28d32013-06-28 16:24:10 -07003945 */
Tejun Heo4df8dc92015-09-18 17:54:23 -04003946static int cgroup_addrm_files(struct cgroup_subsys_state *css,
3947 struct cgroup *cgrp, struct cftype cfts[],
Tejun Heo2bb566c2013-08-08 20:11:23 -04003948 bool is_add)
Paul Menageddbcc7e2007-10-18 23:39:30 -07003949{
Tejun Heo6732ed82015-09-18 17:54:23 -04003950 struct cftype *cft, *cft_end = NULL;
Tejun Heob598dde2016-02-22 22:25:45 -05003951 int ret = 0;
Tejun Heob1f28d32013-06-28 16:24:10 -07003952
Tejun Heo01f64742014-05-13 12:19:23 -04003953 lockdep_assert_held(&cgroup_mutex);
Tejun Heodb0416b2012-04-01 12:09:55 -07003954
Tejun Heo6732ed82015-09-18 17:54:23 -04003955restart:
3956 for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
Gao fengf33fddc2012-12-06 14:38:57 +08003957 /* does cft->flags tell us to skip this file on @cgrp? */
Tejun Heo05ebb6e2014-07-15 11:05:10 -04003958 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
Tejun Heo8cbbf2c2014-03-19 10:23:55 -04003959 continue;
Tejun Heo05ebb6e2014-07-15 11:05:10 -04003960 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
Tejun Heo873fe092013-04-14 20:15:26 -07003961 continue;
Tejun Heod51f39b2014-05-16 13:22:48 -04003962 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
Gao fengf33fddc2012-12-06 14:38:57 +08003963 continue;
Tejun Heod51f39b2014-05-16 13:22:48 -04003964 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
Gao fengf33fddc2012-12-06 14:38:57 +08003965 continue;
Waiman Long5cf81142018-11-08 10:08:46 -05003966 if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
3967 continue;
Li Zefan2739d3c2013-01-21 18:18:33 +08003968 if (is_add) {
Tejun Heo4df8dc92015-09-18 17:54:23 -04003969 ret = cgroup_add_file(css, cgrp, cft);
Tejun Heob1f28d32013-06-28 16:24:10 -07003970 if (ret) {
Joe Perchesed3d2612014-04-25 18:28:03 -04003971 pr_warn("%s: failed to add %s, err=%d\n",
3972 __func__, cft->name, ret);
Tejun Heo6732ed82015-09-18 17:54:23 -04003973 cft_end = cft;
3974 is_add = false;
3975 goto restart;
Tejun Heob1f28d32013-06-28 16:24:10 -07003976 }
Li Zefan2739d3c2013-01-21 18:18:33 +08003977 } else {
3978 cgroup_rm_file(cgrp, cft);
Tejun Heodb0416b2012-04-01 12:09:55 -07003979 }
Paul Menageddbcc7e2007-10-18 23:39:30 -07003980 }
Tejun Heob598dde2016-02-22 22:25:45 -05003981 return ret;
Paul Menageddbcc7e2007-10-18 23:39:30 -07003982}
3983
Tejun Heo21a2d342014-02-12 09:29:49 -05003984static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
Tejun Heo8e3f6542012-04-01 12:09:55 -07003985{
Tejun Heo2bb566c2013-08-08 20:11:23 -04003986 struct cgroup_subsys *ss = cfts[0].ss;
Tejun Heo3dd06ff2014-03-19 10:23:54 -04003987 struct cgroup *root = &ss->root->cgrp;
Tejun Heo492eb212013-08-08 20:11:25 -04003988 struct cgroup_subsys_state *css;
Tejun Heo9ccece82013-06-28 16:24:11 -07003989 int ret = 0;
Tejun Heo8e3f6542012-04-01 12:09:55 -07003990
Tejun Heo01f64742014-05-13 12:19:23 -04003991 lockdep_assert_held(&cgroup_mutex);
Li Zefane8c82d22013-06-18 18:48:37 +08003992
Li Zefane8c82d22013-06-18 18:48:37 +08003993 /* add/rm files for all cgroups created before */
Tejun Heoca8bdca2013-08-26 18:40:56 -04003994 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
Tejun Heo492eb212013-08-08 20:11:25 -04003995 struct cgroup *cgrp = css->cgroup;
3996
Tejun Heo88cb04b2016-03-03 09:57:58 -05003997 if (!(css->flags & CSS_VISIBLE))
Li Zefane8c82d22013-06-18 18:48:37 +08003998 continue;
3999
Tejun Heo4df8dc92015-09-18 17:54:23 -04004000 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
Tejun Heo9ccece82013-06-28 16:24:11 -07004001 if (ret)
4002 break;
Tejun Heo8e3f6542012-04-01 12:09:55 -07004003 }
Tejun Heo21a2d342014-02-12 09:29:49 -05004004
4005 if (is_add && !ret)
4006 kernfs_activate(root->kn);
Tejun Heo9ccece82013-06-28 16:24:11 -07004007 return ret;
Tejun Heo8e3f6542012-04-01 12:09:55 -07004008}
4009
Tejun Heo2da440a2014-02-11 11:52:48 -05004010static void cgroup_exit_cftypes(struct cftype *cfts)
4011{
4012 struct cftype *cft;
4013
Tejun Heo2bd59d42014-02-11 11:52:49 -05004014 for (cft = cfts; cft->name[0] != '\0'; cft++) {
4015 /* free copy for custom atomic_write_len, see init_cftypes() */
4016 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
4017 kfree(cft->kf_ops);
4018 cft->kf_ops = NULL;
Tejun Heo2da440a2014-02-11 11:52:48 -05004019 cft->ss = NULL;
Tejun Heoa8ddc822014-07-15 11:05:10 -04004020
4021 /* revert flags set by cgroup core while adding @cfts */
Tejun Heo05ebb6e2014-07-15 11:05:10 -04004022 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
Tejun Heo2bd59d42014-02-11 11:52:49 -05004023 }
Tejun Heo2da440a2014-02-11 11:52:48 -05004024}
4025
Tejun Heo2bd59d42014-02-11 11:52:49 -05004026static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
Tejun Heo2da440a2014-02-11 11:52:48 -05004027{
4028 struct cftype *cft;
4029
Tejun Heo2bd59d42014-02-11 11:52:49 -05004030 for (cft = cfts; cft->name[0] != '\0'; cft++) {
4031 struct kernfs_ops *kf_ops;
4032
Tejun Heo0adb0702014-02-12 09:29:48 -05004033 WARN_ON(cft->ss || cft->kf_ops);
4034
Tejun Heo2bd59d42014-02-11 11:52:49 -05004035 if (cft->seq_start)
4036 kf_ops = &cgroup_kf_ops;
4037 else
4038 kf_ops = &cgroup_kf_single_ops;
4039
4040 /*
4041 * Ugh... if @cft wants a custom max_write_len, we need to
4042 * make a copy of kf_ops to set its atomic_write_len.
4043 */
4044 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
4045 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
4046 if (!kf_ops) {
4047 cgroup_exit_cftypes(cfts);
4048 return -ENOMEM;
4049 }
4050 kf_ops->atomic_write_len = cft->max_write_len;
4051 }
4052
4053 cft->kf_ops = kf_ops;
Tejun Heo2da440a2014-02-11 11:52:48 -05004054 cft->ss = ss;
Tejun Heo2bd59d42014-02-11 11:52:49 -05004055 }
4056
4057 return 0;
Tejun Heo2da440a2014-02-11 11:52:48 -05004058}
4059
Tejun Heo21a2d342014-02-12 09:29:49 -05004060static int cgroup_rm_cftypes_locked(struct cftype *cfts)
4061{
Tejun Heo01f64742014-05-13 12:19:23 -04004062 lockdep_assert_held(&cgroup_mutex);
Tejun Heo21a2d342014-02-12 09:29:49 -05004063
4064 if (!cfts || !cfts[0].ss)
4065 return -ENOENT;
4066
4067 list_del(&cfts->node);
4068 cgroup_apply_cftypes(cfts, false);
4069 cgroup_exit_cftypes(cfts);
4070 return 0;
4071}
4072
Tejun Heo8e3f6542012-04-01 12:09:55 -07004073/**
Tejun Heo80b13582014-02-12 09:29:48 -05004074 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
4075 * @cfts: zero-length name terminated array of cftypes
4076 *
4077 * Unregister @cfts. Files described by @cfts are removed from all
4078 * existing cgroups and all future cgroups won't have them either. This
4079 * function can be called anytime whether @cfts' subsys is attached or not.
4080 *
4081 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
4082 * registered.
4083 */
4084int cgroup_rm_cftypes(struct cftype *cfts)
4085{
Tejun Heo21a2d342014-02-12 09:29:49 -05004086 int ret;
Tejun Heo80b13582014-02-12 09:29:48 -05004087
Tejun Heo01f64742014-05-13 12:19:23 -04004088 mutex_lock(&cgroup_mutex);
Tejun Heo21a2d342014-02-12 09:29:49 -05004089 ret = cgroup_rm_cftypes_locked(cfts);
Tejun Heo01f64742014-05-13 12:19:23 -04004090 mutex_unlock(&cgroup_mutex);
Tejun Heo8e3f6542012-04-01 12:09:55 -07004091 return ret;
4092}
4093
4094/**
4095 * cgroup_add_cftypes - add an array of cftypes to a subsystem
4096 * @ss: target cgroup subsystem
4097 * @cfts: zero-length name terminated array of cftypes
4098 *
4099 * Register @cfts to @ss. Files described by @cfts are created for all
4100 * existing cgroups to which @ss is attached and all future cgroups will
4101 * have them too. This function can be called anytime whether @ss is
4102 * attached or not.
4103 *
4104 * Returns 0 on successful registration, -errno on failure. Note that this
4105 * function currently returns 0 as long as @cfts registration is successful
4106 * even if some file creation attempts on existing cgroups fail.
4107 */
Tejun Heo2cf669a2014-07-15 11:05:09 -04004108static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
Tejun Heo8e3f6542012-04-01 12:09:55 -07004109{
Tejun Heo9ccece82013-06-28 16:24:11 -07004110 int ret;
Tejun Heo8e3f6542012-04-01 12:09:55 -07004111
Tejun Heofc5ed1e2015-09-18 11:56:28 -04004112 if (!cgroup_ssid_enabled(ss->id))
Li Zefanc731ae12014-06-05 17:16:30 +08004113 return 0;
4114
Li Zefandc5736e2014-02-17 10:41:50 +08004115 if (!cfts || cfts[0].name[0] == '\0')
4116 return 0;
Tejun Heo8e3f6542012-04-01 12:09:55 -07004117
Tejun Heo2bd59d42014-02-11 11:52:49 -05004118 ret = cgroup_init_cftypes(ss, cfts);
Tejun Heo9ccece82013-06-28 16:24:11 -07004119 if (ret)
Tejun Heo2bd59d42014-02-11 11:52:49 -05004120 return ret;
Tejun Heo8e3f6542012-04-01 12:09:55 -07004121
Tejun Heo01f64742014-05-13 12:19:23 -04004122 mutex_lock(&cgroup_mutex);
Tejun Heo21a2d342014-02-12 09:29:49 -05004123
Tejun Heo0adb0702014-02-12 09:29:48 -05004124 list_add_tail(&cfts->node, &ss->cfts);
Tejun Heo21a2d342014-02-12 09:29:49 -05004125 ret = cgroup_apply_cftypes(cfts, true);
Tejun Heo9ccece82013-06-28 16:24:11 -07004126 if (ret)
Tejun Heo21a2d342014-02-12 09:29:49 -05004127 cgroup_rm_cftypes_locked(cfts);
4128
Tejun Heo01f64742014-05-13 12:19:23 -04004129 mutex_unlock(&cgroup_mutex);
Tejun Heo9ccece82013-06-28 16:24:11 -07004130 return ret;
Tejun Heo8e3f6542012-04-01 12:09:55 -07004131}
Tejun Heo79578622012-04-01 12:09:56 -07004132
4133/**
Tejun Heoa8ddc822014-07-15 11:05:10 -04004134 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
4135 * @ss: target cgroup subsystem
4136 * @cfts: zero-length name terminated array of cftypes
4137 *
4138 * Similar to cgroup_add_cftypes() but the added files are only used for
4139 * the default hierarchy.
4140 */
4141int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4142{
4143 struct cftype *cft;
4144
4145 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
Tejun Heo05ebb6e2014-07-15 11:05:10 -04004146 cft->flags |= __CFTYPE_ONLY_ON_DFL;
Tejun Heoa8ddc822014-07-15 11:05:10 -04004147 return cgroup_add_cftypes(ss, cfts);
4148}
4149
4150/**
4151 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
4152 * @ss: target cgroup subsystem
4153 * @cfts: zero-length name terminated array of cftypes
4154 *
4155 * Similar to cgroup_add_cftypes() but the added files are only used for
4156 * the legacy hierarchies.
4157 */
Tejun Heo2cf669a2014-07-15 11:05:09 -04004158int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4159{
Tejun Heoa8ddc822014-07-15 11:05:10 -04004160 struct cftype *cft;
4161
Tejun Heoe4b70372015-10-15 17:00:43 -04004162 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4163 cft->flags |= __CFTYPE_NOT_ON_DFL;
Tejun Heo2cf669a2014-07-15 11:05:09 -04004164 return cgroup_add_cftypes(ss, cfts);
4165}
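
/*
 * To illustrate the registration path above together with the handler
 * dispatch in cgroup_file_write() and cgroup_seqfile_show(): a
 * hypothetical controller (foo_cgrp_subsys, css_to_foo() and the limit
 * field are assumptions, not an existing subsystem) could expose a
 * single u64 knob as follows.
 */
#if 0 /* hypothetical sketch, not an in-tree controller */
static u64 foo_limit_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return css_to_foo(css)->limit;	/* served via cgroup_seqfile_show() */
}

static int foo_limit_write(struct cgroup_subsys_state *css, struct cftype *cft,
			   u64 val)
{
	css_to_foo(css)->limit = val;	/* reached via cgroup_file_write() */
	return 0;
}

static struct cftype foo_files[] = {
	{
		.name = "limit",
		.read_u64 = foo_limit_read,
		.write_u64 = foo_limit_write,
	},
	{ }	/* zero-length name terminates the array */
};

/* registration: cgroup_add_dfl_cftypes(&foo_cgrp_subsys, foo_files); */
#endif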
4166
Li Zefana043e3b2008-02-23 15:24:09 -08004167/**
Tejun Heo34c06252015-11-05 00:12:24 -05004168 * cgroup_file_notify - generate a file modified event for a cgroup_file
4169 * @cfile: target cgroup_file
4170 *
4171 * @cfile must have been obtained by setting cftype->file_offset.
4172 */
4173void cgroup_file_notify(struct cgroup_file *cfile)
4174{
4175 unsigned long flags;
4176
4177 spin_lock_irqsave(&cgroup_file_kn_lock, flags);
Tejun Heob12e3582018-04-26 14:29:04 -07004178 if (cfile->kn) {
4179 unsigned long last = cfile->notified_at;
4180 unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
4181
4182 if (time_in_range(jiffies, last, next)) {
4183 timer_reduce(&cfile->notify_timer, next);
4184 } else {
4185 kernfs_notify(cfile->kn);
4186 cfile->notified_at = jiffies;
4187 }
4188 }
Tejun Heo34c06252015-11-05 00:12:24 -05004189 spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
4190}
4191
4192/**
Tejun Heo492eb212013-08-08 20:11:25 -04004193 * css_next_child - find the next child of a given css
Tejun Heoc2931b72014-05-16 13:22:51 -04004194 * @pos: the current position (%NULL to initiate traversal)
4195 * @parent: css whose children to walk
Tejun Heo53fa5262013-05-24 10:55:38 +09004196 *
Tejun Heoc2931b72014-05-16 13:22:51 -04004197 * This function returns the next child of @parent and should be called
Tejun Heo87fb54f2013-12-06 15:11:55 -05004198 * under either cgroup_mutex or RCU read lock. The only requirement is
Tejun Heoc2931b72014-05-16 13:22:51 -04004199 * that @parent and @pos are accessible. The next sibling is guaranteed to
4200 * be returned regardless of their states.
4201 *
4202 * If a subsystem synchronizes ->css_online() and the start of iteration, a
4203 * css which finished ->css_online() is guaranteed to be visible in the
4204 * future iterations and will stay visible until the last reference is put.
4205 * A css which hasn't finished ->css_online() or already finished
4206 * ->css_offline() may show up during traversal. It's each subsystem's
4207 * responsibility to synchronize against on/offlining.
Tejun Heo53fa5262013-05-24 10:55:38 +09004208 */
Tejun Heoc2931b72014-05-16 13:22:51 -04004209struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
4210 struct cgroup_subsys_state *parent)
Tejun Heo53fa5262013-05-24 10:55:38 +09004211{
Tejun Heoc2931b72014-05-16 13:22:51 -04004212 struct cgroup_subsys_state *next;
Tejun Heo53fa5262013-05-24 10:55:38 +09004213
Tejun Heo8353da12014-05-13 12:19:23 -04004214 cgroup_assert_mutex_or_rcu_locked();
Tejun Heo53fa5262013-05-24 10:55:38 +09004215
4216 /*
Tejun Heode3f0342014-05-16 13:22:49 -04004217 * @pos could already have been unlinked from the sibling list.
4218 * Once a cgroup is removed, its ->sibling.next is no longer
4219 * updated when its next sibling changes. CSS_RELEASED is set when
4220	 * @pos is taken off the list, at which time its next pointer is valid,
4221 * and, as releases are serialized, the one pointed to by the next
4222 * pointer is guaranteed to not have started release yet. This
4223 * implies that if we observe !CSS_RELEASED on @pos in this RCU
4224 * critical section, the one pointed to by its next pointer is
4225 * guaranteed to not have finished its RCU grace period even if we
Bhaskar Chowdhury58315c92020-11-09 16:01:11 +05304226 * have dropped rcu_read_lock() in-between iterations.
Tejun Heo3b287a52013-08-08 20:11:24 -04004227 *
Tejun Heode3f0342014-05-16 13:22:49 -04004228 * If @pos has CSS_RELEASED set, its next pointer can't be
4229 * dereferenced; however, as each css is given a monotonically
4230 * increasing unique serial number and always appended to the
4231 * sibling list, the next one can be found by walking the parent's
4232 * children until the first css with higher serial number than
4233 * @pos's. While this path can be slower, it happens iff iteration
4234 * races against release and the race window is very small.
Tejun Heo53fa5262013-05-24 10:55:38 +09004235 */
Tejun Heo3b287a52013-08-08 20:11:24 -04004236 if (!pos) {
Tejun Heoc2931b72014-05-16 13:22:51 -04004237 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
4238 } else if (likely(!(pos->flags & CSS_RELEASED))) {
4239 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
Tejun Heo3b287a52013-08-08 20:11:24 -04004240 } else {
Madhuparna Bhowmik3010c5b2020-01-18 08:40:51 +05304241 list_for_each_entry_rcu(next, &parent->children, sibling,
4242 lockdep_is_held(&cgroup_mutex))
Tejun Heo3b287a52013-08-08 20:11:24 -04004243 if (next->serial_nr > pos->serial_nr)
4244 break;
Tejun Heo53fa5262013-05-24 10:55:38 +09004245 }
4246
Tejun Heo3b281af2014-04-23 11:13:15 -04004247 /*
4248 * @next, if not pointing to the head, can be dereferenced and is
Tejun Heoc2931b72014-05-16 13:22:51 -04004249 * the next sibling.
Tejun Heo3b281af2014-04-23 11:13:15 -04004250 */
Tejun Heoc2931b72014-05-16 13:22:51 -04004251 if (&next->sibling != &parent->children)
4252 return next;
Tejun Heo3b281af2014-04-23 11:13:15 -04004253 return NULL;
Tejun Heo53fa5262013-05-24 10:55:38 +09004254}
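
/*
 * A minimal sketch of the contract documented above: child iteration
 * only needs @parent and @pos to stay accessible, so an RCU read-side
 * section suffices (css_for_each_child() wraps css_next_child()).
 */
#if 0 /* illustrative sketch */
static int count_online_children(struct cgroup_subsys_state *parent)
{
	struct cgroup_subsys_state *child;
	int n = 0;

	rcu_read_lock();
	css_for_each_child(child, parent)
		if (child->flags & CSS_ONLINE)
			n++;
	rcu_read_unlock();
	return n;
}
#endif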
Tejun Heo53fa5262013-05-24 10:55:38 +09004255
4256/**
Tejun Heo492eb212013-08-08 20:11:25 -04004257 * css_next_descendant_pre - find the next descendant for pre-order walk
Tejun Heo574bd9f2012-11-09 09:12:29 -08004258 * @pos: the current position (%NULL to initiate traversal)
Tejun Heo492eb212013-08-08 20:11:25 -04004259 * @root: css whose descendants to walk
Tejun Heo574bd9f2012-11-09 09:12:29 -08004260 *
Tejun Heo492eb212013-08-08 20:11:25 -04004261 * To be used by css_for_each_descendant_pre(). Find the next descendant
Tejun Heobd8815a2013-08-08 20:11:27 -04004262 * to visit for pre-order traversal of @root's descendants. @root is
4263 * included in the iteration and the first node to be visited.
Tejun Heo75501a62013-05-24 10:55:38 +09004264 *
Tejun Heo87fb54f2013-12-06 15:11:55 -05004265 * While this function requires cgroup_mutex or RCU read locking, it
4266 * doesn't require the whole traversal to be contained in a single critical
4267 * section. This function will return the correct next descendant as long
4268 * as both @pos and @root are accessible and @pos is a descendant of @root.
Tejun Heoc2931b72014-05-16 13:22:51 -04004269 *
4270 * If a subsystem synchronizes ->css_online() and the start of iteration, a
4271 * css which finished ->css_online() is guaranteed to be visible in the
4272 * future iterations and will stay visible until the last reference is put.
4273 * A css which hasn't finished ->css_online() or already finished
4274 * ->css_offline() may show up during traversal. It's each subsystem's
4275 * responsibility to synchronize against on/offlining.
Tejun Heo574bd9f2012-11-09 09:12:29 -08004276 */
Tejun Heo492eb212013-08-08 20:11:25 -04004277struct cgroup_subsys_state *
4278css_next_descendant_pre(struct cgroup_subsys_state *pos,
4279 struct cgroup_subsys_state *root)
Tejun Heo574bd9f2012-11-09 09:12:29 -08004280{
Tejun Heo492eb212013-08-08 20:11:25 -04004281 struct cgroup_subsys_state *next;
Tejun Heo574bd9f2012-11-09 09:12:29 -08004282
Tejun Heo8353da12014-05-13 12:19:23 -04004283 cgroup_assert_mutex_or_rcu_locked();
Tejun Heo574bd9f2012-11-09 09:12:29 -08004284
Tejun Heobd8815a2013-08-08 20:11:27 -04004285 /* if first iteration, visit @root */
Tejun Heo7805d002013-05-24 10:50:24 +09004286 if (!pos)
Tejun Heobd8815a2013-08-08 20:11:27 -04004287 return root;
Tejun Heo574bd9f2012-11-09 09:12:29 -08004288
4289 /* visit the first child if exists */
Tejun Heo492eb212013-08-08 20:11:25 -04004290 next = css_next_child(NULL, pos);
Tejun Heo574bd9f2012-11-09 09:12:29 -08004291 if (next)
4292 return next;
4293
4294 /* no child, visit my or the closest ancestor's next sibling */
Tejun Heo492eb212013-08-08 20:11:25 -04004295 while (pos != root) {
Tejun Heo5c9d5352014-05-16 13:22:48 -04004296 next = css_next_child(pos, pos->parent);
Tejun Heo75501a62013-05-24 10:55:38 +09004297 if (next)
Tejun Heo574bd9f2012-11-09 09:12:29 -08004298 return next;
Tejun Heo5c9d5352014-05-16 13:22:48 -04004299 pos = pos->parent;
Tejun Heo7805d002013-05-24 10:50:24 +09004300 }
Tejun Heo574bd9f2012-11-09 09:12:29 -08004301
4302 return NULL;
4303}
Christoph Hellwig474a2802019-06-21 10:22:48 +02004304EXPORT_SYMBOL_GPL(css_next_descendant_pre);
Tejun Heo574bd9f2012-11-09 09:12:29 -08004305
Tejun Heo12a9d2f2013-01-07 08:49:33 -08004306/**
Tejun Heo492eb212013-08-08 20:11:25 -04004307 * css_rightmost_descendant - return the rightmost descendant of a css
4308 * @pos: css of interest
Tejun Heo12a9d2f2013-01-07 08:49:33 -08004309 *
Tejun Heo492eb212013-08-08 20:11:25 -04004310 * Return the rightmost descendant of @pos. If there's no descendant, @pos
4311 * is returned. This can be used during pre-order traversal to skip
Tejun Heo12a9d2f2013-01-07 08:49:33 -08004312 * subtree of @pos.
Tejun Heo75501a62013-05-24 10:55:38 +09004313 *
Tejun Heo87fb54f2013-12-06 15:11:55 -05004314 * While this function requires cgroup_mutex or RCU read locking, it
4315 * doesn't require the whole traversal to be contained in a single critical
4316 * section. This function will return the correct rightmost descendant as
4317 * long as @pos is accessible.
Tejun Heo12a9d2f2013-01-07 08:49:33 -08004318 */
Tejun Heo492eb212013-08-08 20:11:25 -04004319struct cgroup_subsys_state *
4320css_rightmost_descendant(struct cgroup_subsys_state *pos)
Tejun Heo12a9d2f2013-01-07 08:49:33 -08004321{
Tejun Heo492eb212013-08-08 20:11:25 -04004322 struct cgroup_subsys_state *last, *tmp;
Tejun Heo12a9d2f2013-01-07 08:49:33 -08004323
Tejun Heo8353da12014-05-13 12:19:23 -04004324 cgroup_assert_mutex_or_rcu_locked();
Tejun Heo12a9d2f2013-01-07 08:49:33 -08004325
4326 do {
4327 last = pos;
4328 /* ->prev isn't RCU safe, walk ->next till the end */
4329 pos = NULL;
Tejun Heo492eb212013-08-08 20:11:25 -04004330 css_for_each_child(tmp, last)
Tejun Heo12a9d2f2013-01-07 08:49:33 -08004331 pos = tmp;
4332 } while (pos);
4333
4334 return last;
4335}
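
/*
 * Combining the two helpers above: a sketch of a pre-order walk that
 * prunes entire subtrees, which is the use case
 * css_rightmost_descendant() exists for. subtree_is_uninteresting()
 * and visit() are assumed callbacks.
 */
#if 0 /* illustrative sketch */
static void walk_and_prune(struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *pos = NULL;

	rcu_read_lock();
	while ((pos = css_next_descendant_pre(pos, root))) {
		if (subtree_is_uninteresting(pos))
			/* jump to the subtree's last node; the next
			 * pre-order step then leaves the subtree */
			pos = css_rightmost_descendant(pos);
		else
			visit(pos);
	}
	rcu_read_unlock();
}
#endif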
Tejun Heo12a9d2f2013-01-07 08:49:33 -08004336
Tejun Heo492eb212013-08-08 20:11:25 -04004337static struct cgroup_subsys_state *
4338css_leftmost_descendant(struct cgroup_subsys_state *pos)
Tejun Heo574bd9f2012-11-09 09:12:29 -08004339{
Tejun Heo492eb212013-08-08 20:11:25 -04004340 struct cgroup_subsys_state *last;
Tejun Heo574bd9f2012-11-09 09:12:29 -08004341
4342 do {
4343 last = pos;
Tejun Heo492eb212013-08-08 20:11:25 -04004344 pos = css_next_child(NULL, pos);
Tejun Heo574bd9f2012-11-09 09:12:29 -08004345 } while (pos);
4346
4347 return last;
4348}
4349
4350/**
Tejun Heo492eb212013-08-08 20:11:25 -04004351 * css_next_descendant_post - find the next descendant for post-order walk
Tejun Heo574bd9f2012-11-09 09:12:29 -08004352 * @pos: the current position (%NULL to initiate traversal)
Tejun Heo492eb212013-08-08 20:11:25 -04004353 * @root: css whose descendants to walk
Tejun Heo574bd9f2012-11-09 09:12:29 -08004354 *
Tejun Heo492eb212013-08-08 20:11:25 -04004355 * To be used by css_for_each_descendant_post(). Find the next descendant
Tejun Heobd8815a2013-08-08 20:11:27 -04004356 * to visit for post-order traversal of @root's descendants. @root is
4357 * included in the iteration and the last node to be visited.
Tejun Heo75501a62013-05-24 10:55:38 +09004358 *
Tejun Heo87fb54f2013-12-06 15:11:55 -05004359 * While this function requires cgroup_mutex or RCU read locking, it
4360 * doesn't require the whole traversal to be contained in a single critical
4361 * section. This function will return the correct next descendant as long
4362 * as both @pos and @cgroup are accessible and @pos is a descendant of
4363 * @cgroup.
Tejun Heoc2931b72014-05-16 13:22:51 -04004364 *
4365 * If a subsystem synchronizes ->css_online() and the start of iteration, a
4366 * css which finished ->css_online() is guaranteed to be visible in the
4367 * future iterations and will stay visible until the last reference is put.
4368 * A css which hasn't finished ->css_online() or already finished
4369 * ->css_offline() may show up during traversal. It's each subsystem's
4370 * responsibility to synchronize against on/offlining.
Tejun Heo574bd9f2012-11-09 09:12:29 -08004371 */
Tejun Heo492eb212013-08-08 20:11:25 -04004372struct cgroup_subsys_state *
4373css_next_descendant_post(struct cgroup_subsys_state *pos,
4374 struct cgroup_subsys_state *root)
Tejun Heo574bd9f2012-11-09 09:12:29 -08004375{
Tejun Heo492eb212013-08-08 20:11:25 -04004376 struct cgroup_subsys_state *next;
Tejun Heo574bd9f2012-11-09 09:12:29 -08004377
Tejun Heo8353da12014-05-13 12:19:23 -04004378 cgroup_assert_mutex_or_rcu_locked();
Tejun Heo574bd9f2012-11-09 09:12:29 -08004379
Tejun Heo58b79a92013-09-06 15:31:08 -04004380 /* if first iteration, visit leftmost descendant which may be @root */
4381 if (!pos)
4382 return css_leftmost_descendant(root);
Tejun Heo574bd9f2012-11-09 09:12:29 -08004383
Tejun Heobd8815a2013-08-08 20:11:27 -04004384 /* if we visited @root, we're done */
4385 if (pos == root)
4386 return NULL;
4387
Tejun Heo574bd9f2012-11-09 09:12:29 -08004388 /* if there's an unvisited sibling, visit its leftmost descendant */
Tejun Heo5c9d5352014-05-16 13:22:48 -04004389 next = css_next_child(pos, pos->parent);
Tejun Heo75501a62013-05-24 10:55:38 +09004390 if (next)
Tejun Heo492eb212013-08-08 20:11:25 -04004391 return css_leftmost_descendant(next);
Tejun Heo574bd9f2012-11-09 09:12:29 -08004392
4393 /* no sibling left, visit parent */
Tejun Heo5c9d5352014-05-16 13:22:48 -04004394 return pos->parent;
Tejun Heo574bd9f2012-11-09 09:12:29 -08004395}
Tejun Heo574bd9f2012-11-09 09:12:29 -08004396
Tejun Heof3d46502014-05-16 13:22:52 -04004397/**
4398 * css_has_online_children - does a css have online children
4399 * @css: the target css
4400 *
4401 * Returns %true if @css has any online children; otherwise, %false. This
4402 * function can be called from any context but the caller is responsible
4403 * for synchronizing against on/offlining as necessary.
4404 */
4405bool css_has_online_children(struct cgroup_subsys_state *css)
Tejun Heocbc125e2014-05-14 09:15:01 -04004406{
Tejun Heof3d46502014-05-16 13:22:52 -04004407 struct cgroup_subsys_state *child;
4408 bool ret = false;
Tejun Heocbc125e2014-05-14 09:15:01 -04004409
4410 rcu_read_lock();
Tejun Heof3d46502014-05-16 13:22:52 -04004411 css_for_each_child(child, css) {
Li Zefan99bae5f2014-06-12 14:31:31 +08004412 if (child->flags & CSS_ONLINE) {
Tejun Heof3d46502014-05-16 13:22:52 -04004413 ret = true;
4414 break;
Tejun Heocbc125e2014-05-14 09:15:01 -04004415 }
4416 }
4417 rcu_read_unlock();
Tejun Heof3d46502014-05-16 13:22:52 -04004418 return ret;
Cliff Wickman31a7df02008-02-07 00:14:42 -08004419}
4420
Tejun Heo450ee0c2017-05-15 09:34:03 -04004421static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
4422{
4423 struct list_head *l;
4424 struct cgrp_cset_link *link;
4425 struct css_set *cset;
4426
4427 lockdep_assert_held(&css_set_lock);
4428
4429 /* find the next threaded cset */
4430 if (it->tcset_pos) {
4431 l = it->tcset_pos->next;
4432
4433 if (l != it->tcset_head) {
4434 it->tcset_pos = l;
4435 return container_of(l, struct css_set,
4436 threaded_csets_node);
4437 }
4438
4439 it->tcset_pos = NULL;
4440 }
4441
4442 /* find the next cset */
4443 l = it->cset_pos;
4444 l = l->next;
4445 if (l == it->cset_head) {
4446 it->cset_pos = NULL;
4447 return NULL;
4448 }
4449
4450 if (it->ss) {
4451 cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
4452 } else {
4453 link = list_entry(l, struct cgrp_cset_link, cset_link);
4454 cset = link->cset;
4455 }
4456
4457 it->cset_pos = l;
4458
4459 /* initialize threaded css_set walking */
4460 if (it->flags & CSS_TASK_ITER_THREADED) {
4461 if (it->cur_dcset)
4462 put_css_set_locked(it->cur_dcset);
4463 it->cur_dcset = cset;
4464 get_css_set(cset);
4465
4466 it->tcset_head = &cset->threaded_csets;
4467 it->tcset_pos = &cset->threaded_csets;
4468 }
4469
4470 return cset;
4471}
4472
Tejun Heo0942eee2013-08-08 20:11:26 -04004473/**
Bhaskar Chowdhury58315c92020-11-09 16:01:11 +05304474 * css_task_iter_advance_css_set - advance a task iterator to the next css_set
Tejun Heo0942eee2013-08-08 20:11:26 -04004475 * @it: the iterator to advance
4476 *
4477 * Advance @it to the next css_set to walk.
Tejun Heod5158762013-08-08 20:11:26 -04004478 */
Tejun Heoecb9d532015-10-15 16:41:52 -04004479static void css_task_iter_advance_css_set(struct css_task_iter *it)
Tejun Heod5158762013-08-08 20:11:26 -04004480{
Tejun Heod5158762013-08-08 20:11:26 -04004481 struct css_set *cset;
4482
Tejun Heof0d9a5f2015-10-15 16:41:53 -04004483 lockdep_assert_held(&css_set_lock);
Tejun Heoed27b9f2015-10-15 16:41:52 -04004484
Michal Koutnýf43caa22020-01-24 12:40:16 +01004485	/* Advance to the next non-empty css_set and find the first non-empty tasks list */
4486 while ((cset = css_task_iter_next_css_set(it))) {
4487 if (!list_empty(&cset->tasks)) {
4488 it->cur_tasks_head = &cset->tasks;
4489 break;
4490 } else if (!list_empty(&cset->mg_tasks)) {
4491 it->cur_tasks_head = &cset->mg_tasks;
4492 break;
4493 } else if (!list_empty(&cset->dying_tasks)) {
4494 it->cur_tasks_head = &cset->dying_tasks;
4495 break;
Tejun Heod5158762013-08-08 20:11:26 -04004496 }
Michal Koutný9c974c72020-01-24 12:40:15 +01004497 }
Michal Koutnýf43caa22020-01-24 12:40:16 +01004498 if (!cset) {
4499 it->task_pos = NULL;
4500 return;
4501 }
4502 it->task_pos = it->cur_tasks_head->next;
Tejun Heoed27b9f2015-10-15 16:41:52 -04004503
4504 /*
4505 * We don't keep css_sets locked across iteration steps and thus
4506 * need to take steps to ensure that iteration can be resumed after
4507 * the lock is re-acquired. Iteration is performed at two levels -
4508 * css_sets and tasks in them.
4509 *
4510 * Once created, a css_set never leaves its cgroup lists, so a
4511 * pinned css_set is guaranteed to stay put and we can resume
4512 * iteration afterwards.
4513 *
4514 * Tasks may leave @cset across iteration steps. This is resolved
4515 * by registering each iterator with the css_set currently being
4516 * walked and making css_set_move_task() advance iterators whose
4517 * next task is leaving.
4518 */
4519 if (it->cur_cset) {
4520 list_del(&it->iters_node);
4521 put_css_set_locked(it->cur_cset);
4522 }
4523 get_css_set(cset);
4524 it->cur_cset = cset;
4525 list_add(&it->iters_node, &cset->task_iters);
Tejun Heod5158762013-08-08 20:11:26 -04004526}
4527
Tejun Heob636fd32019-05-31 10:38:58 -07004528static void css_task_iter_skip(struct css_task_iter *it,
4529 struct task_struct *task)
4530{
4531 lockdep_assert_held(&css_set_lock);
4532
4533 if (it->task_pos == &task->cg_list) {
4534 it->task_pos = it->task_pos->next;
4535 it->flags |= CSS_TASK_ITER_SKIPPED;
4536 }
4537}
4538
Tejun Heoecb9d532015-10-15 16:41:52 -04004539static void css_task_iter_advance(struct css_task_iter *it)
4540{
Tejun Heoc03cd772019-05-31 10:38:58 -07004541 struct task_struct *task;
Tejun Heoecb9d532015-10-15 16:41:52 -04004542
Tejun Heof0d9a5f2015-10-15 16:41:53 -04004543 lockdep_assert_held(&css_set_lock);
Tejun Heobc2fb7e2017-05-15 09:34:01 -04004544repeat:
Tejun Heoe9d81a12018-11-08 12:15:15 -08004545 if (it->task_pos) {
4546 /*
Michal Koutnýf43caa22020-01-24 12:40:16 +01004547		 * Advance the iterator to find the next entry. We go through
4548		 * cset tasks, mg_tasks and dying_tasks; when those are consumed
4549		 * we move on to the next cset.
Tejun Heoe9d81a12018-11-08 12:15:15 -08004550 */
Tejun Heob636fd32019-05-31 10:38:58 -07004551 if (it->flags & CSS_TASK_ITER_SKIPPED)
4552 it->flags &= ~CSS_TASK_ITER_SKIPPED;
Tejun Heoe9d81a12018-11-08 12:15:15 -08004553 else
Tejun Heob636fd32019-05-31 10:38:58 -07004554 it->task_pos = it->task_pos->next;
4555
Michal Koutnýf43caa22020-01-24 12:40:16 +01004556 if (it->task_pos == &it->cur_cset->tasks) {
4557 it->cur_tasks_head = &it->cur_cset->mg_tasks;
4558 it->task_pos = it->cur_tasks_head->next;
Michal Koutný9c974c72020-01-24 12:40:15 +01004559 }
Michal Koutnýf43caa22020-01-24 12:40:16 +01004560 if (it->task_pos == &it->cur_cset->mg_tasks) {
4561 it->cur_tasks_head = &it->cur_cset->dying_tasks;
4562 it->task_pos = it->cur_tasks_head->next;
Michal Koutný9c974c72020-01-24 12:40:15 +01004563 }
Michal Koutnýf43caa22020-01-24 12:40:16 +01004564 if (it->task_pos == &it->cur_cset->dying_tasks)
Tejun Heob636fd32019-05-31 10:38:58 -07004565 css_task_iter_advance_css_set(it);
Tejun Heoe9d81a12018-11-08 12:15:15 -08004566 } else {
4567 /* called from start, proceed to the first cset */
Tejun Heoecb9d532015-10-15 16:41:52 -04004568 css_task_iter_advance_css_set(it);
Tejun Heoe9d81a12018-11-08 12:15:15 -08004569 }
Tejun Heobc2fb7e2017-05-15 09:34:01 -04004570
Tejun Heoc03cd772019-05-31 10:38:58 -07004571 if (!it->task_pos)
4572 return;
4573
4574 task = list_entry(it->task_pos, struct task_struct, cg_list);
4575
4576 if (it->flags & CSS_TASK_ITER_PROCS) {
4577 /* if PROCS, skip over tasks which aren't group leaders */
4578 if (!thread_group_leader(task))
4579 goto repeat;
4580
4581 /* and dying leaders w/o live member threads */
Michal Koutnýf43caa22020-01-24 12:40:16 +01004582 if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
Michal Koutný9c974c72020-01-24 12:40:15 +01004583 !atomic_read(&task->signal->live))
Tejun Heoc03cd772019-05-31 10:38:58 -07004584 goto repeat;
4585 } else {
4586 /* skip all dying ones */
Michal Koutnýf43caa22020-01-24 12:40:16 +01004587 if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
Tejun Heoc03cd772019-05-31 10:38:58 -07004588 goto repeat;
4589 }
Tejun Heoecb9d532015-10-15 16:41:52 -04004590}
4591
Tejun Heo0942eee2013-08-08 20:11:26 -04004592/**
Tejun Heo72ec7022013-08-08 20:11:26 -04004593 * css_task_iter_start - initiate task iteration
4594 * @css: the css to walk tasks of
Tejun Heobc2fb7e2017-05-15 09:34:01 -04004595 * @flags: CSS_TASK_ITER_* flags
Tejun Heo0942eee2013-08-08 20:11:26 -04004596 * @it: the task iterator to use
4597 *
Tejun Heo72ec7022013-08-08 20:11:26 -04004598 * Initiate iteration through the tasks of @css. The caller can call
4599 * css_task_iter_next() to walk through the tasks until the function
4600 * returns NULL. On completion of iteration, css_task_iter_end() must be
4601 * called.
Tejun Heo0942eee2013-08-08 20:11:26 -04004602 */
Tejun Heobc2fb7e2017-05-15 09:34:01 -04004603void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
Tejun Heo72ec7022013-08-08 20:11:26 -04004604 struct css_task_iter *it)
Paul Menage817929e2007-10-18 23:39:36 -07004605{
Tejun Heoed27b9f2015-10-15 16:41:52 -04004606 memset(it, 0, sizeof(*it));
4607
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03004608 spin_lock_irq(&css_set_lock);
Tejun Heoc59cd3d2013-08-08 20:11:26 -04004609
Tejun Heo3ebb2b62014-04-23 11:13:15 -04004610 it->ss = css->ss;
Tejun Heobc2fb7e2017-05-15 09:34:01 -04004611 it->flags = flags;
Tejun Heo3ebb2b62014-04-23 11:13:15 -04004612
4613 if (it->ss)
4614 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
4615 else
4616 it->cset_pos = &css->cgroup->cset_links;
4617
Tejun Heo0f0a2b42014-04-23 11:13:15 -04004618 it->cset_head = it->cset_pos;
Tejun Heoc59cd3d2013-08-08 20:11:26 -04004619
Tejun Heoe9d81a12018-11-08 12:15:15 -08004620 css_task_iter_advance(it);
Tejun Heoed27b9f2015-10-15 16:41:52 -04004621
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03004622 spin_unlock_irq(&css_set_lock);
Paul Menagebd89aab2007-10-18 23:40:44 -07004623}
Paul Menage817929e2007-10-18 23:39:36 -07004624
Tejun Heo0942eee2013-08-08 20:11:26 -04004625/**
Tejun Heo72ec7022013-08-08 20:11:26 -04004626 * css_task_iter_next - return the next task for the iterator
Tejun Heo0942eee2013-08-08 20:11:26 -04004627 * @it: the task iterator being iterated
4628 *
4629 * The "next" function for task iteration. @it should have been
Tejun Heo72ec7022013-08-08 20:11:26 -04004630 * initialized via css_task_iter_start(). Returns NULL when the iteration
4631 * reaches the end.
Tejun Heo0942eee2013-08-08 20:11:26 -04004632 */
Tejun Heo72ec7022013-08-08 20:11:26 -04004633struct task_struct *css_task_iter_next(struct css_task_iter *it)
Paul Menage817929e2007-10-18 23:39:36 -07004634{
Tejun Heod5745672015-10-29 11:43:05 +09004635 if (it->cur_task) {
Tejun Heoed27b9f2015-10-15 16:41:52 -04004636 put_task_struct(it->cur_task);
Tejun Heod5745672015-10-29 11:43:05 +09004637 it->cur_task = NULL;
4638 }
Tejun Heoed27b9f2015-10-15 16:41:52 -04004639
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03004640 spin_lock_irq(&css_set_lock);
Tejun Heoed27b9f2015-10-15 16:41:52 -04004641
Tejun Heocee0c332019-06-05 09:54:34 -07004642 /* @it may be half-advanced by skips, finish advancing */
4643 if (it->flags & CSS_TASK_ITER_SKIPPED)
4644 css_task_iter_advance(it);
4645
Tejun Heod5745672015-10-29 11:43:05 +09004646 if (it->task_pos) {
4647 it->cur_task = list_entry(it->task_pos, struct task_struct,
4648 cg_list);
4649 get_task_struct(it->cur_task);
4650 css_task_iter_advance(it);
4651 }
Tejun Heoed27b9f2015-10-15 16:41:52 -04004652
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03004653 spin_unlock_irq(&css_set_lock);
Tejun Heoed27b9f2015-10-15 16:41:52 -04004654
4655 return it->cur_task;
Paul Menage817929e2007-10-18 23:39:36 -07004656}
4657
Tejun Heo0942eee2013-08-08 20:11:26 -04004658/**
Tejun Heo72ec7022013-08-08 20:11:26 -04004659 * css_task_iter_end - finish task iteration
Tejun Heo0942eee2013-08-08 20:11:26 -04004660 * @it: the task iterator to finish
4661 *
Tejun Heo72ec7022013-08-08 20:11:26 -04004662 * Finish task iteration started by css_task_iter_start().
Tejun Heo0942eee2013-08-08 20:11:26 -04004663 */
Tejun Heo72ec7022013-08-08 20:11:26 -04004664void css_task_iter_end(struct css_task_iter *it)
Paul Menage817929e2007-10-18 23:39:36 -07004665{
Tejun Heoed27b9f2015-10-15 16:41:52 -04004666 if (it->cur_cset) {
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03004667 spin_lock_irq(&css_set_lock);
Tejun Heoed27b9f2015-10-15 16:41:52 -04004668 list_del(&it->iters_node);
4669 put_css_set_locked(it->cur_cset);
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03004670 spin_unlock_irq(&css_set_lock);
Tejun Heoed27b9f2015-10-15 16:41:52 -04004671 }
4672
Tejun Heo450ee0c2017-05-15 09:34:03 -04004673 if (it->cur_dcset)
4674 put_css_set(it->cur_dcset);
4675
Tejun Heoed27b9f2015-10-15 16:41:52 -04004676 if (it->cur_task)
4677 put_task_struct(it->cur_task);
Tejun Heo8cc99342013-04-07 09:29:50 -07004678}
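
/*
 * __cgroup_kill() earlier in this file is a real user of this
 * iterator; an even smaller sketch, counting every thread in a css:
 */
#if 0 /* illustrative sketch */
static int count_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int n = 0;

	css_task_iter_start(css, 0, &it);	/* 0: all threads, no PROCS */
	while ((task = css_task_iter_next(&it)))
		n++;
	css_task_iter_end(&it);
	return n;
}
#endif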
4679
Tejun Heob4b90a82016-12-27 14:49:04 -05004680static void cgroup_procs_release(struct kernfs_open_file *of)
Tejun Heo8cc99342013-04-07 09:29:50 -07004681{
Tejun Heob4b90a82016-12-27 14:49:04 -05004682 if (of->priv) {
4683 css_task_iter_end(of->priv);
4684 kfree(of->priv);
4685 }
4686}
4687
4688static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
4689{
4690 struct kernfs_open_file *of = s->private;
4691 struct css_task_iter *it = of->priv;
Tejun Heoe406d1c2014-02-13 06:58:39 -05004692
Vasily Averin2d4ecb02020-01-30 13:34:59 +03004693 if (pos)
4694 (*pos)++;
4695
Tejun Heobc2fb7e2017-05-15 09:34:01 -04004696 return css_task_iter_next(it);
Tejun Heo8cc99342013-04-07 09:29:50 -07004697}
4698
Tejun Heo8cfd8142017-07-21 11:14:51 -04004699static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
4700 unsigned int iter_flags)
Ben Blumd1d9fd32009-09-23 15:56:28 -07004701{
Tejun Heo2bd59d42014-02-11 11:52:49 -05004702 struct kernfs_open_file *of = s->private;
Tejun Heo7da11272013-12-05 12:28:04 -05004703 struct cgroup *cgrp = seq_css(s)->cgroup;
Tejun Heob4b90a82016-12-27 14:49:04 -05004704 struct css_task_iter *it = of->priv;
Tejun Heo4bac00d2013-11-29 10:42:59 -05004705
4706 /*
Tejun Heob4b90a82016-12-27 14:49:04 -05004707	 * When a seq_file is seeked, it's always traversed sequentially
4708	 * from position 0, so we can simply keep iterating on non-zero *pos.
Tejun Heo4bac00d2013-11-29 10:42:59 -05004709 */
Tejun Heob4b90a82016-12-27 14:49:04 -05004710 if (!it) {
Vasily Averin2d4ecb02020-01-30 13:34:59 +03004711 if (WARN_ON_ONCE((*pos)))
Tejun Heob4b90a82016-12-27 14:49:04 -05004712 return ERR_PTR(-EINVAL);
Tejun Heo4bac00d2013-11-29 10:42:59 -05004713
Tejun Heob4b90a82016-12-27 14:49:04 -05004714 it = kzalloc(sizeof(*it), GFP_KERNEL);
4715 if (!it)
4716 return ERR_PTR(-ENOMEM);
4717 of->priv = it;
Tejun Heo450ee0c2017-05-15 09:34:03 -04004718 css_task_iter_start(&cgrp->self, iter_flags, it);
Vasily Averin2d4ecb02020-01-30 13:34:59 +03004719 } else if (!(*pos)) {
Tejun Heob4b90a82016-12-27 14:49:04 -05004720 css_task_iter_end(it);
Tejun Heo450ee0c2017-05-15 09:34:03 -04004721 css_task_iter_start(&cgrp->self, iter_flags, it);
Vasily Averin2d4ecb02020-01-30 13:34:59 +03004722 } else
4723 return it->cur_task;
Tejun Heo4bac00d2013-11-29 10:42:59 -05004724
Tejun Heob4b90a82016-12-27 14:49:04 -05004725 return cgroup_procs_next(s, NULL, NULL);
Paul Menagebbcb81d2007-10-18 23:39:32 -07004726}
4727
Tejun Heo8cfd8142017-07-21 11:14:51 -04004728static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
4729{
4730 struct cgroup *cgrp = seq_css(s)->cgroup;
4731
4732 /*
4733 * All processes of a threaded subtree belong to the domain cgroup
4734 * of the subtree. Only threads can be distributed across the
4735 * subtree. Reject reads on cgroup.procs in the subtree proper.
4736 * They're always empty anyway.
4737 */
4738 if (cgroup_is_threaded(cgrp))
4739 return ERR_PTR(-EOPNOTSUPP);
4740
4741 return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
4742 CSS_TASK_ITER_THREADED);
4743}
4744
Tejun Heob4b90a82016-12-27 14:49:04 -05004745static int cgroup_procs_show(struct seq_file *s, void *v)
Paul Menagecc31edc2008-10-18 20:28:04 -07004746{
Tejun Heobc2fb7e2017-05-15 09:34:01 -04004747 seq_printf(s, "%d\n", task_pid_vnr(v));
Daniel Lezcano97978e62010-10-27 15:33:35 -07004748 return 0;
4749}
4750
Christian Braunerf3553222020-02-05 14:26:21 +01004751static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
4752{
4753 int ret;
4754 struct inode *inode;
4755
4756 lockdep_assert_held(&cgroup_mutex);
4757
4758 inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
4759 if (!inode)
4760 return -ENOMEM;
4761
Christian Brauner47291ba2021-01-21 14:19:24 +01004762 ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
Christian Braunerf3553222020-02-05 14:26:21 +01004763 iput(inode);
4764 return ret;
4765}
4766
Tejun Heo715c8092017-05-15 09:34:00 -04004767static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
4768 struct cgroup *dst_cgrp,
4769 struct super_block *sb)
4770{
4771 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
4772 struct cgroup *com_cgrp = src_cgrp;
Tejun Heo715c8092017-05-15 09:34:00 -04004773 int ret;
4774
4775 lockdep_assert_held(&cgroup_mutex);
4776
4777 /* find the common ancestor */
4778 while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
4779 com_cgrp = cgroup_parent(com_cgrp);
4780
4781 /* %current should be authorized to migrate to the common ancestor */
Christian Braunerf3553222020-02-05 14:26:21 +01004782 ret = cgroup_may_write(com_cgrp, sb);
Tejun Heo715c8092017-05-15 09:34:00 -04004783 if (ret)
4784 return ret;
4785
4786 /*
4787 * If namespaces are delegation boundaries, %current must be able
4788 * to see both source and destination cgroups from its namespace.
4789 */
4790 if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
4791 (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
4792 !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
4793 return -ENOENT;
4794
4795 return 0;
4796}
4797
Christian Brauner6df970e2020-02-05 14:26:18 +01004798static int cgroup_attach_permissions(struct cgroup *src_cgrp,
4799 struct cgroup *dst_cgrp,
4800 struct super_block *sb, bool threadgroup)
4801{
4802 int ret = 0;
4803
4804 ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
4805 if (ret)
4806 return ret;
4807
4808 ret = cgroup_migrate_vet_dst(dst_cgrp);
4809 if (ret)
4810 return ret;
4811
4812 if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
4813 ret = -EOPNOTSUPP;
4814
4815 return ret;
4816}
4817
Michal Koutnýda708622021-01-14 13:44:27 +01004818static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
4819 bool threadgroup)
Tejun Heo715c8092017-05-15 09:34:00 -04004820{
4821 struct cgroup *src_cgrp, *dst_cgrp;
4822 struct task_struct *task;
4823 ssize_t ret;
Michal Koutný9a3284f2019-10-04 12:57:40 +02004824 bool locked;
Tejun Heo715c8092017-05-15 09:34:00 -04004825
4826 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
4827 if (!dst_cgrp)
4828 return -ENODEV;
4829
Michal Koutnýda708622021-01-14 13:44:27 +01004830 task = cgroup_procs_write_start(buf, threadgroup, &locked);
Tejun Heo715c8092017-05-15 09:34:00 -04004831 ret = PTR_ERR_OR_ZERO(task);
4832 if (ret)
4833 goto out_unlock;
4834
4835 /* find the source cgroup */
4836 spin_lock_irq(&css_set_lock);
4837 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
4838 spin_unlock_irq(&css_set_lock);
4839
Michal Koutnýda708622021-01-14 13:44:27 +01004840	/* process and thread migrations follow the same delegation rule */
Christian Brauner6df970e2020-02-05 14:26:18 +01004841 ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
Michal Koutnýda708622021-01-14 13:44:27 +01004842 of->file->f_path.dentry->d_sb, threadgroup);
Tejun Heo715c8092017-05-15 09:34:00 -04004843 if (ret)
4844 goto out_finish;
4845
Michal Koutnýda708622021-01-14 13:44:27 +01004846 ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
Tejun Heo715c8092017-05-15 09:34:00 -04004847
4848out_finish:
Michal Koutný9a3284f2019-10-04 12:57:40 +02004849 cgroup_procs_write_finish(task, locked);
Tejun Heo715c8092017-05-15 09:34:00 -04004850out_unlock:
4851 cgroup_kn_unlock(of->kn);
4852
Michal Koutnýda708622021-01-14 13:44:27 +01004853 return ret;
4854}
4855
4856static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
4857 char *buf, size_t nbytes, loff_t off)
4858{
4859 return __cgroup_procs_write(of, buf, true) ?: nbytes;
Tejun Heo715c8092017-05-15 09:34:00 -04004860}
4861
Tejun Heo8cfd8142017-07-21 11:14:51 -04004862static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
4863{
4864 return __cgroup_procs_start(s, pos, 0);
4865}
4866
4867static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
4868 char *buf, size_t nbytes, loff_t off)
4869{
Michal Koutnýda708622021-01-14 13:44:27 +01004870 return __cgroup_procs_write(of, buf, false) ?: nbytes;
Tejun Heo8cfd8142017-07-21 11:14:51 -04004871}
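
/*
 * Illustrative sketch (not part of the kernel): what the write handlers
 * above look like from userspace on cgroup2. Writing a PID to
 * cgroup.procs migrates a whole thread group; writing a TID to
 * cgroup.threads moves a single thread, and cgroup_attach_permissions()
 * rejects cross-domain thread moves with EOPNOTSUPP. The mount point and
 * group names below are hypothetical.
 */
#if 0 /* userspace example, not kernel code */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int move_task(const char *cgrp_dir, const char *file, pid_t id)
{
	char path[256], buf[32];
	int fd, n, ret;

	snprintf(path, sizeof(path), "%s/%s", cgrp_dir, file);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	n = snprintf(buf, sizeof(buf), "%d", id);
	ret = (write(fd, buf, n) == n) ? 0 : -1;	/* EOPNOTSUPP, EACCES, ... */
	close(fd);
	return ret;
}

int main(void)
{
	/* migrate the whole thread group */
	move_task("/sys/fs/cgroup/mygrp", "cgroup.procs", getpid());
	/* move only the calling thread within its threaded domain */
	move_task("/sys/fs/cgroup/mygrp/t1", "cgroup.threads", gettid());
	return 0;
}
#endif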
4872
Tejun Heoa14c6872014-07-15 11:05:09 -04004873/* cgroup core interface files for the default hierarchy */
Tejun Heod62beb72016-12-27 14:49:08 -05004874static struct cftype cgroup_base_files[] = {
Tejun Heoa14c6872014-07-15 11:05:09 -04004875 {
Tejun Heo8cfd8142017-07-21 11:14:51 -04004876 .name = "cgroup.type",
4877 .flags = CFTYPE_NOT_ON_ROOT,
4878 .seq_show = cgroup_type_show,
4879 .write = cgroup_type_write,
4880 },
4881 {
Tejun Heoa14c6872014-07-15 11:05:09 -04004882 .name = "cgroup.procs",
Tejun Heo5136f632017-06-27 14:30:28 -04004883 .flags = CFTYPE_NS_DELEGATABLE,
Tejun Heo6f60ead2015-09-18 17:54:23 -04004884 .file_offset = offsetof(struct cgroup, procs_file),
Tejun Heob4b90a82016-12-27 14:49:04 -05004885 .release = cgroup_procs_release,
4886 .seq_start = cgroup_procs_start,
4887 .seq_next = cgroup_procs_next,
4888 .seq_show = cgroup_procs_show,
Tejun Heoa14c6872014-07-15 11:05:09 -04004889 .write = cgroup_procs_write,
Tejun Heoa14c6872014-07-15 11:05:09 -04004890 },
4891 {
Tejun Heo8cfd8142017-07-21 11:14:51 -04004892 .name = "cgroup.threads",
Roman Gushchin4f584242018-01-10 04:35:12 -08004893 .flags = CFTYPE_NS_DELEGATABLE,
Tejun Heo8cfd8142017-07-21 11:14:51 -04004894 .release = cgroup_procs_release,
4895 .seq_start = cgroup_threads_start,
4896 .seq_next = cgroup_procs_next,
4897 .seq_show = cgroup_procs_show,
4898 .write = cgroup_threads_write,
4899 },
4900 {
Tejun Heoa14c6872014-07-15 11:05:09 -04004901 .name = "cgroup.controllers",
Tejun Heoa14c6872014-07-15 11:05:09 -04004902 .seq_show = cgroup_controllers_show,
4903 },
4904 {
4905 .name = "cgroup.subtree_control",
Tejun Heo5136f632017-06-27 14:30:28 -04004906 .flags = CFTYPE_NS_DELEGATABLE,
Tejun Heoa14c6872014-07-15 11:05:09 -04004907 .seq_show = cgroup_subtree_control_show,
4908 .write = cgroup_subtree_control_write,
4909 },
4910 {
Tejun Heo4a07c222015-09-18 17:54:22 -04004911 .name = "cgroup.events",
Tejun Heoa14c6872014-07-15 11:05:09 -04004912 .flags = CFTYPE_NOT_ON_ROOT,
Tejun Heo6f60ead2015-09-18 17:54:23 -04004913 .file_offset = offsetof(struct cgroup, events_file),
Tejun Heo4a07c222015-09-18 17:54:22 -04004914 .seq_show = cgroup_events_show,
Tejun Heoa14c6872014-07-15 11:05:09 -04004915 },
Roman Gushchin1a926e02017-07-28 18:28:44 +01004916 {
4917 .name = "cgroup.max.descendants",
4918 .seq_show = cgroup_max_descendants_show,
4919 .write = cgroup_max_descendants_write,
4920 },
4921 {
4922 .name = "cgroup.max.depth",
4923 .seq_show = cgroup_max_depth_show,
4924 .write = cgroup_max_depth_write,
4925 },
Roman Gushchinec392252017-08-02 17:55:31 +01004926 {
4927 .name = "cgroup.stat",
Tejun Heo3e489302017-08-11 05:49:01 -07004928 .seq_show = cgroup_stat_show,
Roman Gushchinec392252017-08-02 17:55:31 +01004929 },
Tejun Heod41bf8c2017-10-23 16:18:27 -07004930 {
Roman Gushchin76f969e2019-04-19 10:03:04 -07004931 .name = "cgroup.freeze",
4932 .flags = CFTYPE_NOT_ON_ROOT,
4933 .seq_show = cgroup_freeze_show,
4934 .write = cgroup_freeze_write,
4935 },
4936 {
Christian Brauner661ee622021-05-08 14:15:38 +02004937 .name = "cgroup.kill",
4938 .flags = CFTYPE_NOT_ON_ROOT,
4939 .write = cgroup_kill_write,
4940 },
4941 {
Tejun Heod41bf8c2017-10-23 16:18:27 -07004942 .name = "cpu.stat",
Tejun Heod41bf8c2017-10-23 16:18:27 -07004943 .seq_show = cpu_stat_show,
4944 },
Johannes Weiner2ce71352018-10-26 15:06:31 -07004945#ifdef CONFIG_PSI
4946 {
4947 .name = "io.pressure",
Johannes Weiner2ce71352018-10-26 15:06:31 -07004948 .seq_show = cgroup_io_pressure_show,
Suren Baghdasaryan0e946822019-05-14 15:41:15 -07004949 .write = cgroup_io_pressure_write,
4950 .poll = cgroup_pressure_poll,
4951 .release = cgroup_pressure_release,
Johannes Weiner2ce71352018-10-26 15:06:31 -07004952 },
4953 {
4954 .name = "memory.pressure",
Johannes Weiner2ce71352018-10-26 15:06:31 -07004955 .seq_show = cgroup_memory_pressure_show,
Suren Baghdasaryan0e946822019-05-14 15:41:15 -07004956 .write = cgroup_memory_pressure_write,
4957 .poll = cgroup_pressure_poll,
4958 .release = cgroup_pressure_release,
Johannes Weiner2ce71352018-10-26 15:06:31 -07004959 },
4960 {
4961 .name = "cpu.pressure",
Johannes Weiner2ce71352018-10-26 15:06:31 -07004962 .seq_show = cgroup_cpu_pressure_show,
Suren Baghdasaryan0e946822019-05-14 15:41:15 -07004963 .write = cgroup_cpu_pressure_write,
4964 .poll = cgroup_pressure_poll,
4965 .release = cgroup_pressure_release,
Johannes Weiner2ce71352018-10-26 15:06:31 -07004966 },
Suren Baghdasaryan0e946822019-05-14 15:41:15 -07004967#endif /* CONFIG_PSI */
Tejun Heoa14c6872014-07-15 11:05:09 -04004968 { } /* terminate */
4969};
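
/*
 * Illustrative sketch (not part of the kernel): minimal userspace use of
 * the base files declared above, reading cgroup.controllers and enabling
 * controllers in cgroup.subtree_control. The controller names are only
 * an assumption about the running configuration.
 */
#if 0 /* userspace example, not kernel code */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[256];
	ssize_t n;
	int fd;

	fd = open("/sys/fs/cgroup/cgroup.controllers", O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n > 0) {
		buf[n] = '\0';
		printf("available: %s", buf);	/* e.g. "cpu io memory" */
	}

	/* distribute cpu and memory to children of the root */
	fd = open("/sys/fs/cgroup/cgroup.subtree_control", O_WRONLY);
	if (fd < 0)
		return 1;
	write(fd, "+cpu +memory", strlen("+cpu +memory"));
	close(fd);
	return 0;
}
#endif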
4970
Tejun Heo0c21ead2013-08-13 20:22:51 -04004971/*
4972 * css destruction is a four-stage process.
4973 *
4974 * 1. Destruction starts. Killing of the percpu_ref is initiated.
4975 * Implemented in kill_css().
4976 *
4977 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
Tejun Heoec903c02014-05-13 12:11:01 -04004978 * and thus css_tryget_online() is guaranteed to fail, the css can be
4979 * offlined by invoking offline_css(). After offlining, the base ref is
4980 * put. Implemented in css_killed_work_fn().
Tejun Heo0c21ead2013-08-13 20:22:51 -04004981 *
4982 * 3. When the percpu_ref reaches zero, the only possible remaining
4983 * accessors are inside RCU read sections. css_release() schedules the
4984 * RCU callback.
4985 *
4986 * 4. After the grace period, the css can be freed. Implemented in
4987 * css_free_work_fn().
4988 *
4989 * It is actually hairier because both steps 2 and 4 require process
4990 * context and thus involve punting to css->destroy_work, adding two
4991 * additional steps to the already complex sequence.
4992 */
Tejun Heo8f36aae2018-03-14 12:45:14 -07004993static void css_free_rwork_fn(struct work_struct *work)
Tejun Heo48ddbe12012-04-01 12:09:56 -07004994{
Tejun Heo8f36aae2018-03-14 12:45:14 -07004995 struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
4996 struct cgroup_subsys_state, destroy_rwork);
Vladimir Davydov01e58652015-02-12 14:59:26 -08004997 struct cgroup_subsys *ss = css->ss;
Tejun Heo0c21ead2013-08-13 20:22:51 -04004998 struct cgroup *cgrp = css->cgroup;
Tejun Heo48ddbe12012-04-01 12:09:56 -07004999
Tejun Heo9a1049d2014-06-28 08:10:14 -04005000 percpu_ref_exit(&css->refcnt);
5001
Vladimir Davydov01e58652015-02-12 14:59:26 -08005002 if (ss) {
Tejun Heo9d755d32014-05-14 09:15:02 -04005003 /* css free path */
Tejun Heo8bb5ef72016-01-21 15:32:15 -05005004 struct cgroup_subsys_state *parent = css->parent;
Vladimir Davydov01e58652015-02-12 14:59:26 -08005005 int id = css->id;
5006
Vladimir Davydov01e58652015-02-12 14:59:26 -08005007 ss->css_free(css);
5008 cgroup_idr_remove(&ss->css_idr, id);
Tejun Heo9d755d32014-05-14 09:15:02 -04005009 cgroup_put(cgrp);
Tejun Heo8bb5ef72016-01-21 15:32:15 -05005010
5011 if (parent)
5012 css_put(parent);
Tejun Heo9d755d32014-05-14 09:15:02 -04005013 } else {
5014 /* cgroup free path */
5015 atomic_dec(&cgrp->root->nr_cgrps);
Tejun Heod62beb72016-12-27 14:49:08 -05005016 cgroup1_pidlist_destroy_all(cgrp);
Zefan Li971ff492014-09-18 16:06:19 +08005017 cancel_work_sync(&cgrp->release_agent_work);
Tejun Heo9d755d32014-05-14 09:15:02 -04005018
Tejun Heod51f39b2014-05-16 13:22:48 -04005019 if (cgroup_parent(cgrp)) {
Tejun Heo9d755d32014-05-14 09:15:02 -04005020 /*
5021 * We get a ref to the parent, and put the ref when
5022 * this cgroup is being freed, so it's guaranteed
5023 * that the parent won't be destroyed before its
5024 * children.
5025 */
Tejun Heod51f39b2014-05-16 13:22:48 -04005026 cgroup_put(cgroup_parent(cgrp));
Tejun Heo9d755d32014-05-14 09:15:02 -04005027 kernfs_put(cgrp->kn);
Johannes Weiner2ce71352018-10-26 15:06:31 -07005028 psi_cgroup_free(cgrp);
Johannes Weinera7df69b2021-04-29 22:56:20 -07005029 cgroup_rstat_exit(cgrp);
Tejun Heo9d755d32014-05-14 09:15:02 -04005030 kfree(cgrp);
5031 } else {
5032 /*
5033 * This is root cgroup's refcnt reaching zero,
5034 * which indicates that the root should be
5035 * released.
5036 */
5037 cgroup_destroy_root(cgrp->root);
5038 }
5039 }
Tejun Heo0c21ead2013-08-13 20:22:51 -04005040}
5041
Tejun Heo25e15d82014-05-14 09:15:02 -04005042static void css_release_work_fn(struct work_struct *work)
Tejun Heod3daf282013-06-13 19:39:16 -07005043{
5044 struct cgroup_subsys_state *css =
Tejun Heo25e15d82014-05-14 09:15:02 -04005045 container_of(work, struct cgroup_subsys_state, destroy_work);
Tejun Heo15a4c832014-05-04 15:09:14 -04005046 struct cgroup_subsys *ss = css->ss;
Tejun Heo9d755d32014-05-14 09:15:02 -04005047 struct cgroup *cgrp = css->cgroup;
Tejun Heod3daf282013-06-13 19:39:16 -07005048
Tejun Heo1fed1b22014-05-16 13:22:49 -04005049 mutex_lock(&cgroup_mutex);
5050
Tejun Heode3f0342014-05-16 13:22:49 -04005051 css->flags |= CSS_RELEASED;
Tejun Heo1fed1b22014-05-16 13:22:49 -04005052 list_del_rcu(&css->sibling);
5053
Tejun Heo9d755d32014-05-14 09:15:02 -04005054 if (ss) {
5055 /* css release path */
Tejun Heo8f534702018-04-26 14:29:05 -07005056 if (!list_empty(&css->rstat_css_node)) {
5057 cgroup_rstat_flush(cgrp);
5058 list_del_rcu(&css->rstat_css_node);
5059 }
5060
Vladimir Davydov01e58652015-02-12 14:59:26 -08005061 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
Tejun Heo7d172cc2014-11-18 02:49:51 -05005062 if (ss->css_released)
5063 ss->css_released(css);
Tejun Heo9d755d32014-05-14 09:15:02 -04005064 } else {
Roman Gushchin0679dee2017-08-02 17:55:29 +01005065 struct cgroup *tcgrp;
5066
Tejun Heo9d755d32014-05-14 09:15:02 -04005067 /* cgroup release path */
Steven Rostedt (VMware)e4f8d812018-07-09 17:48:54 -04005068 TRACE_CGROUP_PATH(release, cgrp);
Tejun Heoed1777d2016-08-10 11:23:44 -04005069
Johannes Weinera7df69b2021-04-29 22:56:20 -07005070 cgroup_rstat_flush(cgrp);
Tejun Heo041cd642017-09-25 08:12:05 -07005071
Roman Gushchin4dcabec2019-04-19 10:03:03 -07005072 spin_lock_irq(&css_set_lock);
Roman Gushchin0679dee2017-08-02 17:55:29 +01005073 for (tcgrp = cgroup_parent(cgrp); tcgrp;
5074 tcgrp = cgroup_parent(tcgrp))
5075 tcgrp->nr_dying_descendants--;
Roman Gushchin4dcabec2019-04-19 10:03:03 -07005076 spin_unlock_irq(&css_set_lock);
Roman Gushchin0679dee2017-08-02 17:55:29 +01005077
Li Zefana4189482014-09-04 14:43:07 +08005078 /*
5079 * There are two control paths which try to determine
5080 * cgroup from dentry without going through kernfs -
5081 * cgroupstats_build() and css_tryget_online_from_dir().
5082 * Those are supported by RCU protecting clearing of
5083 * cgrp->kn->priv backpointer.
5084 */
Tejun Heo6cd0f5b2016-03-03 09:57:58 -05005085 if (cgrp->kn)
5086 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
5087 NULL);
Tejun Heo9d755d32014-05-14 09:15:02 -04005088 }
Tejun Heo15a4c832014-05-04 15:09:14 -04005089
Tejun Heo1fed1b22014-05-16 13:22:49 -04005090 mutex_unlock(&cgroup_mutex);
5091
Tejun Heo8f36aae2018-03-14 12:45:14 -07005092 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5093 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
Tejun Heod3daf282013-06-13 19:39:16 -07005094}
5095
Tejun Heo48ddbe12012-04-01 12:09:56 -07005096static void css_release(struct percpu_ref *ref)
5097{
5098 struct cgroup_subsys_state *css =
5099 container_of(ref, struct cgroup_subsys_state, refcnt);
Tejun Heo5db9a4d2012-07-07 16:08:18 -07005100
Tejun Heo25e15d82014-05-14 09:15:02 -04005101 INIT_WORK(&css->destroy_work, css_release_work_fn);
5102 queue_work(cgroup_destroy_wq, &css->destroy_work);
Tejun Heo48ddbe12012-04-01 12:09:56 -07005103}
5104
Tejun Heoddfcada2014-05-04 15:09:14 -04005105static void init_and_link_css(struct cgroup_subsys_state *css,
5106 struct cgroup_subsys *ss, struct cgroup *cgrp)
Paul Menageddbcc7e2007-10-18 23:39:30 -07005107{
Tejun Heo0cb51d72014-05-16 13:22:49 -04005108 lockdep_assert_held(&cgroup_mutex);
5109
Tejun Heoa590b902017-04-28 15:14:55 -04005110 cgroup_get_live(cgrp);
Tejun Heoddfcada2014-05-04 15:09:14 -04005111
Tejun Heod5c419b2014-05-16 13:22:48 -04005112 memset(css, 0, sizeof(*css));
Paul Menagebd89aab2007-10-18 23:40:44 -07005113 css->cgroup = cgrp;
Tejun Heo72c97e52013-08-08 20:11:22 -04005114 css->ss = ss;
Tejun Heo8fa3b8d2016-05-26 15:42:13 -04005115 css->id = -1;
Tejun Heod5c419b2014-05-16 13:22:48 -04005116 INIT_LIST_HEAD(&css->sibling);
5117 INIT_LIST_HEAD(&css->children);
Tejun Heo8f534702018-04-26 14:29:05 -07005118 INIT_LIST_HEAD(&css->rstat_css_node);
Tejun Heo0cb51d72014-05-16 13:22:49 -04005119 css->serial_nr = css_serial_nr_next++;
Tejun Heoaa226ff2016-01-21 15:31:11 -05005120 atomic_set(&css->online_cnt, 0);
Tejun Heo48ddbe12012-04-01 12:09:56 -07005121
Tejun Heod51f39b2014-05-16 13:22:48 -04005122 if (cgroup_parent(cgrp)) {
5123 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
Tejun Heoddfcada2014-05-04 15:09:14 -04005124 css_get(css->parent);
Tejun Heoddfcada2014-05-04 15:09:14 -04005125 }
Tejun Heo0ae78e02013-08-13 11:01:54 -04005126
Johannes Weinera7df69b2021-04-29 22:56:20 -07005127 if (ss->css_rstat_flush)
Tejun Heo8f534702018-04-26 14:29:05 -07005128 list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
5129
Tejun Heoca8bdca2013-08-26 18:40:56 -04005130 BUG_ON(cgroup_css(cgrp, ss));
Paul Menageddbcc7e2007-10-18 23:39:30 -07005131}
5132
Li Zefan2a4ac632013-07-31 16:16:40 +08005133/* invoke ->css_online() on a new CSS and mark it online if successful */
Tejun Heo623f9262013-08-13 11:01:55 -04005134static int online_css(struct cgroup_subsys_state *css)
Tejun Heoa31f2d32012-11-19 08:13:37 -08005135{
Tejun Heo623f9262013-08-13 11:01:55 -04005136 struct cgroup_subsys *ss = css->ss;
Tejun Heob1929db2012-11-19 08:13:38 -08005137 int ret = 0;
5138
Tejun Heoa31f2d32012-11-19 08:13:37 -08005139 lockdep_assert_held(&cgroup_mutex);
5140
Tejun Heo92fb9742012-11-19 08:13:38 -08005141 if (ss->css_online)
Tejun Heoeb954192013-08-08 20:11:23 -04005142 ret = ss->css_online(css);
Tejun Heoae7f1642013-08-13 20:22:50 -04005143 if (!ret) {
Tejun Heoeb954192013-08-08 20:11:23 -04005144 css->flags |= CSS_ONLINE;
Tejun Heoaec25022014-02-08 10:36:58 -05005145 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
Tejun Heoaa226ff2016-01-21 15:31:11 -05005146
5147 atomic_inc(&css->online_cnt);
5148 if (css->parent)
5149 atomic_inc(&css->parent->online_cnt);
Tejun Heoae7f1642013-08-13 20:22:50 -04005150 }
Tejun Heob1929db2012-11-19 08:13:38 -08005151 return ret;
Tejun Heoa31f2d32012-11-19 08:13:37 -08005152}
5153
Li Zefan2a4ac632013-07-31 16:16:40 +08005154/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
Tejun Heo623f9262013-08-13 11:01:55 -04005155static void offline_css(struct cgroup_subsys_state *css)
Tejun Heoa31f2d32012-11-19 08:13:37 -08005156{
Tejun Heo623f9262013-08-13 11:01:55 -04005157 struct cgroup_subsys *ss = css->ss;
Tejun Heoa31f2d32012-11-19 08:13:37 -08005158
5159 lockdep_assert_held(&cgroup_mutex);
5160
5161 if (!(css->flags & CSS_ONLINE))
5162 return;
5163
Li Zefand7eeac12013-03-12 15:35:59 -07005164 if (ss->css_offline)
Tejun Heoeb954192013-08-08 20:11:23 -04005165 ss->css_offline(css);
Tejun Heoa31f2d32012-11-19 08:13:37 -08005166
Tejun Heoeb954192013-08-08 20:11:23 -04005167 css->flags &= ~CSS_ONLINE;
Tejun Heoe3297802014-04-23 11:13:15 -04005168 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
Tejun Heof8f22e52014-04-23 11:13:16 -04005169
5170 wake_up_all(&css->cgroup->offline_waitq);
Tejun Heoa31f2d32012-11-19 08:13:37 -08005171}
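
/*
 * Illustrative sketch (not part of the kernel build): the hooks that
 * online_css()/offline_css() above invoke, as a minimal controller
 * skeleton. A real controller must also be enumerated via SUBSYS() in
 * linux/cgroup_subsys.h; the "demo" controller here is hypothetical.
 */
#if 0 /* sketch only */
#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/slab.h>

struct demo_css {
	struct cgroup_subsys_state css;
};

static struct cgroup_subsys_state *
demo_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct demo_css *dc = kzalloc(sizeof(*dc), GFP_KERNEL);

	return dc ? &dc->css : ERR_PTR(-ENOMEM);
}

static int demo_css_online(struct cgroup_subsys_state *css)
{
	/* css is now visible to css_tryget_online() callers */
	return 0;
}

static void demo_css_offline(struct cgroup_subsys_state *css)
{
	/* css_tryget_online() fails from here on; drain internal refs */
}

static void demo_css_free(struct cgroup_subsys_state *css)
{
	kfree(container_of(css, struct demo_css, css));
}

struct cgroup_subsys demo_cgrp_subsys = {
	.css_alloc	= demo_css_alloc,
	.css_online	= demo_css_online,
	.css_offline	= demo_css_offline,
	.css_free	= demo_css_free,
};
#endif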
5172
Tejun Heoc81c925a2013-12-06 15:11:56 -05005173/**
Tejun Heo6cd0f5b2016-03-03 09:57:58 -05005174 * css_create - create a cgroup_subsys_state
Tejun Heoc81c925a2013-12-06 15:11:56 -05005175 * @cgrp: the cgroup new css will be associated with
5176 * @ss: the subsys of new css
5177 *
5178 * Create a new css associated with @cgrp - @ss pair. On success, the new
Tejun Heo6cd0f5b2016-03-03 09:57:58 -05005179 * css is online and installed in @cgrp. This function doesn't create the
5180 * interface files. Returns the new css on success or ERR_PTR(-errno) on failure.
Tejun Heoc81c925a2013-12-06 15:11:56 -05005181 */
Tejun Heo6cd0f5b2016-03-03 09:57:58 -05005182static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5183 struct cgroup_subsys *ss)
Tejun Heoc81c925a2013-12-06 15:11:56 -05005184{
Tejun Heod51f39b2014-05-16 13:22:48 -04005185 struct cgroup *parent = cgroup_parent(cgrp);
Tejun Heo1fed1b22014-05-16 13:22:49 -04005186 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
Tejun Heoc81c925a2013-12-06 15:11:56 -05005187 struct cgroup_subsys_state *css;
5188 int err;
5189
Tejun Heoc81c925a2013-12-06 15:11:56 -05005190 lockdep_assert_held(&cgroup_mutex);
5191
Tejun Heo1fed1b22014-05-16 13:22:49 -04005192 css = ss->css_alloc(parent_css);
Tejun Heoe7e15b82016-06-21 13:06:24 -04005193 if (!css)
5194 css = ERR_PTR(-ENOMEM);
Tejun Heoc81c925a2013-12-06 15:11:56 -05005195 if (IS_ERR(css))
Tejun Heo6cd0f5b2016-03-03 09:57:58 -05005196 return css;
Tejun Heoc81c925a2013-12-06 15:11:56 -05005197
Tejun Heoddfcada2014-05-04 15:09:14 -04005198 init_and_link_css(css, ss, cgrp);
Tejun Heoa2bed822014-05-04 15:09:14 -04005199
Tejun Heo2aad2a82014-09-24 13:31:50 -04005200 err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
Tejun Heoc81c925a2013-12-06 15:11:56 -05005201 if (err)
Li Zefan3eb59ec2014-03-18 17:02:36 +08005202 goto err_free_css;
Tejun Heoc81c925a2013-12-06 15:11:56 -05005203
Vladimir Davydovcf780b72015-08-03 15:32:26 +03005204 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
Tejun Heo15a4c832014-05-04 15:09:14 -04005205 if (err < 0)
Wenwei Taob00c52d2016-05-13 22:59:20 +08005206 goto err_free_css;
Tejun Heo15a4c832014-05-04 15:09:14 -04005207 css->id = err;
Tejun Heoc81c925a2013-12-06 15:11:56 -05005208
Tejun Heo15a4c832014-05-04 15:09:14 -04005209 /* @css is ready to be brought online now, make it visible */
Tejun Heo1fed1b22014-05-16 13:22:49 -04005210 list_add_tail_rcu(&css->sibling, &parent_css->children);
Tejun Heo15a4c832014-05-04 15:09:14 -04005211 cgroup_idr_replace(&ss->css_idr, css, css->id);
Tejun Heoc81c925a2013-12-06 15:11:56 -05005212
5213 err = online_css(css);
5214 if (err)
Tejun Heo1fed1b22014-05-16 13:22:49 -04005215 goto err_list_del;
Tejun Heo94419622014-03-19 10:23:54 -04005216
Tejun Heo6cd0f5b2016-03-03 09:57:58 -05005217 return css;
Tejun Heoc81c925a2013-12-06 15:11:56 -05005218
Tejun Heo1fed1b22014-05-16 13:22:49 -04005219err_list_del:
5220 list_del_rcu(&css->sibling);
Li Zefan3eb59ec2014-03-18 17:02:36 +08005221err_free_css:
Tejun Heo8f534702018-04-26 14:29:05 -07005222 list_del_rcu(&css->rstat_css_node);
Tejun Heo8f36aae2018-03-14 12:45:14 -07005223 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5224 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
Tejun Heo6cd0f5b2016-03-03 09:57:58 -05005225 return ERR_PTR(err);
Tejun Heoc81c925a2013-12-06 15:11:56 -05005226}
5227
Tejun Heo07cd1292017-01-26 16:47:28 -05005228/*
5229 * The returned cgroup is fully initialized including its control mask, but
5230 * it isn't associated with its kernfs_node and doesn't have the control
5231 * mask applied.
5232 */
Tejun Heo74321032019-11-04 15:54:30 -08005233static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
5234 umode_t mode)
Paul Menageddbcc7e2007-10-18 23:39:30 -07005235{
Tejun Heoa5bca212016-03-03 09:57:58 -05005236 struct cgroup_root *root = parent->root;
Tejun Heoa5bca212016-03-03 09:57:58 -05005237 struct cgroup *cgrp, *tcgrp;
Tejun Heo74321032019-11-04 15:54:30 -08005238 struct kernfs_node *kn;
Tejun Heoa5bca212016-03-03 09:57:58 -05005239 int level = parent->level + 1;
Tejun Heo03970d32016-03-03 09:58:00 -05005240 int ret;
Paul Menageddbcc7e2007-10-18 23:39:30 -07005241
Tejun Heo0a950f62012-11-19 09:02:12 -08005242 /* allocate the cgroup and its ID, 0 is reserved for the root */
Kees Cookacafe7e2018-05-08 13:45:50 -07005243 cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
5244 GFP_KERNEL);
Tejun Heoa5bca212016-03-03 09:57:58 -05005245 if (!cgrp)
5246 return ERR_PTR(-ENOMEM);
Li Zefan0ab02ca2014-02-11 16:05:46 +08005247
Tejun Heo2aad2a82014-09-24 13:31:50 -04005248 ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
Tejun Heo9d755d32014-05-14 09:15:02 -04005249 if (ret)
5250 goto out_free_cgrp;
5251
Johannes Weinera7df69b2021-04-29 22:56:20 -07005252 ret = cgroup_rstat_init(cgrp);
5253 if (ret)
5254 goto out_cancel_ref;
Tejun Heo041cd642017-09-25 08:12:05 -07005255
Tejun Heo74321032019-11-04 15:54:30 -08005256 /* create the directory */
5257 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5258 if (IS_ERR(kn)) {
5259 ret = PTR_ERR(kn);
Tejun Heo041cd642017-09-25 08:12:05 -07005260 goto out_stat_exit;
Tejun Heo976c06b2012-11-05 09:16:59 -08005261 }
Tejun Heo74321032019-11-04 15:54:30 -08005262 cgrp->kn = kn;
Tejun Heo976c06b2012-11-05 09:16:59 -08005263
Paul Menagecc31edc2008-10-18 20:28:04 -07005264 init_cgroup_housekeeping(cgrp);
Paul Menageddbcc7e2007-10-18 23:39:30 -07005265
Tejun Heo9d800df2014-05-14 09:15:00 -04005266 cgrp->self.parent = &parent->self;
Tejun Heoba0f4d72014-05-13 12:19:22 -04005267 cgrp->root = root;
Tejun Heob11cfb52015-11-20 15:55:52 -05005268 cgrp->level = level;
Johannes Weiner2ce71352018-10-26 15:06:31 -07005269
5270 ret = psi_cgroup_alloc(cgrp);
Alexei Starovoitov324bda9e62017-10-02 22:50:21 -07005271 if (ret)
Tejun Heo74321032019-11-04 15:54:30 -08005272 goto out_kernfs_remove;
Tejun Heob11cfb52015-11-20 15:55:52 -05005273
Johannes Weiner2ce71352018-10-26 15:06:31 -07005274 ret = cgroup_bpf_inherit(cgrp);
5275 if (ret)
5276 goto out_psi_free;
5277
Roman Gushchin76f969e2019-04-19 10:03:04 -07005278 /*
5279 * New cgroup inherits effective freeze counter, and
5280 * if the parent has to be frozen, the child has too.
5281 */
5282 cgrp->freezer.e_freeze = parent->freezer.e_freeze;
Roman Gushchin97a61362019-09-12 10:56:45 -07005283 if (cgrp->freezer.e_freeze) {
5284 /*
5285 * Set the CGRP_FREEZE flag, so when a process will be
5286 * attached to the child cgroup, it will become frozen.
5287 * At this point the new cgroup is unpopulated, so we can
5288 * consider it frozen immediately.
5289 */
5290 set_bit(CGRP_FREEZE, &cgrp->flags);
Roman Gushchin76f969e2019-04-19 10:03:04 -07005291 set_bit(CGRP_FROZEN, &cgrp->flags);
Roman Gushchin97a61362019-09-12 10:56:45 -07005292 }
Roman Gushchin76f969e2019-04-19 10:03:04 -07005293
Roman Gushchin4dcabec2019-04-19 10:03:03 -07005294 spin_lock_irq(&css_set_lock);
Roman Gushchin0679dee2017-08-02 17:55:29 +01005295 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
Tejun Heo74321032019-11-04 15:54:30 -08005296 cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
Paul Menageddbcc7e2007-10-18 23:39:30 -07005297
Roman Gushchin76f969e2019-04-19 10:03:04 -07005298 if (tcgrp != cgrp) {
Roman Gushchin0679dee2017-08-02 17:55:29 +01005299 tcgrp->nr_descendants++;
Roman Gushchin76f969e2019-04-19 10:03:04 -07005300
5301 /*
5302 * If the new cgroup is frozen, all ancestor cgroups
5303 * get a new frozen descendant, but their state can't
5304 * change because of this.
5305 */
5306 if (cgrp->freezer.e_freeze)
5307 tcgrp->freezer.nr_frozen_descendants++;
5308 }
Roman Gushchin0679dee2017-08-02 17:55:29 +01005309 }
Roman Gushchin4dcabec2019-04-19 10:03:03 -07005310 spin_unlock_irq(&css_set_lock);
Roman Gushchin0679dee2017-08-02 17:55:29 +01005311
Li Zefanb6abdb02008-03-04 14:28:19 -08005312 if (notify_on_release(parent))
5313 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
5314
Tejun Heo2260e7f2012-11-19 08:13:38 -08005315 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
5316 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
Daniel Lezcano97978e62010-10-27 15:33:35 -07005317
Tejun Heo0cb51d72014-05-16 13:22:49 -04005318 cgrp->self.serial_nr = css_serial_nr_next++;
Tejun Heo53fa5262013-05-24 10:55:38 +09005319
Tejun Heo4e139af2012-11-19 08:13:36 -08005320 /* allocation complete, commit to creation */
Tejun Heod5c419b2014-05-16 13:22:48 -04005321 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
Tejun Heo3c9c8252014-02-12 09:29:50 -05005322 atomic_inc(&root->nr_cgrps);
Tejun Heoa590b902017-04-28 15:14:55 -04005323 cgroup_get_live(parent);
Li Zefan415cf072013-04-08 14:35:02 +08005324
Tejun Heo0d802552013-12-06 15:11:56 -05005325 /*
Tejun Heobd53d612014-04-23 11:13:16 -04005326 * On the default hierarchy, a child doesn't automatically inherit
Tejun Heo667c2492014-07-08 18:02:56 -04005327 * subtree_control from the parent. Each is configured manually.
Tejun Heobd53d612014-04-23 11:13:16 -04005328 */
Tejun Heo03970d32016-03-03 09:58:00 -05005329 if (!cgroup_on_dfl(cgrp))
Tejun Heo5531dc92016-03-03 09:57:58 -05005330 cgrp->subtree_control = cgroup_control(cgrp);
Tejun Heo03970d32016-03-03 09:58:00 -05005331
5332 cgroup_propagate_control(cgrp);
5333
Tejun Heoa5bca212016-03-03 09:57:58 -05005334 return cgrp;
5335
Johannes Weiner2ce71352018-10-26 15:06:31 -07005336out_psi_free:
5337 psi_cgroup_free(cgrp);
Tejun Heo74321032019-11-04 15:54:30 -08005338out_kernfs_remove:
5339 kernfs_remove(cgrp->kn);
Tejun Heo041cd642017-09-25 08:12:05 -07005340out_stat_exit:
Johannes Weinera7df69b2021-04-29 22:56:20 -07005341 cgroup_rstat_exit(cgrp);
Tejun Heoa5bca212016-03-03 09:57:58 -05005342out_cancel_ref:
5343 percpu_ref_exit(&cgrp->self.refcnt);
5344out_free_cgrp:
5345 kfree(cgrp);
5346 return ERR_PTR(ret);
Tejun Heoa5bca212016-03-03 09:57:58 -05005347}
5348
Roman Gushchin1a926e02017-07-28 18:28:44 +01005349static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
5350{
5351 struct cgroup *cgroup;
5352	bool ret = false;
5353 int level = 1;
5354
5355 lockdep_assert_held(&cgroup_mutex);
5356
5357 for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
5358 if (cgroup->nr_descendants >= cgroup->max_descendants)
5359 goto fail;
5360
5361 if (level > cgroup->max_depth)
5362 goto fail;
5363
5364 level++;
5365 }
5366
5367 ret = true;
5368fail:
5369 return ret;
5370}
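
/*
 * Illustrative sketch (not part of the kernel): the limits walked above
 * come from cgroup.max.descendants and cgroup.max.depth, and an mkdir
 * that would exceed them fails with EAGAIN (see cgroup_mkdir() below).
 * The paths are hypothetical.
 */
#if 0 /* userspace example, not kernel code */
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/fs/cgroup/mygrp/cgroup.max.depth", O_WRONLY);

	if (fd < 0)
		return 1;
	write(fd, "1", 1);	/* allow only one level below mygrp */
	close(fd);

	mkdir("/sys/fs/cgroup/mygrp/a", 0755);		/* ok: depth 1 */
	if (mkdir("/sys/fs/cgroup/mygrp/a/b", 0755))	/* depth 2 */
		perror("mkdir");			/* EAGAIN */
	return 0;
}
#endif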
5371
Tejun Heo1592c9b2016-12-27 14:49:08 -05005372int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
Tejun Heoa5bca212016-03-03 09:57:58 -05005373{
5374 struct cgroup *parent, *cgrp;
Tejun Heo03970d32016-03-03 09:58:00 -05005375 int ret;
Tejun Heoa5bca212016-03-03 09:57:58 -05005376
5377 /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
5378 if (strchr(name, '\n'))
5379 return -EINVAL;
5380
Tejun Heo945ba192016-03-03 09:58:00 -05005381 parent = cgroup_kn_lock_live(parent_kn, false);
Tejun Heoa5bca212016-03-03 09:57:58 -05005382 if (!parent)
5383 return -ENODEV;
5384
Roman Gushchin1a926e02017-07-28 18:28:44 +01005385 if (!cgroup_check_hierarchy_limits(parent)) {
5386 ret = -EAGAIN;
5387 goto out_unlock;
5388 }
5389
Tejun Heo74321032019-11-04 15:54:30 -08005390 cgrp = cgroup_create(parent, name, mode);
Tejun Heoa5bca212016-03-03 09:57:58 -05005391 if (IS_ERR(cgrp)) {
5392 ret = PTR_ERR(cgrp);
5393 goto out_unlock;
5394 }
5395
Tejun Heo195e9b6c2016-03-03 09:57:58 -05005396 /*
5397 * This extra ref will be put in css_free_rwork_fn() and guarantees
5398 * that @cgrp->kn is always accessible.
5399 */
Tejun Heo74321032019-11-04 15:54:30 -08005400 kernfs_get(cgrp->kn);
Tejun Heo195e9b6c2016-03-03 09:57:58 -05005401
Tejun Heo74321032019-11-04 15:54:30 -08005402 ret = cgroup_kn_set_ugid(cgrp->kn);
Tejun Heo195e9b6c2016-03-03 09:57:58 -05005403 if (ret)
5404 goto out_destroy;
5405
Tejun Heo334c3672016-03-03 09:58:01 -05005406 ret = css_populate_dir(&cgrp->self);
Tejun Heo195e9b6c2016-03-03 09:57:58 -05005407 if (ret)
5408 goto out_destroy;
5409
Tejun Heo03970d32016-03-03 09:58:00 -05005410 ret = cgroup_apply_control_enable(cgrp);
5411 if (ret)
5412 goto out_destroy;
Tejun Heo195e9b6c2016-03-03 09:57:58 -05005413
Steven Rostedt (VMware)e4f8d812018-07-09 17:48:54 -04005414 TRACE_CGROUP_PATH(mkdir, cgrp);
Tejun Heoed1777d2016-08-10 11:23:44 -04005415
Tejun Heo195e9b6c2016-03-03 09:57:58 -05005416	/* css's are created and onlined, make the new cgroup visible */
Tejun Heo74321032019-11-04 15:54:30 -08005417 kernfs_activate(cgrp->kn);
Tejun Heo2bd59d42014-02-11 11:52:49 -05005418
Tejun Heoba0f4d72014-05-13 12:19:22 -04005419 ret = 0;
5420 goto out_unlock;
Paul Menageddbcc7e2007-10-18 23:39:30 -07005421
Tejun Heoa5bca212016-03-03 09:57:58 -05005422out_destroy:
5423 cgroup_destroy_locked(cgrp);
Tejun Heoba0f4d72014-05-13 12:19:22 -04005424out_unlock:
Tejun Heoa9746d82014-05-13 12:19:22 -04005425 cgroup_kn_unlock(parent_kn);
Tejun Heoe1b2dc12014-03-20 11:10:15 -04005426 return ret;
Paul Menageddbcc7e2007-10-18 23:39:30 -07005427}
5428
Tejun Heo223dbc32013-08-13 20:22:50 -04005429/*
5430 * This is called when the refcnt of a css is confirmed to be killed.
Tejun Heo249f3462014-05-14 09:15:01 -04005431 * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
5432 * initiate destruction and put the css ref from kill_css().
Tejun Heo223dbc32013-08-13 20:22:50 -04005433 */
5434static void css_killed_work_fn(struct work_struct *work)
Tejun Heod3daf282013-06-13 19:39:16 -07005435{
Tejun Heo223dbc32013-08-13 20:22:50 -04005436 struct cgroup_subsys_state *css =
5437 container_of(work, struct cgroup_subsys_state, destroy_work);
Tejun Heod3daf282013-06-13 19:39:16 -07005438
Tejun Heof20104d2013-08-13 20:22:50 -04005439 mutex_lock(&cgroup_mutex);
Tejun Heo09a503ea2013-08-13 20:22:50 -04005440
Tejun Heoaa226ff2016-01-21 15:31:11 -05005441 do {
5442 offline_css(css);
5443 css_put(css);
5444 /* @css can't go away while we're holding cgroup_mutex */
5445 css = css->parent;
5446 } while (css && atomic_dec_and_test(&css->online_cnt));
5447
5448 mutex_unlock(&cgroup_mutex);
Tejun Heod3daf282013-06-13 19:39:16 -07005449}
5450
Tejun Heo223dbc32013-08-13 20:22:50 -04005451/* css kill confirmation processing requires process context, bounce */
5452static void css_killed_ref_fn(struct percpu_ref *ref)
Tejun Heod3daf282013-06-13 19:39:16 -07005453{
5454 struct cgroup_subsys_state *css =
5455 container_of(ref, struct cgroup_subsys_state, refcnt);
5456
Tejun Heoaa226ff2016-01-21 15:31:11 -05005457 if (atomic_dec_and_test(&css->online_cnt)) {
5458 INIT_WORK(&css->destroy_work, css_killed_work_fn);
5459 queue_work(cgroup_destroy_wq, &css->destroy_work);
5460 }
Tejun Heod3daf282013-06-13 19:39:16 -07005461}
5462
Tejun Heof392e512014-04-23 11:13:14 -04005463/**
5464 * kill_css - destroy a css
5465 * @css: css to destroy
5466 *
5467 * This function initiates destruction of @css by removing cgroup interface
5468 * files and putting its base reference. ->css_offline() will be invoked
Tejun Heoec903c02014-05-13 12:11:01 -04005469 * asynchronously once css_tryget_online() is guaranteed to fail and when
5470 * the reference count reaches zero, @css will be released.
Tejun Heof392e512014-04-23 11:13:14 -04005471 */
5472static void kill_css(struct cgroup_subsys_state *css)
Tejun Heoedae0c32013-08-13 20:22:51 -04005473{
Tejun Heo01f64742014-05-13 12:19:23 -04005474 lockdep_assert_held(&cgroup_mutex);
Tejun Heo94419622014-03-19 10:23:54 -04005475
Waiman Long33c35aa2017-05-15 09:34:06 -04005476 if (css->flags & CSS_DYING)
5477 return;
5478
5479 css->flags |= CSS_DYING;
5480
Tejun Heo2bd59d42014-02-11 11:52:49 -05005481 /*
5482 * This must happen before css is disassociated with its cgroup.
5483 * See seq_css() for details.
5484 */
Tejun Heo334c3672016-03-03 09:58:01 -05005485 css_clear_dir(css);
Tejun Heo3c14f8b2013-08-13 20:22:51 -04005486
Tejun Heoedae0c32013-08-13 20:22:51 -04005487 /*
5488 * Killing would put the base ref, but we need to keep it alive
5489 * until after ->css_offline().
5490 */
5491 css_get(css);
5492
5493 /*
5494 * cgroup core guarantees that, by the time ->css_offline() is
5495 * invoked, no new css reference will be given out via
Tejun Heoec903c02014-05-13 12:11:01 -04005496 * css_tryget_online(). We can't simply call percpu_ref_kill() and
Tejun Heoedae0c32013-08-13 20:22:51 -04005497 * proceed to offlining css's because percpu_ref_kill() doesn't
5498 * guarantee that the ref is seen as killed on all CPUs on return.
5499 *
5500 * Use percpu_ref_kill_and_confirm() to get notifications as each
5501 * css is confirmed to be seen as killed on all CPUs.
5502 */
5503 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
Tejun Heod3daf282013-06-13 19:39:16 -07005504}
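
/*
 * Illustrative sketch (not part of the kernel build): the
 * percpu_ref_kill_and_confirm() pattern kill_css() relies on, reduced to
 * its bones for a hypothetical object. The confirm callback fires once
 * the ref is seen as killed on all CPUs; the release callback fires when
 * the count reaches zero and, like cgroup, punts freeing to a workqueue.
 */
#if 0 /* sketch only */
#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct obj {
	/* set up with percpu_ref_init(&ref, obj_release, 0, GFP_KERNEL) */
	struct percpu_ref ref;
	struct work_struct destroy_work;
};

static void obj_free_fn(struct work_struct *work)
{
	struct obj *o = container_of(work, struct obj, destroy_work);

	percpu_ref_exit(&o->ref);
	kfree(o);
}

static void obj_release(struct percpu_ref *ref)
{
	struct obj *o = container_of(ref, struct obj, ref);

	/* may run in atomic context: bounce the free to process context */
	INIT_WORK(&o->destroy_work, obj_free_fn);
	queue_work(system_wq, &o->destroy_work);
}

static void obj_confirm_kill(struct percpu_ref *ref)
{
	/* percpu mode drained: tryget-style lookups are guaranteed to fail */
}

static void obj_destroy(struct obj *o)
{
	percpu_ref_kill_and_confirm(&o->ref, obj_confirm_kill);
}
#endif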
5505
5506/**
5507 * cgroup_destroy_locked - the first stage of cgroup destruction
5508 * @cgrp: cgroup to be destroyed
5509 *
5510 * css's make use of percpu refcnts whose killing latency shouldn't be
5511 * exposed to userland and are RCU protected. Also, cgroup core needs to
Tejun Heoec903c02014-05-13 12:11:01 -04005512 * guarantee that css_tryget_online() won't succeed by the time
5513 * ->css_offline() is invoked. To satisfy all the requirements,
5514 * destruction is implemented in the following two steps.
Tejun Heod3daf282013-06-13 19:39:16 -07005515 *
5516 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
5517 * userland visible parts and start killing the percpu refcnts of
5518 * css's. Set up so that the next stage will be kicked off once all
5519 * the percpu refcnts are confirmed to be killed.
5520 *
5521 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
5522 * rest of destruction. Once all cgroup references are gone, the
5523 * cgroup is RCU-freed.
5524 *
5525 * This function implements s1. After this step, @cgrp is gone as far as
5526 * the userland is concerned and a new cgroup with the same name may be
5527 * created. As cgroup doesn't care about the names internally, this
5528 * doesn't cause any problem.
5529 */
Tejun Heo42809dd2012-11-19 08:13:37 -08005530static int cgroup_destroy_locked(struct cgroup *cgrp)
5531 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
Paul Menageddbcc7e2007-10-18 23:39:30 -07005532{
Roman Gushchin0679dee2017-08-02 17:55:29 +01005533 struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
Tejun Heo2bd59d42014-02-11 11:52:49 -05005534 struct cgroup_subsys_state *css;
Tejun Heo2b021cb2016-03-15 20:43:04 -04005535 struct cgrp_cset_link *link;
Tejun Heo1c6727a2013-12-06 15:11:56 -05005536 int ssid;
Paul Menageddbcc7e2007-10-18 23:39:30 -07005537
Tejun Heo42809dd2012-11-19 08:13:37 -08005538 lockdep_assert_held(&cgroup_mutex);
5539
Tejun Heo91486f62015-10-15 16:41:51 -04005540 /*
5541 * Only migration can raise populated from zero and we're already
5542 * holding cgroup_mutex.
5543 */
5544 if (cgroup_is_populated(cgrp))
Paul Menageddbcc7e2007-10-18 23:39:30 -07005545 return -EBUSY;
Tejun Heoed9577932012-11-05 09:16:58 -08005546
Tejun Heo1a90dd52012-11-05 09:16:59 -08005547 /*
Tejun Heod5c419b2014-05-16 13:22:48 -04005548	 * Make sure there are no live children. We can't test emptiness of
5549 * ->self.children as dead children linger on it while being
5550 * drained; otherwise, "rmdir parent/child parent" may fail.
Hugh Dickinsbb78a922013-08-28 16:31:23 -07005551 */
Tejun Heof3d46502014-05-16 13:22:52 -04005552 if (css_has_online_children(&cgrp->self))
Hugh Dickinsbb78a922013-08-28 16:31:23 -07005553 return -EBUSY;
5554
5555 /*
Tejun Heo2b021cb2016-03-15 20:43:04 -04005556 * Mark @cgrp and the associated csets dead. The former prevents
5557 * further task migration and child creation by disabling
5558 * cgroup_lock_live_group(). The latter makes the csets ignored by
5559 * the migration path.
Tejun Heo455050d2013-06-13 19:27:41 -07005560 */
Tejun Heo184faf32014-05-16 13:22:51 -04005561 cgrp->self.flags &= ~CSS_ONLINE;
Tejun Heo1a90dd52012-11-05 09:16:59 -08005562
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03005563 spin_lock_irq(&css_set_lock);
Tejun Heo2b021cb2016-03-15 20:43:04 -04005564 list_for_each_entry(link, &cgrp->cset_links, cset_link)
5565 link->cset->dead = true;
Daniel Bristot de Oliveira82d64892016-06-22 17:28:41 -03005566 spin_unlock_irq(&css_set_lock);
Tejun Heo2b021cb2016-03-15 20:43:04 -04005567
Tejun Heo249f3462014-05-14 09:15:01 -04005568 /* initiate massacre of all css's */
Tejun Heo1a90dd52012-11-05 09:16:59 -08005569 for_each_css(css, ssid, cgrp)
Tejun Heo455050d2013-06-13 19:27:41 -07005570 kill_css(css);
5571
Tejun Heo5faaf052018-04-26 14:29:04 -07005572 /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
5573 css_clear_dir(&cgrp->self);
Tejun Heo01f64742014-05-13 12:19:23 -04005574 kernfs_remove(cgrp->kn);
Tejun Heof20104d2013-08-13 20:22:50 -04005575
Tejun Heo454000a2017-05-15 09:34:02 -04005576 if (parent && cgroup_is_threaded(cgrp))
5577 parent->nr_threaded_children--;
5578
Roman Gushchin4dcabec2019-04-19 10:03:03 -07005579 spin_lock_irq(&css_set_lock);
Roman Gushchin0679dee2017-08-02 17:55:29 +01005580 for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5581 tcgrp->nr_descendants--;
5582 tcgrp->nr_dying_descendants++;
Roman Gushchin76f969e2019-04-19 10:03:04 -07005583 /*
5584 * If the dying cgroup is frozen, decrease frozen descendants
5585 * counters of ancestor cgroups.
5586 */
5587 if (test_bit(CGRP_FROZEN, &cgrp->flags))
5588 tcgrp->freezer.nr_frozen_descendants--;
Roman Gushchin0679dee2017-08-02 17:55:29 +01005589 }
Roman Gushchin4dcabec2019-04-19 10:03:03 -07005590 spin_unlock_irq(&css_set_lock);
Roman Gushchin0679dee2017-08-02 17:55:29 +01005591
Roman Gushchin5a621e62017-08-02 17:55:32 +01005592 cgroup1_check_for_release(parent);
Tejun Heo2bd59d42014-02-11 11:52:49 -05005593
Roman Gushchin4bfc0bb2019-05-25 09:37:39 -07005594 cgroup_bpf_offline(cgrp);
5595
Tejun Heo249f3462014-05-14 09:15:01 -04005596 /* put the base reference */
Tejun Heo9d755d32014-05-14 09:15:02 -04005597 percpu_ref_kill(&cgrp->self.refcnt);
Tejun Heo455050d2013-06-13 19:27:41 -07005598
Tejun Heoea15f8c2013-06-13 19:27:42 -07005599 return 0;
5600}
5601
Tejun Heo1592c9b2016-12-27 14:49:08 -05005602int cgroup_rmdir(struct kernfs_node *kn)
Tejun Heo42809dd2012-11-19 08:13:37 -08005603{
Tejun Heoa9746d82014-05-13 12:19:22 -04005604 struct cgroup *cgrp;
Tejun Heo2bd59d42014-02-11 11:52:49 -05005605 int ret = 0;
Tejun Heo42809dd2012-11-19 08:13:37 -08005606
Tejun Heo945ba192016-03-03 09:58:00 -05005607 cgrp = cgroup_kn_lock_live(kn, false);
Tejun Heoa9746d82014-05-13 12:19:22 -04005608 if (!cgrp)
5609 return 0;
Tejun Heo42809dd2012-11-19 08:13:37 -08005610
Tejun Heoa9746d82014-05-13 12:19:22 -04005611 ret = cgroup_destroy_locked(cgrp);
Tejun Heoed1777d2016-08-10 11:23:44 -04005612 if (!ret)
Steven Rostedt (VMware)e4f8d812018-07-09 17:48:54 -04005613 TRACE_CGROUP_PATH(rmdir, cgrp);
Tejun Heoed1777d2016-08-10 11:23:44 -04005614
Tejun Heoa9746d82014-05-13 12:19:22 -04005615 cgroup_kn_unlock(kn);
Tejun Heo42809dd2012-11-19 08:13:37 -08005616 return ret;
5617}
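
/*
 * Illustrative sketch (not part of the kernel): cgroup_destroy_locked()
 * refuses populated cgroups and cgroups with live children, so an
 * rmdir(2) on such a directory returns EBUSY. The path is hypothetical.
 */
#if 0 /* userspace example, not kernel code */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* fails with EBUSY while tasks or live children remain */
	if (rmdir("/sys/fs/cgroup/mygrp"))
		perror("rmdir");
	return 0;
}
#endif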
5618
Tejun Heo2bd59d42014-02-11 11:52:49 -05005619static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
Tejun Heo5136f632017-06-27 14:30:28 -04005620 .show_options = cgroup_show_options,
Tejun Heo2bd59d42014-02-11 11:52:49 -05005621 .mkdir = cgroup_mkdir,
5622 .rmdir = cgroup_rmdir,
Serge E. Hallyn4f41fc52016-05-09 09:59:55 -05005623 .show_path = cgroup_show_path,
Tejun Heo2bd59d42014-02-11 11:52:49 -05005624};
Tejun Heo8e3f6542012-04-01 12:09:55 -07005625
Tejun Heo15a4c832014-05-04 15:09:14 -04005626static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
Paul Menageddbcc7e2007-10-18 23:39:30 -07005627{
Paul Menageddbcc7e2007-10-18 23:39:30 -07005628 struct cgroup_subsys_state *css;
Diego Callejacfe36bd2007-11-14 16:58:54 -08005629
Tejun Heoa5ae9892015-12-29 14:53:56 -05005630 pr_debug("Initializing cgroup subsys %s\n", ss->name);
Paul Menageddbcc7e2007-10-18 23:39:30 -07005631
Tejun Heo648bb562012-11-19 08:13:36 -08005632 mutex_lock(&cgroup_mutex);
5633
Tejun Heo15a4c832014-05-04 15:09:14 -04005634 idr_init(&ss->css_idr);
Tejun Heo0adb0702014-02-12 09:29:48 -05005635 INIT_LIST_HEAD(&ss->cfts);
Tejun Heo8e3f6542012-04-01 12:09:55 -07005636
Tejun Heo3dd06ff2014-03-19 10:23:54 -04005637 /* Create the root cgroup state for this subsystem */
5638 ss->root = &cgrp_dfl_root;
5639 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
Paul Menageddbcc7e2007-10-18 23:39:30 -07005640 /* We don't handle early failures gracefully */
5641 BUG_ON(IS_ERR(css));
Tejun Heoddfcada2014-05-04 15:09:14 -04005642 init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
Tejun Heo3b514d22014-05-16 13:22:47 -04005643
5644 /*
5645 * Root csses are never destroyed and we can't initialize
5646 * percpu_ref during early init. Disable refcnting.
5647 */
5648 css->flags |= CSS_NO_REF;
5649
Tejun Heo15a4c832014-05-04 15:09:14 -04005650 if (early) {
Tejun Heo9395a452014-05-14 09:15:02 -04005651 /* allocation can't be done safely during early init */
Tejun Heo15a4c832014-05-04 15:09:14 -04005652 css->id = 1;
5653 } else {
5654 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
5655 BUG_ON(css->id < 0);
5656 }
Paul Menageddbcc7e2007-10-18 23:39:30 -07005657
Li Zefane8d55fd2008-04-29 01:00:13 -07005658 /* Update the init_css_set to contain a subsys
Paul Menage817929e2007-10-18 23:39:36 -07005659 * pointer to this state - since the subsystem is
Li Zefane8d55fd2008-04-29 01:00:13 -07005660 * newly registered, all tasks and hence the
Tejun Heo3dd06ff2014-03-19 10:23:54 -04005661 * init_css_set are in the subsystem's root cgroup. */
Tejun Heoaec25022014-02-08 10:36:58 -05005662 init_css_set.subsys[ss->id] = css;
Paul Menageddbcc7e2007-10-18 23:39:30 -07005663
Aleksa Saraicb4a3162015-06-06 10:02:14 +10005664 have_fork_callback |= (bool)ss->fork << ss->id;
5665 have_exit_callback |= (bool)ss->exit << ss->id;
Oleg Nesterov51bee5a2019-01-28 17:00:13 +01005666 have_release_callback |= (bool)ss->release << ss->id;
Aleksa Sarai7e476822015-06-09 21:32:09 +10005667 have_canfork_callback |= (bool)ss->can_fork << ss->id;
Paul Menageddbcc7e2007-10-18 23:39:30 -07005668
Li Zefane8d55fd2008-04-29 01:00:13 -07005669 /* At system boot, before all subsystems have been
5670 * registered, no tasks have been forked, so we don't
5671 * need to invoke fork callbacks here. */
5672 BUG_ON(!list_empty(&init_task.tasks));
5673
Tejun Heoae7f1642013-08-13 20:22:50 -04005674 BUG_ON(online_css(css));
Tejun Heoa8638032012-11-09 09:12:29 -08005675
Tejun Heo648bb562012-11-19 08:13:36 -08005676 mutex_unlock(&cgroup_mutex);
Paul Menageddbcc7e2007-10-18 23:39:30 -07005677}
5678
5679/**
Li Zefana043e3b2008-02-23 15:24:09 -08005680 * cgroup_init_early - cgroup initialization at system boot
5681 *
5682 * Initialize cgroups at system boot, and initialize any
5683 * subsystems that request early init.
Paul Menageddbcc7e2007-10-18 23:39:30 -07005684 */
5685int __init cgroup_init_early(void)
5686{
Al Virof5dfb532019-01-16 23:42:38 -05005687 static struct cgroup_fs_context __initdata ctx;
Tejun Heo30159ec2013-06-25 11:53:37 -07005688 struct cgroup_subsys *ss;
Paul Menageddbcc7e2007-10-18 23:39:30 -07005689 int i;
Tejun Heo30159ec2013-06-25 11:53:37 -07005690
Al Virocf6299b12019-01-17 02:25:51 -05005691 ctx.root = &cgrp_dfl_root;
5692 init_cgroup_root(&ctx);
Tejun Heo3b514d22014-05-16 13:22:47 -04005693 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
5694
Tejun Heoa4ea1cc2013-06-21 15:52:33 -07005695 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
Paul Menage817929e2007-10-18 23:39:36 -07005696
Tejun Heo3ed80a62014-02-08 10:36:58 -05005697 for_each_subsys(ss, i) {
Tejun Heoaec25022014-02-08 10:36:58 -05005698 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
Xiubo Li63253ad2016-02-26 13:07:38 +08005699 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
Tejun Heo073219e2014-02-08 10:36:58 -05005700 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
Tejun Heoaec25022014-02-08 10:36:58 -05005701 ss->id, ss->name);
Tejun Heo073219e2014-02-08 10:36:58 -05005702 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
5703 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
Paul Menageddbcc7e2007-10-18 23:39:30 -07005704
Tejun Heoaec25022014-02-08 10:36:58 -05005705 ss->id = i;
Tejun Heo073219e2014-02-08 10:36:58 -05005706 ss->name = cgroup_subsys_name[i];
Tejun Heo3e1d2ee2015-08-18 13:58:16 -07005707 if (!ss->legacy_name)
5708 ss->legacy_name = cgroup_subsys_name[i];
Paul Menageddbcc7e2007-10-18 23:39:30 -07005709
5710 if (ss->early_init)
Tejun Heo15a4c832014-05-04 15:09:14 -04005711 cgroup_init_subsys(ss, true);
Paul Menageddbcc7e2007-10-18 23:39:30 -07005712 }
5713 return 0;
5714}
5715
Tejun Heo6e5c8302016-02-22 22:25:47 -05005716static u16 cgroup_disable_mask __initdata;
Tejun Heoa3e72732015-09-25 16:24:27 -04005717
Paul Menageddbcc7e2007-10-18 23:39:30 -07005718/**
Li Zefana043e3b2008-02-23 15:24:09 -08005719 * cgroup_init - cgroup initialization
5720 *
5721 * Register cgroup filesystem and /proc file, and initialize
5722 * any subsystems that didn't request early init.
Paul Menageddbcc7e2007-10-18 23:39:30 -07005723 */
5724int __init cgroup_init(void)
5725{
Tejun Heo30159ec2013-06-25 11:53:37 -07005726 struct cgroup_subsys *ss;
Tejun Heo035f4f52015-10-15 17:00:43 -04005727 int ssid;
Paul Menagea4243162007-10-18 23:39:35 -07005728
Tejun Heo6e5c8302016-02-22 22:25:47 -05005729 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
Tejun Heod62beb72016-12-27 14:49:08 -05005730 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
5731 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
Paul Menageddbcc7e2007-10-18 23:39:30 -07005732
Tejun Heoc58632b2018-04-26 14:29:04 -07005733 cgroup_rstat_boot();
Tejun Heo041cd642017-09-25 08:12:05 -07005734
Peter Zijlstra3942a9b2016-08-11 18:54:13 +02005735 /*
Paul E. McKenney2af30242018-11-07 14:11:40 -08005736 * The latency of the synchronize_rcu() is too high for cgroups,
Peter Zijlstra3942a9b2016-08-11 18:54:13 +02005737	 * so avoid it at the cost of forcing all readers into the slow path.
5738 */
5739 rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
5740
Aditya Kalia79a9082016-01-29 02:54:06 -06005741 get_user_ns(init_cgroup_ns.user_ns);
5742
Tejun Heo54e7b4e2013-04-14 11:36:57 -07005743 mutex_lock(&cgroup_mutex);
Tejun Heo54e7b4e2013-04-14 11:36:57 -07005744
Tejun Heo2378d8b2016-03-03 09:57:57 -05005745 /*
5746 * Add init_css_set to the hash table so that dfl_root can link to
5747 * it during init.
5748 */
5749 hash_add(css_set_table, &init_css_set.hlist,
5750 css_set_hash(init_css_set.subsys));
Tejun Heo82fe9b02013-06-25 11:53:37 -07005751
Al Viro35ac1182019-01-12 00:20:54 -05005752 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
Greg KH676db4a2010-08-05 13:53:35 -07005753
Tejun Heo54e7b4e2013-04-14 11:36:57 -07005754 mutex_unlock(&cgroup_mutex);
5755
Tejun Heo172a2c062014-03-19 10:23:53 -04005756 for_each_subsys(ss, ssid) {
Tejun Heo15a4c832014-05-04 15:09:14 -04005757 if (ss->early_init) {
5758 struct cgroup_subsys_state *css =
5759 init_css_set.subsys[ss->id];
5760
5761 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
5762 GFP_KERNEL);
5763 BUG_ON(css->id < 0);
5764 } else {
5765 cgroup_init_subsys(ss, false);
5766 }
Tejun Heo172a2c062014-03-19 10:23:53 -04005767
Tejun Heo2d8f2432014-04-23 11:13:15 -04005768 list_add_tail(&init_css_set.e_cset_node[ssid],
5769 &cgrp_dfl_root.cgrp.e_csets[ssid]);
Tejun Heo172a2c062014-03-19 10:23:53 -04005770
5771 /*
Li Zefanc731ae12014-06-05 17:16:30 +08005772 * Setting dfl_root subsys_mask needs to consider the
5773 * disabled flag and cftype registration needs kmalloc,
5774 * both of which aren't available during early_init.
Tejun Heo172a2c062014-03-19 10:23:53 -04005775 */
Tejun Heoa3e72732015-09-25 16:24:27 -04005776 if (cgroup_disable_mask & (1 << ssid)) {
5777 static_branch_disable(cgroup_subsys_enabled_key[ssid]);
5778 printk(KERN_INFO "Disabling %s control group subsystem\n",
5779 ss->name);
Tejun Heoa8ddc822014-07-15 11:05:10 -04005780 continue;
Tejun Heoa3e72732015-09-25 16:24:27 -04005781 }
Tejun Heoa8ddc822014-07-15 11:05:10 -04005782
Tejun Heod62beb72016-12-27 14:49:08 -05005783 if (cgroup1_ssid_disabled(ssid))
Johannes Weiner223ffb22016-02-11 13:34:49 -05005784 printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
5785 ss->name);
5786
Tejun Heoa8ddc822014-07-15 11:05:10 -04005787 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
5788
Tejun Heo8cfd8142017-07-21 11:14:51 -04005789 /* implicit controllers must be threaded too */
5790 WARN_ON(ss->implicit_on_dfl && !ss->threaded);
5791
Tejun Heof6d635ad2016-03-08 11:51:26 -05005792 if (ss->implicit_on_dfl)
5793 cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
5794 else if (!ss->dfl_cftypes)
Tejun Heoa7165262016-02-23 10:00:50 -05005795 cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
Tejun Heo5de4fa12014-07-15 11:05:10 -04005796
Tejun Heo8cfd8142017-07-21 11:14:51 -04005797 if (ss->threaded)
5798 cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
5799
Tejun Heoa8ddc822014-07-15 11:05:10 -04005800 if (ss->dfl_cftypes == ss->legacy_cftypes) {
5801 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
5802 } else {
5803 WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
5804 WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
Li Zefanc731ae12014-06-05 17:16:30 +08005805 }
Vladimir Davydov295458e2015-02-19 17:34:46 +03005806
5807 if (ss->bind)
5808 ss->bind(init_css_set.subsys[ssid]);
Tejun Heo7af608e2017-07-18 17:57:46 -04005809
5810 mutex_lock(&cgroup_mutex);
5811 css_populate_dir(init_css_set.subsys[ssid]);
5812 mutex_unlock(&cgroup_mutex);
Tejun Heo172a2c062014-03-19 10:23:53 -04005813 }
Greg KH676db4a2010-08-05 13:53:35 -07005814
Tejun Heo2378d8b2016-03-03 09:57:57 -05005815 /* init_css_set.subsys[] has been updated, re-hash */
5816 hash_del(&init_css_set.hlist);
5817 hash_add(css_set_table, &init_css_set.hlist,
5818 css_set_hash(init_css_set.subsys));
5819
Tejun Heo035f4f52015-10-15 17:00:43 -04005820 WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
5821 WARN_ON(register_filesystem(&cgroup_fs_type));
Tejun Heo67e9c742015-11-16 11:13:34 -05005822 WARN_ON(register_filesystem(&cgroup2_fs_type));
Christoph Hellwig3f3942a2018-05-15 15:57:23 +02005823 WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
Al Virod5f68d32019-05-13 12:33:22 -04005824#ifdef CONFIG_CPUSETS
5825 WARN_ON(register_filesystem(&cpuset_fs_type));
5826#endif
Paul Menagea4243162007-10-18 23:39:35 -07005827
Tejun Heo2bd59d42014-02-11 11:52:49 -05005828 return 0;
Paul Menageddbcc7e2007-10-18 23:39:30 -07005829}
Paul Menageb4f48b62007-10-18 23:39:33 -07005830
Tejun Heoe5fca242013-11-22 17:14:39 -05005831static int __init cgroup_wq_init(void)
5832{
5833 /*
5834 * There isn't much point in executing the destruction path in
5835 * parallel. A good chunk of it is serialized with cgroup_mutex anyway.
Tejun Heo1a115332014-02-12 19:06:19 -05005836 * Use 1 for @max_active.
Tejun Heoe5fca242013-11-22 17:14:39 -05005837 *
5838 * We would prefer to do this in cgroup_init() above, but that
5839 * is called before init_workqueues(), so leave this until after.
5840 */
Tejun Heo1a115332014-02-12 19:06:19 -05005841 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
Tejun Heoe5fca242013-11-22 17:14:39 -05005842 BUG_ON(!cgroup_destroy_wq);
5843 return 0;
5844}
5845core_initcall(cgroup_wq_init);
5846
Tejun Heo67c04962019-11-04 15:54:30 -08005847void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
Shaohua Li69fd5c32017-07-12 11:49:55 -07005848{
5849 struct kernfs_node *kn;
5850
Tejun Heofe0f7262019-11-04 15:54:30 -08005851 kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
Shaohua Li69fd5c32017-07-12 11:49:55 -07005852 if (!kn)
5853 return;
5854 kernfs_path(kn, buf, buflen);
5855 kernfs_put(kn);
5856}
5857
Paul Menagea4243162007-10-18 23:39:35 -07005858/*
5859 * proc_cgroup_show()
5860 * - Print task's cgroup paths into seq_file, one line for each hierarchy
 * - Used for /proc/<pid>/cgroup.
 */
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	int retval;
	struct cgroup_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	for_each_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int ssid, count = 0;

		if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
			continue;

		seq_printf(m, "%d:", root->hierarchy_id);
		if (root != &cgrp_dfl_root)
			for_each_subsys(ss, ssid)
				if (root->subsys_mask & (1 << ssid))
					seq_printf(m, "%s%s", count++ ? "," : "",
						   ss->legacy_name);
		if (strlen(root->name))
			seq_printf(m, "%sname=%s", count ? "," : "",
				   root->name);
		seq_putc(m, ':');

		cgrp = task_cgroup_from_root(tsk, root);

		/*
		 * On traditional hierarchies, all zombie tasks show up as
		 * belonging to the root cgroup. On the default hierarchy,
		 * while a zombie doesn't show up in "cgroup.procs" and
		 * thus can't be migrated, its /proc/PID/cgroup keeps
		 * reporting the cgroup it belonged to before exiting. If
		 * the cgroup is removed before the zombie is reaped,
		 * " (deleted)" is appended to the cgroup path.
		 */
		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
			retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
						current->nsproxy->cgroup_ns);
			if (retval >= PATH_MAX)
				retval = -ENAMETOOLONG;
			if (retval < 0)
				goto out_unlock;

			seq_puts(m, buf);
		} else {
			seq_puts(m, "/");
		}

		if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
			seq_puts(m, " (deleted)\n");
		else
			seq_putc(m, '\n');
	}

	retval = 0;
out_unlock:
	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);
	kfree(buf);
out:
	return retval;
}

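/*
 * Illustrative output of the function above (paths and hierarchy IDs are
 * examples and vary by system); one "hierarchy-id:controller-list:path"
 * line per hierarchy, with the empty controller list denoting the default
 * hierarchy:
 *
 *	$ cat /proc/self/cgroup
 *	2:cpu,cpuacct:/user.slice
 *	1:name=systemd:/user.slice/user-1000.slice/session-2.scope
 *	0::/user.slice/user-1000.slice/session-2.scope
 *
 * For a zombie whose default-hierarchy cgroup was already removed, the
 * last line would instead end in " (deleted)".
 */
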
/**
 * cgroup_fork - initialize cgroup related fields during copy_process()
 * @child: pointer to task_struct of the child process
 *
 * A task is associated with the init_css_set until cgroup_post_fork()
 * attaches it to the target css_set.
 */
void cgroup_fork(struct task_struct *child)
{
	RCU_INIT_POINTER(child->cgroups, &init_css_set);
	INIT_LIST_HEAD(&child->cg_list);
}

static struct cgroup *cgroup_get_from_file(struct file *f)
{
	struct cgroup_subsys_state *css;
	struct cgroup *cgrp;

	css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
	if (IS_ERR(css))
		return ERR_CAST(css);

	cgrp = css->cgroup;
	if (!cgroup_on_dfl(cgrp)) {
		cgroup_put(cgrp);
		return ERR_PTR(-EBADF);
	}

	return cgrp;
}

/**
 * cgroup_css_set_fork - find or create a css_set for a child process
 * @kargs: the arguments passed to create the child process
 *
 * This function finds or creates a new css_set which the child
 * process will be attached to in cgroup_post_fork(). By default,
 * the child process will be given the same css_set as its parent.
 *
 * If CLONE_INTO_CGROUP is specified, this function will try to find an
 * existing css_set which includes the requested cgroup and, failing
 * that, create a new css_set that the child will be attached to later.
 * On success, cgroup_threadgroup_rwsem is held on return. If
 * CLONE_INTO_CGROUP is requested, cgroup_mutex is grabbed before
 * cgroup_threadgroup_rwsem, and a reference to the target cgroup is
 * held as well.
 */
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
	__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
	int ret;
	struct cgroup *dst_cgrp = NULL;
	struct css_set *cset;
	struct super_block *sb;
	struct file *f;

	if (kargs->flags & CLONE_INTO_CGROUP)
		mutex_lock(&cgroup_mutex);

	cgroup_threadgroup_change_begin(current);

	spin_lock_irq(&css_set_lock);
	cset = task_css_set(current);
	get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
		kargs->cset = cset;
		return 0;
	}

	f = fget_raw(kargs->cgroup);
	if (!f) {
		ret = -EBADF;
		goto err;
	}
	sb = f->f_path.dentry->d_sb;

	dst_cgrp = cgroup_get_from_file(f);
	if (IS_ERR(dst_cgrp)) {
		ret = PTR_ERR(dst_cgrp);
		dst_cgrp = NULL;
		goto err;
	}

	if (cgroup_is_dead(dst_cgrp)) {
		ret = -ENODEV;
		goto err;
	}

	/*
	 * Verify that the target cgroup is writable for us. This is
	 * usually done by the vfs layer but since we're not going through
	 * the vfs layer here we need to do it "manually".
	 */
	ret = cgroup_may_write(dst_cgrp, sb);
	if (ret)
		goto err;

	ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
					!(kargs->flags & CLONE_THREAD));
	if (ret)
		goto err;

	kargs->cset = find_css_set(cset, dst_cgrp);
	if (!kargs->cset) {
		ret = -ENOMEM;
		goto err;
	}

	put_css_set(cset);
	fput(f);
	kargs->cgrp = dst_cgrp;
	return ret;

err:
	cgroup_threadgroup_change_end(current);
	mutex_unlock(&cgroup_mutex);
	if (f)
		fput(f);
	if (dst_cgrp)
		cgroup_put(dst_cgrp);
	put_css_set(cset);
	if (kargs->cset)
		put_css_set(kargs->cset);
	return ret;
}

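/*
 * Illustrative userspace counterpart (a sketch, not part of this file):
 * spawning a child directly into a cgroup2 directory via clone3(). The
 * cgroup path is hypothetical and a raw syscall is used since libc may
 * not wrap clone3(); an O_PATH fd also works because the fd is resolved
 * with fget_raw() above. Error handling is elided.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <linux/sched.h>	// struct clone_args, CLONE_INTO_CGROUP
 *	#include <signal.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	pid_t spawn_into(const char *cgpath)
 *	{
 *		int cgfd = open(cgpath, O_RDONLY | O_DIRECTORY);
 *		struct clone_args args = {
 *			.flags		= CLONE_INTO_CGROUP,
 *			.exit_signal	= SIGCHLD,
 *			.cgroup		= (__u64)cgfd,
 *		};
 *
 *		// returns 0 in the child, which starts life in cgpath
 *		return syscall(__NR_clone3, &args, sizeof(args));
 *	}
 */
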
/**
 * cgroup_css_set_put_fork - drop references we took during fork
 * @kargs: the arguments passed to create the child process
 *
 * Drop references to the prepared css_set and target cgroup if
 * CLONE_INTO_CGROUP was requested.
 */
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
	cgroup_threadgroup_change_end(current);

	if (kargs->flags & CLONE_INTO_CGROUP) {
		struct cgroup *cgrp = kargs->cgrp;
		struct css_set *cset = kargs->cset;

		mutex_unlock(&cgroup_mutex);

		if (cset) {
			put_css_set(cset);
			kargs->cset = NULL;
		}

		if (cgrp) {
			cgroup_put(cgrp);
			kargs->cgrp = NULL;
		}
	}
}

/**
 * cgroup_can_fork - called on a new task before the process is exposed
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * This prepares a new css_set for the child process which the child will
 * be attached to in cgroup_post_fork(), and calls the can_fork() callbacks
 * of the subsystems that provide one. If any of those callbacks returns an
 * error, the fork aborts with that error code. This allows for a cgroup
 * subsystem to conditionally allow or deny new forks.
 */
int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
{
	struct cgroup_subsys *ss;
	int i, j, ret;

	ret = cgroup_css_set_fork(kargs);
	if (ret)
		return ret;

	do_each_subsys_mask(ss, i, have_canfork_callback) {
		ret = ss->can_fork(child, kargs->cset);
		if (ret)
			goto out_revert;
	} while_each_subsys_mask();

	return 0;

out_revert:
	for_each_subsys(ss, j) {
		if (j >= i)
			break;
		if (ss->cancel_fork)
			ss->cancel_fork(child, kargs->cset);
	}

	cgroup_css_set_put_fork(kargs);

	return ret;
}

/**
 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * This calls the cancel_fork() callbacks if a fork failed *after*
 * cgroup_can_fork() succeeded and cleans up references we took to
 * prepare a new css_set for the child process in cgroup_can_fork().
 */
void cgroup_cancel_fork(struct task_struct *child,
			struct kernel_clone_args *kargs)
{
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		if (ss->cancel_fork)
			ss->cancel_fork(child, kargs->cset);

	cgroup_css_set_put_fork(kargs);
}

/**
 * cgroup_post_fork - finalize cgroup setup for the child process
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * Attach the child process to its css_set and call the subsystem fork()
 * callbacks.
 */
void cgroup_post_fork(struct task_struct *child,
		      struct kernel_clone_args *kargs)
	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
	unsigned long cgrp_flags = 0;
	bool kill = false;
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	cset = kargs->cset;
	kargs->cset = NULL;

	spin_lock_irq(&css_set_lock);

	/* init tasks are special, only link regular threads */
	if (likely(child->pid)) {
		if (kargs->cgrp)
			cgrp_flags = kargs->cgrp->flags;
		else
			cgrp_flags = cset->dfl_cgrp->flags;

		WARN_ON_ONCE(!list_empty(&child->cg_list));
		cset->nr_tasks++;
		css_set_move_task(child, NULL, cset, false);
	} else {
		put_css_set(cset);
		cset = NULL;
	}

	if (!(child->flags & PF_KTHREAD)) {
		if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
			/*
			 * If the cgroup has to be frozen, the new task has
			 * to be frozen too. Let's set the JOBCTL_TRAP_FREEZE
			 * jobctl bit to get the task into the frozen state.
			 */
			spin_lock(&child->sighand->siglock);
			WARN_ON_ONCE(child->frozen);
			child->jobctl |= JOBCTL_TRAP_FREEZE;
			spin_unlock(&child->sighand->siglock);

			/*
			 * Calling cgroup_update_frozen() isn't required here,
			 * because it will be called anyway a bit later from
			 * do_freezer_trap(). So we avoid cgroup's transient
			 * switch from the frozen state and back.
			 */
		}

		/*
		 * If the cgroup is to be killed, note it now and take the
		 * child down right after we finish preparing it for
		 * userspace.
		 */
		kill = test_bit(CGRP_KILL, &cgrp_flags);
	}

	spin_unlock_irq(&css_set_lock);

	/*
	 * Call ss->fork(). This must happen after @child is linked on
	 * css_set; otherwise, @child might change state between ->fork()
	 * and addition to css_set.
	 */
	do_each_subsys_mask(ss, i, have_fork_callback) {
		ss->fork(child);
	} while_each_subsys_mask();

	/* Make the new cset the root_cset of the new cgroup namespace. */
	if (kargs->flags & CLONE_NEWCGROUP) {
		struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;

		get_css_set(cset);
		child->nsproxy->cgroup_ns->root_cset = cset;
		put_css_set(rcset);
	}

	/* Cgroup has to be killed so take down child immediately. */
	if (unlikely(kill))
		do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);

	cgroup_css_set_put_fork(kargs);
}

/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 *
 * Description: Detach cgroup from @tsk.
 */
void cgroup_exit(struct task_struct *tsk)
{
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	spin_lock_irq(&css_set_lock);

	WARN_ON_ONCE(list_empty(&tsk->cg_list));
	cset = task_css_set(tsk);
	css_set_move_task(tsk, cset, NULL, false);
	list_add_tail(&tsk->cg_list, &cset->dying_tasks);
	cset->nr_tasks--;

	WARN_ON_ONCE(cgroup_task_frozen(tsk));
	if (unlikely(!(tsk->flags & PF_KTHREAD) &&
		     test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
		cgroup_update_frozen(task_dfl_cgroup(tsk));

	spin_unlock_irq(&css_set_lock);

	/* see cgroup_post_fork() for details */
	do_each_subsys_mask(ss, i, have_exit_callback) {
		ss->exit(tsk);
	} while_each_subsys_mask();
}

void cgroup_release(struct task_struct *task)
{
	struct cgroup_subsys *ss;
	int ssid;

	do_each_subsys_mask(ss, ssid, have_release_callback) {
		ss->release(task);
	} while_each_subsys_mask();

	spin_lock_irq(&css_set_lock);
	css_set_skip_task_iters(task_css_set(task), task);
	list_del_init(&task->cg_list);
	spin_unlock_irq(&css_set_lock);
}

void cgroup_free(struct task_struct *task)
{
	struct css_set *cset = task_css_set(task);
	put_css_set(cset);
}

static int __init cgroup_disable(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->name) &&
			    strcmp(token, ss->legacy_name))
				continue;
			cgroup_disable_mask |= 1 << i;
		}
	}
	return 1;
}
__setup("cgroup_disable=", cgroup_disable);

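/*
 * Illustrative usage of the boot parameter above (controller names as
 * registered by the subsystems, e.g. "memory", "io"; the selection here
 * is an example):
 *
 *	cgroup_disable=memory,io
 */
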
void __init __weak enable_debug_cgroup(void) { }

static int __init enable_cgroup_debug(char *str)
{
	cgroup_debug = true;
	enable_debug_cgroup();
	return 1;
}
__setup("cgroup_debug", enable_cgroup_debug);

/**
 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
 * to get the corresponding css and return it. If such css doesn't exist
 * or can't be pinned, an ERR_PTR value is returned.
 */
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
						       struct cgroup_subsys *ss)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct file_system_type *s_type = dentry->d_sb->s_type;
	struct cgroup_subsys_state *css = NULL;
	struct cgroup *cgrp;

	/* is @dentry a cgroup dir? */
	if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
	    !kn || kernfs_type(kn) != KERNFS_DIR)
		return ERR_PTR(-EBADF);

	rcu_read_lock();

	/*
	 * This path doesn't originate from kernfs and @kn could already
	 * have been or be removed at any point. @kn->priv is RCU
	 * protected for this access. See css_release_work_fn() for details.
	 */
	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
	if (cgrp)
		css = cgroup_css(cgrp, ss);

	if (!css || !css_tryget_online(css))
		css = ERR_PTR(-ENOENT);

	rcu_read_unlock();
	return css;
}

/**
 * css_from_id - lookup css by id
 * @id: the cgroup id
 * @ss: cgroup subsys to be looked into
 *
 * Returns the css if there's a valid one with @id, otherwise returns NULL.
 * Should be called under rcu_read_lock().
 */
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return idr_find(&ss->css_idr, id);
}

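/*
 * Illustrative usage sketch (hypothetical caller; the subsystem and @id
 * are assumptions): look up a css under RCU and pin it before using it
 * outside the read-side critical section.
 *
 *	struct cgroup_subsys_state *css;
 *
 *	rcu_read_lock();
 *	css = css_from_id(id, &memory_cgrp_subsys);
 *	if (css && !css_tryget_online(css))
 *		css = NULL;
 *	rcu_read_unlock();
 *	if (css) {
 *		// ... use css ...
 *		css_put(css);
 *	}
 */
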
/**
 * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
 * @path: path on the default hierarchy
 *
 * Find the cgroup at @path on the default hierarchy, increment its
 * reference count and return it. Returns pointer to the found cgroup on
 * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
 * if @path points to a non-directory.
 */
struct cgroup *cgroup_get_from_path(const char *path)
{
	struct kernfs_node *kn;
	struct cgroup *cgrp;

	mutex_lock(&cgroup_mutex);

	kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
	if (kn) {
		if (kernfs_type(kn) == KERNFS_DIR) {
			cgrp = kn->priv;
			cgroup_get_live(cgrp);
		} else {
			cgrp = ERR_PTR(-ENOTDIR);
		}
		kernfs_put(kn);
	} else {
		cgrp = ERR_PTR(-ENOENT);
	}

	mutex_unlock(&cgroup_mutex);
	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);

/**
 * cgroup_get_from_fd - get a cgroup pointer from a fd
 * @fd: fd obtained by open(cgroup2_dir)
 *
 * Find the cgroup from a fd which should be obtained
 * by opening a cgroup directory. Returns a pointer to the
 * cgroup on success. ERR_PTR is returned if the cgroup
 * cannot be found.
 */
struct cgroup *cgroup_get_from_fd(int fd)
{
	struct cgroup *cgrp;
	struct file *f;

	f = fget_raw(fd);
	if (!f)
		return ERR_PTR(-EBADF);

	cgrp = cgroup_get_from_file(f);
	fput(f);
	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);

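/*
 * Illustrative in-kernel usage sketch (hypothetical caller; the path is
 * an assumption): both helpers above return a referenced cgroup that
 * must be released with cgroup_put().
 *
 *	struct cgroup *cgrp;
 *
 *	cgrp = cgroup_get_from_path("/mygrp");	// relative to the v2 root
 *	if (IS_ERR(cgrp))
 *		return PTR_ERR(cgrp);
 *	// ... use cgrp ...
 *	cgroup_put(cgrp);
 */
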
static u64 power_of_ten(int power)
{
	u64 v = 1;
	while (power--)
		v *= 10;
	return v;
}

/**
 * cgroup_parse_float - parse a floating point number
 * @input: input string
 * @dec_shift: number of decimal digits to shift
 * @v: output
 *
 * Parse a decimal floating point number in @input and store the result in
 * @v with decimal point right shifted @dec_shift times. For example, if
 * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12346
 * (12345.6 rounded to the closest integer). Returns 0 on success,
 * -errno otherwise.
 *
 * There's nothing cgroup specific about this function except that it's
 * currently the only user.
 */
int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
{
	s64 whole, frac = 0;
	int fstart = 0, fend = 0, flen;

	if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
		return -EINVAL;
	if (frac < 0)
		return -EINVAL;

	flen = fend > fstart ? fend - fstart : 0;
	if (flen < dec_shift)
		frac *= power_of_ten(dec_shift - flen);
	else
		frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));

	*v = whole * power_of_ten(dec_shift) + frac;
	return 0;
}

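/*
 * Worked examples (values follow from the rounding/padding rules above;
 * assumes a local "s64 v"):
 *
 *	cgroup_parse_float("12.3456", 3, &v);	// *v == 12346 (rounded)
 *	cgroup_parse_float("1.5", 3, &v);	// *v == 1500  (padded)
 *	cgroup_parse_float("2.0015", 2, &v);	// *v == 200   (rounded)
 */
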
/*
 * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)

DEFINE_SPINLOCK(cgroup_sk_update_lock);
static bool cgroup_sk_alloc_disabled __read_mostly;

void cgroup_sk_alloc_disable(void)
{
	if (cgroup_sk_alloc_disabled)
		return;
	pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
	cgroup_sk_alloc_disabled = true;
}

#else

#define cgroup_sk_alloc_disabled	false

#endif

void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
	if (cgroup_sk_alloc_disabled) {
		skcd->no_refcnt = 1;
		return;
	}

	/* Don't associate the sock with an unrelated interrupted task's cgroup. */
	if (in_interrupt())
		return;

	rcu_read_lock();

	while (true) {
		struct css_set *cset;

		cset = task_css_set(current);
		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
			skcd->val = (unsigned long)cset->dfl_cgrp;
			cgroup_bpf_get(cset->dfl_cgrp);
			break;
		}
		cpu_relax();
	}

	rcu_read_unlock();
}

void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
	if (skcd->val) {
		if (skcd->no_refcnt)
			return;
		/*
		 * We might be cloning a socket which is left in an empty
		 * cgroup and the cgroup might have already been rmdir'd.
		 * Don't use cgroup_get_live().
		 */
		cgroup_get(sock_cgroup_ptr(skcd));
		cgroup_bpf_get(sock_cgroup_ptr(skcd));
	}
}

void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
	struct cgroup *cgrp = sock_cgroup_ptr(skcd);

	if (skcd->no_refcnt)
		return;
	cgroup_bpf_put(cgrp);
	cgroup_put(cgrp);
}

#endif	/* CONFIG_SOCK_CGROUP_DATA */

#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_attach(struct cgroup *cgrp,
		      struct bpf_prog *prog, struct bpf_prog *replace_prog,
		      struct bpf_cgroup_link *link,
		      enum bpf_attach_type type,
		      u32 flags)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
		      enum bpf_attach_type type)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
		     union bpf_attr __user *uattr)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = __cgroup_bpf_query(cgrp, attr, uattr);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
#endif /* CONFIG_CGROUP_BPF */

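/*
 * Illustrative userspace path into the wrappers above (a sketch; assumes
 * libbpf, an already-loaded program fd, and a hypothetical cgroup path):
 *
 *	int cgfd = open("/sys/fs/cgroup/mygrp", O_RDONLY);
 *	bpf_prog_attach(prog_fd, cgfd, BPF_CGROUP_INET_INGRESS, 0);
 */
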
#ifdef CONFIG_SYSFS
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
				      ssize_t size, const char *prefix)
{
	struct cftype *cft;
	ssize_t ret = 0;

	for (cft = files; cft && cft->name[0] != '\0'; cft++) {
		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
			continue;

		if (prefix)
			ret += snprintf(buf + ret, size - ret, "%s.", prefix);

		ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);

		if (WARN_ON(ret >= size))
			break;
	}

	return ret;
}

static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
			     char *buf)
{
	struct cgroup_subsys *ss;
	int ssid;
	ssize_t ret = 0;

	ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
				     NULL);

	for_each_subsys(ss, ssid)
		ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
					      PAGE_SIZE - ret,
					      cgroup_subsys_name[ssid]);

	return ret;
}
static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);

static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
			     char *buf)
{
	return snprintf(buf, PAGE_SIZE,
			"nsdelegate\n"
			"memory_localevents\n"
			"memory_recursiveprot\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);

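/*
 * Illustrative output of the attributes above (the features list matches
 * features_show() exactly; the delegate list depends on the enabled
 * controllers, so the entries shown are examples):
 *
 *	$ cat /sys/kernel/cgroup/features
 *	nsdelegate
 *	memory_localevents
 *	memory_recursiveprot
 *
 *	$ cat /sys/kernel/cgroup/delegate
 *	cgroup.procs
 *	cgroup.threads
 *	cgroup.subtree_control
 *	...
 */
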
static struct attribute *cgroup_sysfs_attrs[] = {
	&cgroup_delegate_attr.attr,
	&cgroup_features_attr.attr,
	NULL,
};

static const struct attribute_group cgroup_sysfs_attr_group = {
	.attrs = cgroup_sysfs_attrs,
	.name = "cgroup",
};

static int __init cgroup_sysfs_init(void)
{
	return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
}
subsys_initcall(cgroup_sysfs_init);

#endif /* CONFIG_SYSFS */