Blame - kernel/cpuset.c - SHIFTPHONES/android_kernel_shift_sdm845

blob: 00e8f2575512c6159c9a2ad76718f96f308042f5 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* kernel/cpuset.c
				3	*
				4	* Processor and Memory placement constraints for sets of tasks.
				5	*
				6	* Copyright (C) 2003 BULL SA.
				7	* Copyright (C) 2004 Silicon Graphics, Inc.
				8	*
				9	* Portions derived from Patrick Mochel's sysfs code.
				10	* sysfs is Copyright (c) 2001-3 Patrick Mochel
				11	* Portions Copyright (c) 2004 Silicon Graphics, Inc.
				12	*
				13	* 2003-10-10 Written by Simon Derr <simon.derr@bull.net>
				14	* 2003-10-22 Updates by Stephen Hemminger.
				15	* 2004 May-July Rework by Paul Jackson <pj@sgi.com>
				16	*
				17	* This file is subject to the terms and conditions of the GNU General Public
				18	* License. See the file COPYING in the main directory of the Linux
				19	* distribution for more details.
				20	*/
				21
				22	#include <linux/config.h>
				23	#include <linux/cpu.h>
				24	#include <linux/cpumask.h>
				25	#include <linux/cpuset.h>
				26	#include <linux/err.h>
				27	#include <linux/errno.h>
				28	#include <linux/file.h>
				29	#include <linux/fs.h>
				30	#include <linux/init.h>
				31	#include <linux/interrupt.h>
				32	#include <linux/kernel.h>
				33	#include <linux/kmod.h>
				34	#include <linux/list.h>
				35	#include <linux/mm.h>
				36	#include <linux/module.h>
				37	#include <linux/mount.h>
				38	#include <linux/namei.h>
				39	#include <linux/pagemap.h>
				40	#include <linux/proc_fs.h>
				41	#include <linux/sched.h>
				42	#include <linux/seq_file.h>
				43	#include <linux/slab.h>
				44	#include <linux/smp_lock.h>
				45	#include <linux/spinlock.h>
				46	#include <linux/stat.h>
				47	#include <linux/string.h>
				48	#include <linux/time.h>
				49	#include <linux/backing-dev.h>
				50	#include <linux/sort.h>
				51
				52	#include <asm/uaccess.h>
				53	#include <asm/atomic.h>
				54	#include <asm/semaphore.h>
				55
				/* Magic number stored in sb->s_magic by cpuset_fill_super(). */
				#define CPUSET_SUPER_MAGIC	0x27e0eb
				57
				58	struct cpuset {
				59	unsigned long flags; /* "unsigned long" so bitops work */
				60	cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
				61	nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
				62
				63	atomic_t count; /* count tasks using this cpuset */
				64
				65	/*
				66	* We link our 'sibling' struct into our parents 'children'.
				67	* Our children link their 'sibling' into our 'children'.
				68	*/
				69	struct list_head sibling; /* my parents children */
				70	struct list_head children; /* my children */
				71
				72	struct cpuset parent; / my parent */
				73	struct dentry dentry; / cpuset fs entry */
				74
				75	/*
				76	* Copy of global cpuset_mems_generation as of the most
				77	* recent time this cpuset changed its mems_allowed.
				78	*/
				79	int mems_generation;
				80	};
				81
				82	/* bits in struct cpuset flags field */
				/* Bit numbers (not masks) used with set_bit/clear_bit/test_bit on cpuset.flags. */
				typedef enum {
					CS_CPU_EXCLUSIVE = 0,
					CS_MEM_EXCLUSIVE = 1,
					CS_REMOVED = 2,
					CS_NOTIFY_ON_RELEASE = 3
				} cpuset_flagbits_t;
				89
				90	/* convenient tests for these bits */
				91	static inline int is_cpu_exclusive(const struct cpuset *cs)
				92	{
				93	return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
				94	}
				95
				96	static inline int is_mem_exclusive(const struct cpuset *cs)
				97	{
				98	return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
				99	}
				100
				101	static inline int is_removed(const struct cpuset *cs)
				102	{
				103	return !!test_bit(CS_REMOVED, &cs->flags);
				104	}
				105
				106	static inline int notify_on_release(const struct cpuset *cs)
				107	{
				108	return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
				109	}
				110
				111	/*
				112	* Increment this atomic integer every time any cpuset changes its
				113	* mems_allowed value. Users of cpusets can track this generation
				114	* number, and avoid having to lock and reload mems_allowed unless
				115	* the cpuset they're using changes generation.
				116	*
				117	* A single, global generation is needed because attach_task() could
				118	* reattach a task to a different cpuset, which must not have its
				119	* generation numbers aliased with those of that task's previous cpuset.
				120	*
				121	* Generations are needed for mems_allowed because one task cannot
				122	* modify another's memory placement. So we must enable every task,
				123	* on every visit to __alloc_pages(), to efficiently check whether
				124	* its current->cpuset->mems_allowed has changed, requiring an update
				125	* of its current->mems_allowed.
				126	*/
				127	static atomic_t cpuset_mems_generation = ATOMIC_INIT(1);
				128
				129	static struct cpuset top_cpuset = {
				130	.flags = ((1 << CS_CPU_EXCLUSIVE) \| (1 << CS_MEM_EXCLUSIVE)),
				131	.cpus_allowed = CPU_MASK_ALL,
				132	.mems_allowed = NODE_MASK_ALL,
				133	.count = ATOMIC_INIT(0),
				134	.sibling = LIST_HEAD_INIT(top_cpuset.sibling),
				135	.children = LIST_HEAD_INIT(top_cpuset.children),
				136	.parent = NULL,
				137	.dentry = NULL,
				138	.mems_generation = 0,
				139	};
				140
				141	static struct vfsmount *cpuset_mount;
				142	static struct super_block *cpuset_sb = NULL;
				143
				144	/*
				145	* cpuset_sem should be held by anyone who is depending on the children
				146	* or sibling lists of any cpuset, or performing non-atomic operations
				147	* on the flags or *_allowed values of a cpuset, such as raising the
				148	* CS_REMOVED flag bit iff it is not already raised, or reading and
				149	* conditionally modifying the *_allowed values. One kernel global
				150	* cpuset semaphore should be sufficient - these things don't change
				151	* that much.
				152	*
				153	* The code that modifies cpusets holds cpuset_sem across the entire
				154	* operation, from cpuset_common_file_write() down, single threading
				155	* all cpuset modifications (except for counter manipulations from
				156	* fork and exit) across the system. This presumes that cpuset
				157	* modifications are rare - better kept simple and safe, even if slow.
				158	*
				159	* The code that reads cpusets, such as in cpuset_common_file_read()
				160	* and below, only holds cpuset_sem across small pieces of code, such
				161	* as when reading out possibly multi-word cpumasks and nodemasks, as
				162	* the risks are less, and the desire for performance a little greater.
				163	* The proc_cpuset_show() routine needs to hold cpuset_sem to ensure
				164	* that no cs->dentry is NULL, as it walks up the cpuset tree to root.
				165	*
				166	* The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
				167	* (usually) grab cpuset_sem. These are the two most performance
				168	* critical pieces of code here. The exception occurs on exit(),
Paul Jackson	2efe86b	2005-05-27 02:02:43 -0700	[diff] [blame^]	169	* when a task in a notify_on_release cpuset exits. Then cpuset_sem
				170	* is taken, and if the cpuset count is zero, a usermode call made
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	171	* to /sbin/cpuset_release_agent with the name of the cpuset (path
				172	* relative to the root of cpuset file system) as the argument.
				173	*
				174	* A cpuset can only be deleted if both its 'count' of using tasks is
				175	* zero, and its list of 'children' cpusets is empty. Since all tasks
				176	* in the system use _some_ cpuset, and since there is always at least
				177	* one task in the system (init, pid == 1), therefore, top_cpuset
				178	* always has either children cpusets and/or using tasks. So no need
				179	* for any special hack to ensure that top_cpuset cannot be deleted.
				180	*/
				181
				182	static DECLARE_MUTEX(cpuset_sem);
				183
				184	/*
				185	* A couple of forward declarations required, due to cyclic reference loop:
				186	* cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file
				187	* -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
				188	*/
				189
				190	static int cpuset_mkdir(struct inode dir, struct dentry dentry, int mode);
				191	static int cpuset_rmdir(struct inode unused_dir, struct dentry dentry);
				192
				193	static struct backing_dev_info cpuset_backing_dev_info = {
				194	.ra_pages = 0, /* No readahead */
				195	.capabilities = BDI_CAP_NO_ACCT_DIRTY \| BDI_CAP_NO_WRITEBACK,
				196	};
				197
				198	static struct inode *cpuset_new_inode(mode_t mode)
				199	{
				200	struct inode *inode = new_inode(cpuset_sb);
				201
				202	if (inode) {
				203	inode->i_mode = mode;
				204	inode->i_uid = current->fsuid;
				205	inode->i_gid = current->fsgid;
				206	inode->i_blksize = PAGE_CACHE_SIZE;
				207	inode->i_blocks = 0;
				208	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
				209	inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
				210	}
				211	return inode;
				212	}
				213
				214	static void cpuset_diput(struct dentry dentry, struct inode inode)
				215	{
				216	/* is dentry a directory ? if so, kfree() associated cpuset */
				217	if (S_ISDIR(inode->i_mode)) {
				218	struct cpuset *cs = dentry->d_fsdata;
				219	BUG_ON(!(is_removed(cs)));
				220	kfree(cs);
				221	}
				222	iput(inode);
				223	}
				224
				225	static struct dentry_operations cpuset_dops = {
				226	.d_iput = cpuset_diput,
				227	};
				228
				229	static struct dentry cpuset_get_dentry(struct dentry parent, const char *name)
				230	{
				231	struct qstr qstr;
				232	struct dentry *d;
				233
				234	qstr.name = name;
				235	qstr.len = strlen(name);
				236	qstr.hash = full_name_hash(name, qstr.len);
				237	d = lookup_hash(&qstr, parent);
				238	if (!IS_ERR(d))
				239	d->d_op = &cpuset_dops;
				240	return d;
				241	}
				242
				243	static void remove_dir(struct dentry *d)
				244	{
				245	struct dentry *parent = dget(d->d_parent);
				246
				247	d_delete(d);
				248	simple_rmdir(parent->d_inode, d);
				249	dput(parent);
				250	}
				251
				252	/*
				253	* NOTE : the dentry must have been dget()'ed
				254	*/
				255	static void cpuset_d_remove_dir(struct dentry *dentry)
				256	{
				257	struct list_head *node;
				258
				259	spin_lock(&dcache_lock);
				260	node = dentry->d_subdirs.next;
				261	while (node != &dentry->d_subdirs) {
				262	struct dentry *d = list_entry(node, struct dentry, d_child);
				263	list_del_init(node);
				264	if (d->d_inode) {
				265	d = dget_locked(d);
				266	spin_unlock(&dcache_lock);
				267	d_delete(d);
				268	simple_unlink(dentry->d_inode, d);
				269	dput(d);
				270	spin_lock(&dcache_lock);
				271	}
				272	node = dentry->d_subdirs.next;
				273	}
				274	list_del_init(&dentry->d_child);
				275	spin_unlock(&dcache_lock);
				276	remove_dir(dentry);
				277	}
				278
				279	static struct super_operations cpuset_ops = {
				280	.statfs = simple_statfs,
				281	.drop_inode = generic_delete_inode,
				282	};
				283
				284	static int cpuset_fill_super(struct super_block sb, void unused_data,
				285	int unused_silent)
				286	{
				287	struct inode *inode;
				288	struct dentry *root;
				289
				290	sb->s_blocksize = PAGE_CACHE_SIZE;
				291	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
				292	sb->s_magic = CPUSET_SUPER_MAGIC;
				293	sb->s_op = &cpuset_ops;
				294	cpuset_sb = sb;
				295
				296	inode = cpuset_new_inode(S_IFDIR \| S_IRUGO \| S_IXUGO \| S_IWUSR);
				297	if (inode) {
				298	inode->i_op = &simple_dir_inode_operations;
				299	inode->i_fop = &simple_dir_operations;
				300	/* directories start off with i_nlink == 2 (for "." entry) */
				301	inode->i_nlink++;
				302	} else {
				303	return -ENOMEM;
				304	}
				305
				306	root = d_alloc_root(inode);
				307	if (!root) {
				308	iput(inode);
				309	return -ENOMEM;
				310	}
				311	sb->s_root = root;
				312	return 0;
				313	}
				314
				315	static struct super_block cpuset_get_sb(struct file_system_type fs_type,
				316	int flags, const char *unused_dev_name,
				317	void *data)
				318	{
				319	return get_sb_single(fs_type, flags, data, cpuset_fill_super);
				320	}
				321
				322	static struct file_system_type cpuset_fs_type = {
				323	.name = "cpuset",
				324	.get_sb = cpuset_get_sb,
				325	.kill_sb = kill_litter_super,
				326	};
				327
				328	/* struct cftype:
				329	*
				330	* The files in the cpuset filesystem mostly have a very simple read/write
				331	* handling, some common function will take care of it. Nevertheless some cases
				332	* (read tasks) are special and therefore I define this structure for every
				333	* kind of file.
				334	*
				335	*
				336	* When reading/writing to a file:
				337	* - the cpuset to use in file->f_dentry->d_parent->d_fsdata
				338	* - the 'cftype' of the file is file->f_dentry->d_fsdata
				339	*/
				340
				341	struct cftype {
				342	char *name;
				343	int private;
				344	int (open) (struct inode inode, struct file *file);
				345	ssize_t (read) (struct file file, char __user *buf, size_t nbytes,
				346	loff_t *ppos);
				347	int (write) (struct file file, const char __user *buf, size_t nbytes,
				348	loff_t *ppos);
				349	int (release) (struct inode inode, struct file *file);
				350	};
				351
				352	static inline struct cpuset __d_cs(struct dentry dentry)
				353	{
				354	return dentry->d_fsdata;
				355	}
				356
				357	static inline struct cftype __d_cft(struct dentry dentry)
				358	{
				359	return dentry->d_fsdata;
				360	}
				361
				362	/*
				363	* Call with cpuset_sem held. Writes path of cpuset into buf.
				364	* Returns 0 on success, -errno on error.
				365	*/
				366
				367	static int cpuset_path(const struct cpuset cs, char buf, int buflen)
				368	{
				369	char *start;
				370
				371	start = buf + buflen;
				372
				373	*--start = '\0';
				374	for (;;) {
				375	int len = cs->dentry->d_name.len;
				376	if ((start -= len) < buf)
				377	return -ENAMETOOLONG;
				378	memcpy(start, cs->dentry->d_name.name, len);
				379	cs = cs->parent;
				380	if (!cs)
				381	break;
				382	if (!cs->parent)
				383	continue;
				384	if (--start < buf)
				385	return -ENAMETOOLONG;
				386	*start = '/';
				387	}
				388	memmove(buf, start, buf + buflen - start);
				389	return 0;
				390	}
				391
				392	/*
				393	* Notify userspace when a cpuset is released, by running
				394	* /sbin/cpuset_release_agent with the name of the cpuset (path
				395	* relative to the root of cpuset file system) as the argument.
				396	*
				397	* Most likely, this user command will try to rmdir this cpuset.
				398	*
				399	* This races with the possibility that some other task will be
				400	* attached to this cpuset before it is removed, or that some other
				401	* user task will 'mkdir' a child cpuset of this cpuset. That's ok.
				402	* The presumed 'rmdir' will fail quietly if this cpuset is no longer
				403	* unused, and this cpuset will be reprieved from its death sentence,
				404	* to continue to serve a useful existence. Next time it's released,
				405	* we will get notified again, if it still has 'notify_on_release' set.
				406	*
				407	* Note final arg to call_usermodehelper() is 0 - that means
				408	* don't wait. Since we are holding the global cpuset_sem here,
				409	* and we are asking another thread (started from keventd) to rmdir a
				410	* cpuset, we can't wait - or we'd deadlock with the removing thread
				411	* on cpuset_sem.
				412	*/
				413
				414	static int cpuset_release_agent(char *cpuset_str)
				415	{
				416	char argv[3], envp[3];
				417	int i;
				418
				419	i = 0;
				420	argv[i++] = "/sbin/cpuset_release_agent";
				421	argv[i++] = cpuset_str;
				422	argv[i] = NULL;
				423
				424	i = 0;
				425	/* minimal command environment */
				426	envp[i++] = "HOME=/";
				427	envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
				428	envp[i] = NULL;
				429
				430	return call_usermodehelper(argv[0], argv, envp, 0);
				431	}
				432
				433	/*
				434	* Either cs->count of using tasks transitioned to zero, or the
				435	* cs->children list of child cpusets just became empty. If this
				436	* cs is notify_on_release() and now both the user count is zero and
				437	* the list of children is empty, send notice to user land.
				438	*/
				439
				440	static void check_for_release(struct cpuset *cs)
				441	{
				442	if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
				443	list_empty(&cs->children)) {
				444	char *buf;
				445
				446	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
				447	if (!buf)
				448	return;
				449	if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
				450	goto out;
				451	cpuset_release_agent(buf);
				452	out:
				453	kfree(buf);
				454	}
				455	}
				456
				457	/*
				458	* Return in *pmask the portion of a cpusets's cpus_allowed that
				459	* are online. If none are online, walk up the cpuset hierarchy
				460	* until we find one that does have some online cpus. If we get
				461	* all the way to the top and still haven't found any online cpus,
				462	* return cpu_online_map. Or if passed a NULL cs from an exit'ing
				463	* task, return cpu_online_map.
				464	*
				465	* One way or another, we guarantee to return some non-empty subset
				466	* of cpu_online_map.
				467	*
				468	* Call with cpuset_sem held.
				469	*/
				470
				471	static void guarantee_online_cpus(const struct cpuset cs, cpumask_t pmask)
				472	{
				473	while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
				474	cs = cs->parent;
				475	if (cs)
				476	cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
				477	else
				478	*pmask = cpu_online_map;
				479	BUG_ON(!cpus_intersects(*pmask, cpu_online_map));
				480	}
				481
				482	/*
				483	* Return in *pmask the portion of a cpusets's mems_allowed that
				484	* are online. If none are online, walk up the cpuset hierarchy
				485	* until we find one that does have some online mems. If we get
				486	* all the way to the top and still haven't found any online mems,
				487	* return node_online_map.
				488	*
				489	* One way or another, we guarantee to return some non-empty subset
				490	* of node_online_map.
				491	*
				492	* Call with cpuset_sem held.
				493	*/
				494
				495	static void guarantee_online_mems(const struct cpuset cs, nodemask_t pmask)
				496	{
				497	while (cs && !nodes_intersects(cs->mems_allowed, node_online_map))
				498	cs = cs->parent;
				499	if (cs)
				500	nodes_and(*pmask, cs->mems_allowed, node_online_map);
				501	else
				502	*pmask = node_online_map;
				503	BUG_ON(!nodes_intersects(*pmask, node_online_map));
				504	}
				505
				506	/*
				507	* Refresh current tasks mems_allowed and mems_generation from
				508	* current tasks cpuset. Call with cpuset_sem held.
				509	*
				510	* Be sure to call refresh_mems() on any cpuset operation which
				511	* (1) holds cpuset_sem, and (2) might possibly alloc memory.
				512	* Call after obtaining cpuset_sem lock, before any possible
				513	* allocation. Otherwise one risks trying to allocate memory
				514	* while the task cpuset_mems_generation is not the same as
				515	* the mems_generation in its cpuset, which would deadlock on
				516	* cpuset_sem in cpuset_update_current_mems_allowed().
				517	*
				518	* Since we hold cpuset_sem, once refresh_mems() is called, the
				519	* test (current->cpuset_mems_generation != cs->mems_generation)
				520	* in cpuset_update_current_mems_allowed() will remain false,
				521	* until we drop cpuset_sem. Anyone else who would change our
				522	* cpusets mems_generation needs to lock cpuset_sem first.
				523	*/
				524
				525	static void refresh_mems(void)
				526	{
				527	struct cpuset *cs = current->cpuset;
				528
				529	if (current->cpuset_mems_generation != cs->mems_generation) {
				530	guarantee_online_mems(cs, &current->mems_allowed);
				531	current->cpuset_mems_generation = cs->mems_generation;
				532	}
				533	}
				534
				535	/*
				536	* is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
				537	*
				538	* One cpuset is a subset of another if all its allowed CPUs and
				539	* Memory Nodes are a subset of the other, and its exclusive flags
				540	* are only set if the other's are set.
				541	*/
				542
				543	static int is_cpuset_subset(const struct cpuset p, const struct cpuset q)
				544	{
				545	return cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
				546	nodes_subset(p->mems_allowed, q->mems_allowed) &&
				547	is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
				548	is_mem_exclusive(p) <= is_mem_exclusive(q);
				549	}
				550
				551	/*
				552	* validate_change() - Used to validate that any proposed cpuset change
				553	* follows the structural rules for cpusets.
				554	*
				555	* If we replaced the flag and mask values of the current cpuset
				556	* (cur) with those values in the trial cpuset (trial), would
				557	* our various subset and exclusive rules still be valid? Presumes
				558	* cpuset_sem held.
				559	*
				560	* 'cur' is the address of an actual, in-use cpuset. Operations
				561	* such as list traversal that depend on the actual address of the
				562	* cpuset in the list must use cur below, not trial.
				563	*
				564	* 'trial' is the address of bulk structure copy of cur, with
				565	* perhaps one or more of the fields cpus_allowed, mems_allowed,
				566	* or flags changed to new, trial values.
				567	*
				568	* Return 0 if valid, -errno if not.
				569	*/
				570
				571	static int validate_change(const struct cpuset cur, const struct cpuset trial)
				572	{
				573	struct cpuset c, par;
				574
				575	/* Each of our child cpusets must be a subset of us */
				576	list_for_each_entry(c, &cur->children, sibling) {
				577	if (!is_cpuset_subset(c, trial))
				578	return -EBUSY;
				579	}
				580
				581	/* Remaining checks don't apply to root cpuset */
				582	if ((par = cur->parent) == NULL)
				583	return 0;
				584
				585	/* We must be a subset of our parent cpuset */
				586	if (!is_cpuset_subset(trial, par))
				587	return -EACCES;
				588
				589	/* If either I or some sibling (!= me) is exclusive, we can't overlap */
				590	list_for_each_entry(c, &par->children, sibling) {
				591	if ((is_cpu_exclusive(trial) \|\| is_cpu_exclusive(c)) &&
				592	c != cur &&
				593	cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
				594	return -EINVAL;
				595	if ((is_mem_exclusive(trial) \|\| is_mem_exclusive(c)) &&
				596	c != cur &&
				597	nodes_intersects(trial->mems_allowed, c->mems_allowed))
				598	return -EINVAL;
				599	}
				600
				601	return 0;
				602	}
				603
				604	static int update_cpumask(struct cpuset cs, char buf)
				605	{
				606	struct cpuset trialcs;
				607	int retval;
				608
				609	trialcs = *cs;
				610	retval = cpulist_parse(buf, trialcs.cpus_allowed);
				611	if (retval < 0)
				612	return retval;
				613	cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
				614	if (cpus_empty(trialcs.cpus_allowed))
				615	return -ENOSPC;
				616	retval = validate_change(cs, &trialcs);
				617	if (retval == 0)
				618	cs->cpus_allowed = trialcs.cpus_allowed;
				619	return retval;
				620	}
				621
				622	static int update_nodemask(struct cpuset cs, char buf)
				623	{
				624	struct cpuset trialcs;
				625	int retval;
				626
				627	trialcs = *cs;
				628	retval = nodelist_parse(buf, trialcs.mems_allowed);
				629	if (retval < 0)
				630	return retval;
				631	nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
				632	if (nodes_empty(trialcs.mems_allowed))
				633	return -ENOSPC;
				634	retval = validate_change(cs, &trialcs);
				635	if (retval == 0) {
				636	cs->mems_allowed = trialcs.mems_allowed;
				637	atomic_inc(&cpuset_mems_generation);
				638	cs->mems_generation = atomic_read(&cpuset_mems_generation);
				639	}
				640	return retval;
				641	}
				642
				643	/*
				644	* update_flag - read a 0 or a 1 in a file and update associated flag
				645	* bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
				646	* CS_NOTIFY_ON_RELEASE)
				647	* cs: the cpuset to update
				648	* buf: the buffer where we read the 0 or 1
				649	*/
				650
				651	static int update_flag(cpuset_flagbits_t bit, struct cpuset cs, char buf)
				652	{
				653	int turning_on;
				654	struct cpuset trialcs;
				655	int err;
				656
				657	turning_on = (simple_strtoul(buf, NULL, 10) != 0);
				658
				659	trialcs = *cs;
				660	if (turning_on)
				661	set_bit(bit, &trialcs.flags);
				662	else
				663	clear_bit(bit, &trialcs.flags);
				664
				665	err = validate_change(cs, &trialcs);
				666	if (err == 0) {
				667	if (turning_on)
				668	set_bit(bit, &cs->flags);
				669	else
				670	clear_bit(bit, &cs->flags);
				671	}
				672	return err;
				673	}
				674
				675	static int attach_task(struct cpuset cs, char buf)
				676	{
				677	pid_t pid;
				678	struct task_struct *tsk;
				679	struct cpuset *oldcs;
				680	cpumask_t cpus;
				681
				682	if (sscanf(buf, "%d", &pid) != 1)
				683	return -EIO;
				684	if (cpus_empty(cs->cpus_allowed) \|\| nodes_empty(cs->mems_allowed))
				685	return -ENOSPC;
				686
				687	if (pid) {
				688	read_lock(&tasklist_lock);
				689
				690	tsk = find_task_by_pid(pid);
				691	if (!tsk) {
				692	read_unlock(&tasklist_lock);
				693	return -ESRCH;
				694	}
				695
				696	get_task_struct(tsk);
				697	read_unlock(&tasklist_lock);
				698
				699	if ((current->euid) && (current->euid != tsk->uid)
				700	&& (current->euid != tsk->suid)) {
				701	put_task_struct(tsk);
				702	return -EACCES;
				703	}
				704	} else {
				705	tsk = current;
				706	get_task_struct(tsk);
				707	}
				708
				709	task_lock(tsk);
				710	oldcs = tsk->cpuset;
				711	if (!oldcs) {
				712	task_unlock(tsk);
				713	put_task_struct(tsk);
				714	return -ESRCH;
				715	}
				716	atomic_inc(&cs->count);
				717	tsk->cpuset = cs;
				718	task_unlock(tsk);
				719
				720	guarantee_online_cpus(cs, &cpus);
				721	set_cpus_allowed(tsk, cpus);
				722
				723	put_task_struct(tsk);
				724	if (atomic_dec_and_test(&oldcs->count))
				725	check_for_release(oldcs);
				726	return 0;
				727	}
				728
				729	/* The various types of files and directories in a cpuset file system */
				730
				/* Stored in cftype.private to select the common read/write handling. */
				typedef enum {
					FILE_ROOT = 0,
					FILE_DIR = 1,
					FILE_CPULIST = 2,
					FILE_MEMLIST = 3,
					FILE_CPU_EXCLUSIVE = 4,
					FILE_MEM_EXCLUSIVE = 5,
					FILE_NOTIFY_ON_RELEASE = 6,
					FILE_TASKLIST = 7,
				} cpuset_filetype_t;
				741
				742	static ssize_t cpuset_common_file_write(struct file file, const char __user userbuf,
				743	size_t nbytes, loff_t *unused_ppos)
				744	{
				745	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
				746	struct cftype *cft = __d_cft(file->f_dentry);
				747	cpuset_filetype_t type = cft->private;
				748	char *buffer;
				749	int retval = 0;
				750
				751	/* Crude upper limit on largest legitimate cpulist user might write. */
				752	if (nbytes > 100 + 6 * NR_CPUS)
				753	return -E2BIG;
				754
				755	/* +1 for nul-terminator */
				756	if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0)
				757	return -ENOMEM;
				758
				759	if (copy_from_user(buffer, userbuf, nbytes)) {
				760	retval = -EFAULT;
				761	goto out1;
				762	}
				763	buffer[nbytes] = 0; /* nul-terminate */
				764
				765	down(&cpuset_sem);
				766
				767	if (is_removed(cs)) {
				768	retval = -ENODEV;
				769	goto out2;
				770	}
				771
				772	switch (type) {
				773	case FILE_CPULIST:
				774	retval = update_cpumask(cs, buffer);
				775	break;
				776	case FILE_MEMLIST:
				777	retval = update_nodemask(cs, buffer);
				778	break;
				779	case FILE_CPU_EXCLUSIVE:
				780	retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer);
				781	break;
				782	case FILE_MEM_EXCLUSIVE:
				783	retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
				784	break;
				785	case FILE_NOTIFY_ON_RELEASE:
				786	retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
				787	break;
				788	case FILE_TASKLIST:
				789	retval = attach_task(cs, buffer);
				790	break;
				791	default:
				792	retval = -EINVAL;
				793	goto out2;
				794	}
				795
				796	if (retval == 0)
				797	retval = nbytes;
				798	out2:
				799	up(&cpuset_sem);
				800	out1:
				801	kfree(buffer);
				802	return retval;
				803	}
				804
				805	static ssize_t cpuset_file_write(struct file file, const char __user buf,
				806	size_t nbytes, loff_t *ppos)
				807	{
				808	ssize_t retval = 0;
				809	struct cftype *cft = __d_cft(file->f_dentry);
				810	if (!cft)
				811	return -ENODEV;
				812
				813	/* special function ? */
				814	if (cft->write)
				815	retval = cft->write(file, buf, nbytes, ppos);
				816	else
				817	retval = cpuset_common_file_write(file, buf, nbytes, ppos);
				818
				819	return retval;
				820	}
				821
				822	/*
				823	* These ascii lists should be read in a single call, by using a user
				824	* buffer large enough to hold the entire map. If read in smaller
				825	* chunks, there is no guarantee of atomicity. Since the display format
				826	* used, list of ranges of sequential numbers, is variable length,
				827	* and since these maps can change value dynamically, one could read
				828	* gibberish by doing partial reads while a list was changing.
				829	* A single large read to a buffer that crosses a page boundary is
				830	* ok, because the result being copied to user land is not recomputed
				831	* across a page fault.
				832	*/
				833
				834	static int cpuset_sprintf_cpulist(char page, struct cpuset cs)
				835	{
				836	cpumask_t mask;
				837
				838	down(&cpuset_sem);
				839	mask = cs->cpus_allowed;
				840	up(&cpuset_sem);
				841
				842	return cpulist_scnprintf(page, PAGE_SIZE, mask);
				843	}
				844
				845	static int cpuset_sprintf_memlist(char page, struct cpuset cs)
				846	{
				847	nodemask_t mask;
				848
				849	down(&cpuset_sem);
				850	mask = cs->mems_allowed;
				851	up(&cpuset_sem);
				852
				853	return nodelist_scnprintf(page, PAGE_SIZE, mask);
				854	}
				855
				856	static ssize_t cpuset_common_file_read(struct file file, char __user buf,
				857	size_t nbytes, loff_t *ppos)
				858	{
				859	struct cftype *cft = __d_cft(file->f_dentry);
				860	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
				861	cpuset_filetype_t type = cft->private;
				862	char *page;
				863	ssize_t retval = 0;
				864	char *s;
				865	char *start;
				866	size_t n;
				867
				868	if (!(page = (char *)__get_free_page(GFP_KERNEL)))
				869	return -ENOMEM;
				870
				871	s = page;
				872
				873	switch (type) {
				874	case FILE_CPULIST:
				875	s += cpuset_sprintf_cpulist(s, cs);
				876	break;
				877	case FILE_MEMLIST:
				878	s += cpuset_sprintf_memlist(s, cs);
				879	break;
				880	case FILE_CPU_EXCLUSIVE:
				881	*s++ = is_cpu_exclusive(cs) ? '1' : '0';
				882	break;
				883	case FILE_MEM_EXCLUSIVE:
				884	*s++ = is_mem_exclusive(cs) ? '1' : '0';
				885	break;
				886	case FILE_NOTIFY_ON_RELEASE:
				887	*s++ = notify_on_release(cs) ? '1' : '0';
				888	break;
				889	default:
				890	retval = -EINVAL;
				891	goto out;
				892	}
				893	*s++ = '\n';
				894	*s = '\0';
				895
				896	start = page + *ppos;
				897	n = s - start;
				898	retval = n - copy_to_user(buf, start, min(n, nbytes));
				899	*ppos += retval;
				900	out:
				901	free_page((unsigned long)page);
				902	return retval;
				903	}
				904
				905	static ssize_t cpuset_file_read(struct file file, char __user buf, size_t nbytes,
				906	loff_t *ppos)
				907	{
				908	ssize_t retval = 0;
				909	struct cftype *cft = __d_cft(file->f_dentry);
				910	if (!cft)
				911	return -ENODEV;
				912
				913	/* special function ? */
				914	if (cft->read)
				915	retval = cft->read(file, buf, nbytes, ppos);
				916	else
				917	retval = cpuset_common_file_read(file, buf, nbytes, ppos);
				918
				919	return retval;
				920	}
				921
				922	static int cpuset_file_open(struct inode inode, struct file file)
				923	{
				924	int err;
				925	struct cftype *cft;
				926
				927	err = generic_file_open(inode, file);
				928	if (err)
				929	return err;
				930
				931	cft = __d_cft(file->f_dentry);
				932	if (!cft)
				933	return -ENODEV;
				934	if (cft->open)
				935	err = cft->open(inode, file);
				936	else
				937	err = 0;
				938
				939	return err;
				940	}
				941
				942	static int cpuset_file_release(struct inode inode, struct file file)
				943	{
				944	struct cftype *cft = __d_cft(file->f_dentry);
				945	if (cft->release)
				946	return cft->release(inode, file);
				947	return 0;
				948	}
				949
				950	static struct file_operations cpuset_file_operations = {
				951	.read = cpuset_file_read,
				952	.write = cpuset_file_write,
				953	.llseek = generic_file_llseek,
				954	.open = cpuset_file_open,
				955	.release = cpuset_file_release,
				956	};
				957
				958	static struct inode_operations cpuset_dir_inode_operations = {
				959	.lookup = simple_lookup,
				960	.mkdir = cpuset_mkdir,
				961	.rmdir = cpuset_rmdir,
				962	};
				963
				964	static int cpuset_create_file(struct dentry *dentry, int mode)
				965	{
				966	struct inode *inode;
				967
				968	if (!dentry)
				969	return -ENOENT;
				970	if (dentry->d_inode)
				971	return -EEXIST;
				972
				973	inode = cpuset_new_inode(mode);
				974	if (!inode)
				975	return -ENOMEM;
				976
				977	if (S_ISDIR(mode)) {
				978	inode->i_op = &cpuset_dir_inode_operations;
				979	inode->i_fop = &simple_dir_operations;
				980
				981	/* start off with i_nlink == 2 (for "." entry) */
				982	inode->i_nlink++;
				983	} else if (S_ISREG(mode)) {
				984	inode->i_size = 0;
				985	inode->i_fop = &cpuset_file_operations;
				986	}
				987
				988	d_instantiate(dentry, inode);
				989	dget(dentry); /* Extra count - pin the dentry in core */
				990	return 0;
				991	}
				992
				993	/*
				994	* cpuset_create_dir - create a directory for an object.
				995	* cs: the cpuset we create the directory for.
				996	* It must have a valid ->parent field
				997	* And we are going to fill its ->dentry field.
				998	* name: The name to give to the cpuset directory. Will be copied.
				999	* mode: mode to set on new directory.
				1000	*/
				1001
				1002	static int cpuset_create_dir(struct cpuset cs, const char name, int mode)
				1003	{
				1004	struct dentry *dentry = NULL;
				1005	struct dentry *parent;
				1006	int error = 0;
				1007
				1008	parent = cs->parent->dentry;
				1009	dentry = cpuset_get_dentry(parent, name);
				1010	if (IS_ERR(dentry))
				1011	return PTR_ERR(dentry);
				1012	error = cpuset_create_file(dentry, S_IFDIR \| mode);
				1013	if (!error) {
				1014	dentry->d_fsdata = cs;
				1015	parent->d_inode->i_nlink++;
				1016	cs->dentry = dentry;
				1017	}
				1018	dput(dentry);
				1019
				1020	return error;
				1021	}
				1022
				1023	static int cpuset_add_file(struct dentry dir, const struct cftype cft)
				1024	{
				1025	struct dentry *dentry;
				1026	int error;
				1027
				1028	down(&dir->d_inode->i_sem);
				1029	dentry = cpuset_get_dentry(dir, cft->name);
				1030	if (!IS_ERR(dentry)) {
				1031	error = cpuset_create_file(dentry, 0644 \| S_IFREG);
				1032	if (!error)
				1033	dentry->d_fsdata = (void *)cft;
				1034	dput(dentry);
				1035	} else
				1036	error = PTR_ERR(dentry);
				1037	up(&dir->d_inode->i_sem);
				1038	return error;
				1039	}
				1040
				1041	/*
				1042	* Stuff for reading the 'tasks' file.
				1043	*
				1044	* Reading this file can return large amounts of data if a cpuset has
				1045	* lots of attached tasks. So it may need several calls to read(),
				1046	* but we cannot guarantee that the information we produce is correct
				1047	* unless we produce it entirely atomically.
				1048	*
				1049	* Upon tasks file open(), a struct ctr_struct is allocated, that
				1050	* will have a pointer to an array (also allocated here). The struct
				1051	* ctr_struct * is stored in file->private_data. Its resources will
				1052	* be freed by release() when the file is closed. The array is used
				1053	* to sprintf the PIDs and then used by read().
				1054	*/
				1055
				1056	/* cpusets_tasks_read array */
				1057
				1058	struct ctr_struct {
				1059	char *buf;
				1060	int bufsz;
				1061	};
				1062
				1063	/*
				1064	* Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
				1065	* Return actual number of pids loaded.
				1066	*/
				1067	static inline int pid_array_load(pid_t pidarray, int npids, struct cpuset cs)
				1068	{
				1069	int n = 0;
				1070	struct task_struct g, p;
				1071
				1072	read_lock(&tasklist_lock);
				1073
				1074	do_each_thread(g, p) {
				1075	if (p->cpuset == cs) {
				1076	pidarray[n++] = p->pid;
				1077	if (unlikely(n == npids))
				1078	goto array_full;
				1079	}
				1080	} while_each_thread(g, p);
				1081
				1082	array_full:
				1083	read_unlock(&tasklist_lock);
				1084	return n;
				1085	}
				1086
				1087	static int cmppid(const void a, const void b)
				1088	{
				1089	return (pid_t )a - (pid_t )b;
				1090	}
				1091
				1092	/*
				1093	* Convert array 'a' of 'npids' pid_t's to a string of newline separated
				1094	* decimal pids in 'buf'. Don't write more than 'sz' chars, but return
				1095	* count 'cnt' of how many chars would be written if buf were large enough.
				1096	*/
				1097	static int pid_array_to_buf(char buf, int sz, pid_t a, int npids)
				1098	{
				1099	int cnt = 0;
				1100	int i;
				1101
				1102	for (i = 0; i < npids; i++)
				1103	cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
				1104	return cnt;
				1105	}
				1106
				1107	static int cpuset_tasks_open(struct inode unused, struct file file)
				1108	{
				1109	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
				1110	struct ctr_struct *ctr;
				1111	pid_t *pidarray;
				1112	int npids;
				1113	char c;
				1114
				1115	if (!(file->f_mode & FMODE_READ))
				1116	return 0;
				1117
				1118	ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
				1119	if (!ctr)
				1120	goto err0;
				1121
				1122	/*
				1123	* If cpuset gets more users after we read count, we won't have
				1124	* enough space - tough. This race is indistinguishable to the
				1125	* caller from the case that the additional cpuset users didn't
				1126	* show up until sometime later on.
				1127	*/
				1128	npids = atomic_read(&cs->count);
				1129	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
				1130	if (!pidarray)
				1131	goto err1;
				1132
				1133	npids = pid_array_load(pidarray, npids, cs);
				1134	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
				1135
				1136	/* Call pid_array_to_buf() twice, first just to get bufsz */
				1137	ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
				1138	ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
				1139	if (!ctr->buf)
				1140	goto err2;
				1141	ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
				1142
				1143	kfree(pidarray);
				1144	file->private_data = ctr;
				1145	return 0;
				1146
				1147	err2:
				1148	kfree(pidarray);
				1149	err1:
				1150	kfree(ctr);
				1151	err0:
				1152	return -ENOMEM;
				1153	}
				1154
				1155	static ssize_t cpuset_tasks_read(struct file file, char __user buf,
				1156	size_t nbytes, loff_t *ppos)
				1157	{
				1158	struct ctr_struct *ctr = file->private_data;
				1159
				1160	if (*ppos + nbytes > ctr->bufsz)
				1161	nbytes = ctr->bufsz - *ppos;
				1162	if (copy_to_user(buf, ctr->buf + *ppos, nbytes))
				1163	return -EFAULT;
				1164	*ppos += nbytes;
				1165	return nbytes;
				1166	}
				1167
				1168	static int cpuset_tasks_release(struct inode unused_inode, struct file file)
				1169	{
				1170	struct ctr_struct *ctr;
				1171
				1172	if (file->f_mode & FMODE_READ) {
				1173	ctr = file->private_data;
				1174	kfree(ctr->buf);
				1175	kfree(ctr);
				1176	}
				1177	return 0;
				1178	}
				1179
				1180	/*
				1181	* for the common functions, 'private' gives the type of file
				1182	*/
				1183
				1184	static struct cftype cft_tasks = {
				1185	.name = "tasks",
				1186	.open = cpuset_tasks_open,
				1187	.read = cpuset_tasks_read,
				1188	.release = cpuset_tasks_release,
				1189	.private = FILE_TASKLIST,
				1190	};
				1191
				1192	static struct cftype cft_cpus = {
				1193	.name = "cpus",
				1194	.private = FILE_CPULIST,
				1195	};
				1196
				1197	static struct cftype cft_mems = {
				1198	.name = "mems",
				1199	.private = FILE_MEMLIST,
				1200	};
				1201
				1202	static struct cftype cft_cpu_exclusive = {
				1203	.name = "cpu_exclusive",
				1204	.private = FILE_CPU_EXCLUSIVE,
				1205	};
				1206
				1207	static struct cftype cft_mem_exclusive = {
				1208	.name = "mem_exclusive",
				1209	.private = FILE_MEM_EXCLUSIVE,
				1210	};
				1211
				1212	static struct cftype cft_notify_on_release = {
				1213	.name = "notify_on_release",
				1214	.private = FILE_NOTIFY_ON_RELEASE,
				1215	};
				1216
				1217	static int cpuset_populate_dir(struct dentry *cs_dentry)
				1218	{
				1219	int err;
				1220
				1221	if ((err = cpuset_add_file(cs_dentry, &cft_cpus)) < 0)
				1222	return err;
				1223	if ((err = cpuset_add_file(cs_dentry, &cft_mems)) < 0)
				1224	return err;
				1225	if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0)
				1226	return err;
				1227	if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0)
				1228	return err;
				1229	if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
				1230	return err;
				1231	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
				1232	return err;
				1233	return 0;
				1234	}
				1235
				1236	/*
				1237	* cpuset_create - create a cpuset
				1238	* parent: cpuset that will be parent of the new cpuset.
				1239	* name: name of the new cpuset. Will be strcpy'ed.
				1240	* mode: mode to set on new inode
				1241	*
				1242	* Must be called with the semaphore on the parent inode held
				1243	*/
				1244
				1245	static long cpuset_create(struct cpuset parent, const char name, int mode)
				1246	{
				1247	struct cpuset *cs;
				1248	int err;
				1249
				1250	cs = kmalloc(sizeof(*cs), GFP_KERNEL);
				1251	if (!cs)
				1252	return -ENOMEM;
				1253
				1254	down(&cpuset_sem);
				1255	refresh_mems();
				1256	cs->flags = 0;
				1257	if (notify_on_release(parent))
				1258	set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
				1259	cs->cpus_allowed = CPU_MASK_NONE;
				1260	cs->mems_allowed = NODE_MASK_NONE;
				1261	atomic_set(&cs->count, 0);
				1262	INIT_LIST_HEAD(&cs->sibling);
				1263	INIT_LIST_HEAD(&cs->children);
				1264	atomic_inc(&cpuset_mems_generation);
				1265	cs->mems_generation = atomic_read(&cpuset_mems_generation);
				1266
				1267	cs->parent = parent;
				1268
				1269	list_add(&cs->sibling, &cs->parent->children);
				1270
				1271	err = cpuset_create_dir(cs, name, mode);
				1272	if (err < 0)
				1273	goto err;
				1274
				1275	/*
				1276	* Release cpuset_sem before cpuset_populate_dir() because it
				1277	* will down() this new directory's i_sem and if we race with
				1278	* another mkdir, we might deadlock.
				1279	*/
				1280	up(&cpuset_sem);
				1281
				1282	err = cpuset_populate_dir(cs->dentry);
				1283	/* If err < 0, we have a half-filled directory - oh well ;) */
				1284	return 0;
				1285	err:
				1286	list_del(&cs->sibling);
				1287	up(&cpuset_sem);
				1288	kfree(cs);
				1289	return err;
				1290	}
				1291
				1292	static int cpuset_mkdir(struct inode dir, struct dentry dentry, int mode)
				1293	{
				1294	struct cpuset *c_parent = dentry->d_parent->d_fsdata;
				1295
				1296	/* the vfs holds inode->i_sem already */
				1297	return cpuset_create(c_parent, dentry->d_name.name, mode \| S_IFDIR);
				1298	}
				1299
				1300	static int cpuset_rmdir(struct inode unused_dir, struct dentry dentry)
				1301	{
				1302	struct cpuset *cs = dentry->d_fsdata;
				1303	struct dentry *d;
				1304	struct cpuset *parent;
				1305
				1306	/* the vfs holds both inode->i_sem already */
				1307
				1308	down(&cpuset_sem);
				1309	refresh_mems();
				1310	if (atomic_read(&cs->count) > 0) {
				1311	up(&cpuset_sem);
				1312	return -EBUSY;
				1313	}
				1314	if (!list_empty(&cs->children)) {
				1315	up(&cpuset_sem);
				1316	return -EBUSY;
				1317	}
				1318	spin_lock(&cs->dentry->d_lock);
				1319	parent = cs->parent;
				1320	set_bit(CS_REMOVED, &cs->flags);
				1321	list_del(&cs->sibling); /* delete my sibling from parent->children */
				1322	if (list_empty(&parent->children))
				1323	check_for_release(parent);
				1324	d = dget(cs->dentry);
				1325	cs->dentry = NULL;
				1326	spin_unlock(&d->d_lock);
				1327	cpuset_d_remove_dir(d);
				1328	dput(d);
				1329	up(&cpuset_sem);
				1330	return 0;
				1331	}
				1332
				1333	/**
				1334	* cpuset_init - initialize cpusets at system boot
				1335	*
				1336	* Description: Initialize top_cpuset and the cpuset internal file system,
				1337	**/
				1338
				1339	int __init cpuset_init(void)
				1340	{
				1341	struct dentry *root;
				1342	int err;
				1343
				1344	top_cpuset.cpus_allowed = CPU_MASK_ALL;
				1345	top_cpuset.mems_allowed = NODE_MASK_ALL;
				1346
				1347	atomic_inc(&cpuset_mems_generation);
				1348	top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
				1349
				1350	init_task.cpuset = &top_cpuset;
				1351
				1352	err = register_filesystem(&cpuset_fs_type);
				1353	if (err < 0)
				1354	goto out;
				1355	cpuset_mount = kern_mount(&cpuset_fs_type);
				1356	if (IS_ERR(cpuset_mount)) {
				1357	printk(KERN_ERR "cpuset: could not mount!\n");
				1358	err = PTR_ERR(cpuset_mount);
				1359	cpuset_mount = NULL;
				1360	goto out;
				1361	}
				1362	root = cpuset_mount->mnt_sb->s_root;
				1363	root->d_fsdata = &top_cpuset;
				1364	root->d_inode->i_nlink++;
				1365	top_cpuset.dentry = root;
				1366	root->d_inode->i_op = &cpuset_dir_inode_operations;
				1367	err = cpuset_populate_dir(root);
				1368	out:
				1369	return err;
				1370	}
				1371
				1372	/**
				1373	* cpuset_init_smp - initialize cpus_allowed
				1374	*
				1375	* Description: Finish top cpuset after cpu, node maps are initialized
				1376	**/
				1377
				1378	void __init cpuset_init_smp(void)
				1379	{
				1380	top_cpuset.cpus_allowed = cpu_online_map;
				1381	top_cpuset.mems_allowed = node_online_map;
				1382	}
				1383
				1384	/**
				1385	* cpuset_fork - attach newly forked task to its parents cpuset.
				1386	* @p: pointer to task_struct of forking parent process.
				1387	*
				1388	* Description: By default, on fork, a task inherits its
				1389	* parents cpuset. The pointer to the shared cpuset is
				1390	* automatically copied in fork.c by dup_task_struct().
				1391	* This cpuset_fork() routine need only increment the usage
				1392	* counter in that cpuset.
				1393	**/
				1394
				1395	void cpuset_fork(struct task_struct *tsk)
				1396	{
				1397	atomic_inc(&tsk->cpuset->count);
				1398	}
				1399
				1400	/**
				1401	* cpuset_exit - detach cpuset from exiting task
				1402	* @tsk: pointer to task_struct of exiting process
				1403	*
				1404	* Description: Detach cpuset from @tsk and release it.
				1405	*
Paul Jackson	2efe86b	2005-05-27 02:02:43 -0700	[diff] [blame^]	1406	* Note that cpusets marked notify_on_release force every task
				1407	* in them to take the global cpuset_sem semaphore when exiting.
				1408	* This could impact scaling on very large systems. Be reluctant
				1409	* to use notify_on_release cpusets where very high task exit
				1410	* scaling is required on large systems.
				1411	*
				1412	* Don't even think about derefencing 'cs' after the cpuset use
				1413	* count goes to zero, except inside a critical section guarded
				1414	* by the cpuset_sem semaphore. If you don't hold cpuset_sem,
				1415	* then a zero cpuset use count is a license to any other task to
				1416	* nuke the cpuset immediately.
				1417	*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1418	**/
				1419
				1420	void cpuset_exit(struct task_struct *tsk)
				1421	{
				1422	struct cpuset *cs;
				1423
				1424	task_lock(tsk);
				1425	cs = tsk->cpuset;
				1426	tsk->cpuset = NULL;
				1427	task_unlock(tsk);
				1428
Paul Jackson	2efe86b	2005-05-27 02:02:43 -0700	[diff] [blame^]	1429	if (notify_on_release(cs)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1430	down(&cpuset_sem);
Paul Jackson	2efe86b	2005-05-27 02:02:43 -0700	[diff] [blame^]	1431	if (atomic_dec_and_test(&cs->count))
				1432	check_for_release(cs);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1433	up(&cpuset_sem);
Paul Jackson	2efe86b	2005-05-27 02:02:43 -0700	[diff] [blame^]	1434	} else {
				1435	atomic_dec(&cs->count);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1436	}
				1437	}
				1438
				1439	/**
				1440	* cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
				1441	* @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
				1442	*
				1443	* Description: Returns the cpumask_t cpus_allowed of the cpuset
				1444	* attached to the specified @tsk. Guaranteed to return some non-empty
				1445	* subset of cpu_online_map, even if this means going outside the
				1446	* tasks cpuset.
				1447	**/
				1448
Benoit Boissinot	9a84889	2005-04-16 15:25:59 -0700	[diff] [blame]	1449	cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1450	{
				1451	cpumask_t mask;
				1452
				1453	down(&cpuset_sem);
				1454	task_lock((struct task_struct *)tsk);
				1455	guarantee_online_cpus(tsk->cpuset, &mask);
				1456	task_unlock((struct task_struct *)tsk);
				1457	up(&cpuset_sem);
				1458
				1459	return mask;
				1460	}
				1461
				1462	void cpuset_init_current_mems_allowed(void)
				1463	{
				1464	current->mems_allowed = NODE_MASK_ALL;
				1465	}
				1466
				1467	/*
				1468	* If the current tasks cpusets mems_allowed changed behind our backs,
				1469	* update current->mems_allowed and mems_generation to the new value.
				1470	* Do not call this routine if in_interrupt().
				1471	*/
				1472
				1473	void cpuset_update_current_mems_allowed(void)
				1474	{
				1475	struct cpuset *cs = current->cpuset;
				1476
				1477	if (!cs)
				1478	return; /* task is exiting */
				1479	if (current->cpuset_mems_generation != cs->mems_generation) {
				1480	down(&cpuset_sem);
				1481	refresh_mems();
				1482	up(&cpuset_sem);
				1483	}
				1484	}
				1485
				1486	void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
				1487	{
				1488	bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
				1489	MAX_NUMNODES);
				1490	}
				1491
				1492	/*
				1493	* Are any of the nodes on zonelist zl allowed in current->mems_allowed?
				1494	*/
				1495	int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
				1496	{
				1497	int i;
				1498
				1499	for (i = 0; zl->zones[i]; i++) {
				1500	int nid = zl->zones[i]->zone_pgdat->node_id;
				1501
				1502	if (node_isset(nid, current->mems_allowed))
				1503	return 1;
				1504	}
				1505	return 0;
				1506	}
				1507
				1508	/*
				1509	* Is 'current' valid, and is zone z allowed in current->mems_allowed?
				1510	*/
				1511	int cpuset_zone_allowed(struct zone *z)
				1512	{
				1513	return in_interrupt() \|\|
				1514	node_isset(z->zone_pgdat->node_id, current->mems_allowed);
				1515	}
				1516
				1517	/*
				1518	* proc_cpuset_show()
				1519	* - Print tasks cpuset path into seq_file.
				1520	* - Used for /proc/<pid>/cpuset.
				1521	*/
				1522
				1523	static int proc_cpuset_show(struct seq_file m, void v)
				1524	{
				1525	struct cpuset *cs;
				1526	struct task_struct *tsk;
				1527	char *buf;
				1528	int retval = 0;
				1529
				1530	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
				1531	if (!buf)
				1532	return -ENOMEM;
				1533
				1534	tsk = m->private;
				1535	down(&cpuset_sem);
				1536	task_lock(tsk);
				1537	cs = tsk->cpuset;
				1538	task_unlock(tsk);
				1539	if (!cs) {
				1540	retval = -EINVAL;
				1541	goto out;
				1542	}
				1543
				1544	retval = cpuset_path(cs, buf, PAGE_SIZE);
				1545	if (retval < 0)
				1546	goto out;
				1547	seq_puts(m, buf);
				1548	seq_putc(m, '\n');
				1549	out:
				1550	up(&cpuset_sem);
				1551	kfree(buf);
				1552	return retval;
				1553	}
				1554
				1555	static int cpuset_open(struct inode inode, struct file file)
				1556	{
				1557	struct task_struct *tsk = PROC_I(inode)->task;
				1558	return single_open(file, proc_cpuset_show, tsk);
				1559	}
				1560
				1561	struct file_operations proc_cpuset_operations = {
				1562	.open = cpuset_open,
				1563	.read = seq_read,
				1564	.llseek = seq_lseek,
				1565	.release = single_release,
				1566	};
				1567
				1568	/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
				1569	char cpuset_task_status_allowed(struct task_struct task, char *buffer)
				1570	{
				1571	buffer += sprintf(buffer, "Cpus_allowed:\t");
				1572	buffer += cpumask_scnprintf(buffer, PAGE_SIZE, task->cpus_allowed);
				1573	buffer += sprintf(buffer, "\n");
				1574	buffer += sprintf(buffer, "Mems_allowed:\t");
				1575	buffer += nodemask_scnprintf(buffer, PAGE_SIZE, task->mems_allowed);
				1576	buffer += sprintf(buffer, "\n");
				1577	return buffer;
				1578	}