[PATCH] cpuset: memory migration interaction fix Fix memory migration so that it works regardless of what cpuset the invoking task is in. If a task invoked a memory migration, by doing one of: 1) writing a different nodemask to a cpuset 'mems' file, or 2) writing a task's pid to a different cpuset's 'tasks' file, where the cpuset had its 'memory_migrate' option turned on, then the allocation of the new pages for the migrated task(s) was constrained by the invoking task's cpuset. If this task wasn't in a cpuset that allowed the requested memory nodes, the memory migration would happen to some other nodes that were in that invoking task's cpuset. This was usually surprising and puzzling behaviour: Why didn't the pages move? Why did the pages move -there-? To fix this, temporarily change the invoking task's 'mems_allowed' task_struct field to the nodes the migrating task is moving to, so that new pages can be allocated there. Signed-off-by: Paul Jackson <pj@sgi.com> Acked-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>

commit: e4e364e865b382f9d99c7fc230ec2ce7df21257a [log] [tgz]
author: Paul Jackson <pj@sgi.com> Fri Mar 31 02:30:52 2006 -0800
committer: Linus Torvalds <torvalds@g5.osdl.org> Fri Mar 31 12:18:55 2006 -0800
tree: 9ff5ab54a0e40d7ad2b55d3ec48c6e175ebf50c7
parent: 2741a559a01e1ba9bf87285569dc1a104d134ecf [diff] [blame]
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index bf42381..72248d1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c

@@ -834,6 +834,55 @@
 }
 
 /*
+ * cpuset_migrate_mm
+ *
+ *    Migrate memory region from one set of nodes to another.
+ *
+ *    Temporarily set tasks mems_allowed to target nodes of migration,
+ *    so that the migration code can allocate pages on these nodes.
+ *
+ *    Call holding manage_mutex, so our current->cpuset won't change
+ *    during this call, as manage_mutex holds off any attach_task()
+ *    calls.  Therefore we don't need to take task_lock around the
+ *    call to guarantee_online_mems(), as we know no one is changing
+ *    our tasks cpuset.
+ *
+ *    Hold callback_mutex around the two modifications of our tasks
+ *    mems_allowed to synchronize with cpuset_mems_allowed().
+ *
+ *    While the mm_struct we are migrating is typically from some
+ *    other task, the task_struct mems_allowed that we are hacking
+ *    is for our current task, which must allocate new pages for that
+ *    migrating memory region.
+ *
+ *    We call cpuset_update_task_memory_state() before hacking
+ *    our tasks mems_allowed, so that we are assured of being in
+ *    sync with our tasks cpuset, and in particular, callbacks to
+ *    cpuset_update_task_memory_state() from nested page allocations
+ *    won't see any mismatch of our cpuset and task mems_generation
+ *    values, so won't overwrite our hacked tasks mems_allowed
+ *    nodemask.
+ */
+
+static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
+							const nodemask_t *to)
+{
+	struct task_struct *tsk = current;	/* the invoking task; its mems_allowed is overridden below */
+
+	cpuset_update_task_memory_state();	/* get in sync with our cpuset before overriding mems_allowed */
+
+	mutex_lock(&callback_mutex);
+	tsk->mems_allowed = *to;	/* first modification: allow allocations on the migration target nodes */
+	mutex_unlock(&callback_mutex);
+
+	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);	/* 'mm' is typically another task's; we allocate its new pages */
+
+	mutex_lock(&callback_mutex);
+	guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);	/* second modification: presumably restores mems_allowed from our own cpuset - confirm against guarantee_online_mems() */
+	mutex_unlock(&callback_mutex);
+}
+
+/*
  * Handle user request to change the 'mems' memory placement
  * of a cpuset.  Needs to validate the request, update the
  * cpusets mems_allowed and mems_generation, and for each
@@ -945,10 +994,8 @@
 		struct mm_struct *mm = mmarray[i];
 
 		mpol_rebind_mm(mm, &cs->mems_allowed);
-		if (migrate) {
-			do_migrate_pages(mm, &oldmem, &cs->mems_allowed,
-							MPOL_MF_MOVE_ALL);
-		}
+		if (migrate)
+			cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
 		mmput(mm);
 	}
 
@@ -1184,7 +1231,7 @@
 	if (mm) {
 		mpol_rebind_mm(mm, &to);
 		if (is_memory_migrate(cs))
-			do_migrate_pages(mm, &from, &to, MPOL_MF_MOVE_ALL);
+			cpuset_migrate_mm(mm, &from, &to);
 		mmput(mm);
 	}
commit	e4e364e865b382f9d99c7fc230ec2ce7df21257a	[log] [tgz]
author	Paul Jackson <pj@sgi.com>	Fri Mar 31 02:30:52 2006 -0800
committer	Linus Torvalds <torvalds@g5.osdl.org>	Fri Mar 31 12:18:55 2006 -0800
tree	9ff5ab54a0e40d7ad2b55d3ec48c6e175ebf50c7
parent	2741a559a01e1ba9bf87285569dc1a104d134ecf [diff] [blame]